Commit ba74c1448f127649046615ec017bded7b2a76f29

Authored by Thomas Gleixner
Committed by Ingo Molnar
1 parent bd2f55361f

sched/rt: Document scheduler related skip-resched-check sites

Create a distinction between scheduler-related preempt_enable_no_resched()
calls and the nearly one hundred other places in the kernel that do not
want to reschedule, for one reason or another.

This distinction matters for -rt, where the scheduler and the non-scheduler
preempt models (and checks) are different. For upstream it is purely a
documentation change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/n/tip-gs88fvx2mdv5psnzxnv575ke@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Showing 5 changed files with 11 additions and 8 deletions
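
The renamed call sites below all follow the same idle-loop shape. As rough
orientation, here is a minimal sketch of that pattern, modelled on the powerpc
and sparc64 hunks in this commit; it is not code from the commit itself, and
do_low_power_wait() is a hypothetical stand-in for the arch-specific
power-save hook. The point is that the preempt count is dropped via the
scheduler-aware variant because the very next step is either cpu_die() or
schedule_preempt_disabled(), so skipping the resched check there is
deliberate.

#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/preempt.h>

static void example_cpu_idle(void)
{
	while (1) {
		/* Poll until work arrives or the CPU is being unplugged. */
		while (!need_resched() && !cpu_is_offline(smp_processor_id()))
			do_low_power_wait();	/* hypothetical arch hook */

		if (cpu_is_offline(smp_processor_id())) {
			/*
			 * Scheduler-related no-resched site: the CPU is about
			 * to die, so re-enable preemption without checking
			 * for a reschedule.
			 */
			sched_preempt_enable_no_resched();
			cpu_die();
		}
		schedule_preempt_disabled();
	}
}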

arch/powerpc/kernel/idle.c
1 /* 1 /*
2 * Idle daemon for PowerPC. Idle daemon will handle any action 2 * Idle daemon for PowerPC. Idle daemon will handle any action
3 * that needs to be taken when the system becomes idle. 3 * that needs to be taken when the system becomes idle.
4 * 4 *
5 * Originally written by Cort Dougan (cort@cs.nmt.edu). 5 * Originally written by Cort Dougan (cort@cs.nmt.edu).
6 * Subsequent 32-bit hacking by Tom Rini, Armin Kuster, 6 * Subsequent 32-bit hacking by Tom Rini, Armin Kuster,
7 * Paul Mackerras and others. 7 * Paul Mackerras and others.
8 * 8 *
9 * iSeries supported added by Mike Corrigan <mikejc@us.ibm.com> 9 * iSeries supported added by Mike Corrigan <mikejc@us.ibm.com>
10 * 10 *
11 * Additional shared processor, SMT, and firmware support 11 * Additional shared processor, SMT, and firmware support
12 * Copyright (c) 2003 Dave Engebretsen <engebret@us.ibm.com> 12 * Copyright (c) 2003 Dave Engebretsen <engebret@us.ibm.com>
13 * 13 *
14 * 32-bit and 64-bit versions merged by Paul Mackerras <paulus@samba.org> 14 * 32-bit and 64-bit versions merged by Paul Mackerras <paulus@samba.org>
15 * 15 *
16 * This program is free software; you can redistribute it and/or 16 * This program is free software; you can redistribute it and/or
17 * modify it under the terms of the GNU General Public License 17 * modify it under the terms of the GNU General Public License
18 * as published by the Free Software Foundation; either version 18 * as published by the Free Software Foundation; either version
19 * 2 of the License, or (at your option) any later version. 19 * 2 of the License, or (at your option) any later version.
20 */ 20 */
21 21
22 #include <linux/sched.h> 22 #include <linux/sched.h>
23 #include <linux/kernel.h> 23 #include <linux/kernel.h>
24 #include <linux/smp.h> 24 #include <linux/smp.h>
25 #include <linux/cpu.h> 25 #include <linux/cpu.h>
26 #include <linux/sysctl.h> 26 #include <linux/sysctl.h>
27 #include <linux/tick.h> 27 #include <linux/tick.h>
28 28
29 #include <asm/system.h> 29 #include <asm/system.h>
30 #include <asm/processor.h> 30 #include <asm/processor.h>
31 #include <asm/cputable.h> 31 #include <asm/cputable.h>
32 #include <asm/time.h> 32 #include <asm/time.h>
33 #include <asm/machdep.h> 33 #include <asm/machdep.h>
34 #include <asm/smp.h> 34 #include <asm/smp.h>
35 35
36 #ifdef CONFIG_HOTPLUG_CPU 36 #ifdef CONFIG_HOTPLUG_CPU
37 #define cpu_should_die() cpu_is_offline(smp_processor_id()) 37 #define cpu_should_die() cpu_is_offline(smp_processor_id())
38 #else 38 #else
39 #define cpu_should_die() 0 39 #define cpu_should_die() 0
40 #endif 40 #endif
41 41
42 unsigned long cpuidle_disable = IDLE_NO_OVERRIDE; 42 unsigned long cpuidle_disable = IDLE_NO_OVERRIDE;
43 EXPORT_SYMBOL(cpuidle_disable); 43 EXPORT_SYMBOL(cpuidle_disable);
44 44
45 static int __init powersave_off(char *arg) 45 static int __init powersave_off(char *arg)
46 { 46 {
47 ppc_md.power_save = NULL; 47 ppc_md.power_save = NULL;
48 cpuidle_disable = IDLE_POWERSAVE_OFF; 48 cpuidle_disable = IDLE_POWERSAVE_OFF;
49 return 0; 49 return 0;
50 } 50 }
51 __setup("powersave=off", powersave_off); 51 __setup("powersave=off", powersave_off);
52 52
53 /* 53 /*
54 * The body of the idle task. 54 * The body of the idle task.
55 */ 55 */
56 void cpu_idle(void) 56 void cpu_idle(void)
57 { 57 {
58 if (ppc_md.idle_loop) 58 if (ppc_md.idle_loop)
59 ppc_md.idle_loop(); /* doesn't return */ 59 ppc_md.idle_loop(); /* doesn't return */
60 60
61 set_thread_flag(TIF_POLLING_NRFLAG); 61 set_thread_flag(TIF_POLLING_NRFLAG);
62 while (1) { 62 while (1) {
63 tick_nohz_idle_enter(); 63 tick_nohz_idle_enter();
64 rcu_idle_enter(); 64 rcu_idle_enter();
65 65
66 while (!need_resched() && !cpu_should_die()) { 66 while (!need_resched() && !cpu_should_die()) {
67 ppc64_runlatch_off(); 67 ppc64_runlatch_off();
68 68
69 if (ppc_md.power_save) { 69 if (ppc_md.power_save) {
70 clear_thread_flag(TIF_POLLING_NRFLAG); 70 clear_thread_flag(TIF_POLLING_NRFLAG);
71 /* 71 /*
72 * smp_mb is so clearing of TIF_POLLING_NRFLAG 72 * smp_mb is so clearing of TIF_POLLING_NRFLAG
73 * is ordered w.r.t. need_resched() test. 73 * is ordered w.r.t. need_resched() test.
74 */ 74 */
75 smp_mb(); 75 smp_mb();
76 local_irq_disable(); 76 local_irq_disable();
77 77
78 /* Don't trace irqs off for idle */ 78 /* Don't trace irqs off for idle */
79 stop_critical_timings(); 79 stop_critical_timings();
80 80
81 /* check again after disabling irqs */ 81 /* check again after disabling irqs */
82 if (!need_resched() && !cpu_should_die()) 82 if (!need_resched() && !cpu_should_die())
83 ppc_md.power_save(); 83 ppc_md.power_save();
84 84
85 start_critical_timings(); 85 start_critical_timings();
86 86
87 local_irq_enable(); 87 local_irq_enable();
88 set_thread_flag(TIF_POLLING_NRFLAG); 88 set_thread_flag(TIF_POLLING_NRFLAG);
89 89
90 } else { 90 } else {
91 /* 91 /*
92 * Go into low thread priority and possibly 92 * Go into low thread priority and possibly
93 * low power mode. 93 * low power mode.
94 */ 94 */
95 HMT_low(); 95 HMT_low();
96 HMT_very_low(); 96 HMT_very_low();
97 } 97 }
98 } 98 }
99 99
100 HMT_medium(); 100 HMT_medium();
101 ppc64_runlatch_on(); 101 ppc64_runlatch_on();
102 rcu_idle_exit(); 102 rcu_idle_exit();
103 tick_nohz_idle_exit(); 103 tick_nohz_idle_exit();
104 if (cpu_should_die()) { 104 if (cpu_should_die()) {
105 preempt_enable_no_resched(); 105 sched_preempt_enable_no_resched();
106 cpu_die(); 106 cpu_die();
107 } 107 }
108 schedule_preempt_disabled(); 108 schedule_preempt_disabled();
109 } 109 }
110 } 110 }
111 111
112 112
113 /* 113 /*
114 * cpu_idle_wait - Used to ensure that all the CPUs come out of the old 114 * cpu_idle_wait - Used to ensure that all the CPUs come out of the old
115 * idle loop and start using the new idle loop. 115 * idle loop and start using the new idle loop.
116 * Required while changing idle handler on SMP systems. 116 * Required while changing idle handler on SMP systems.
117 * Caller must have changed idle handler to the new value before the call. 117 * Caller must have changed idle handler to the new value before the call.
118 * This window may be larger on shared systems. 118 * This window may be larger on shared systems.
119 */ 119 */
120 void cpu_idle_wait(void) 120 void cpu_idle_wait(void)
121 { 121 {
122 int cpu; 122 int cpu;
123 smp_mb(); 123 smp_mb();
124 124
125 /* kick all the CPUs so that they exit out of old idle routine */ 125 /* kick all the CPUs so that they exit out of old idle routine */
126 get_online_cpus(); 126 get_online_cpus();
127 for_each_online_cpu(cpu) { 127 for_each_online_cpu(cpu) {
128 if (cpu != smp_processor_id()) 128 if (cpu != smp_processor_id())
129 smp_send_reschedule(cpu); 129 smp_send_reschedule(cpu);
130 } 130 }
131 put_online_cpus(); 131 put_online_cpus();
132 } 132 }
133 EXPORT_SYMBOL_GPL(cpu_idle_wait); 133 EXPORT_SYMBOL_GPL(cpu_idle_wait);
134 134
135 int powersave_nap; 135 int powersave_nap;
136 136
137 #ifdef CONFIG_SYSCTL 137 #ifdef CONFIG_SYSCTL
138 /* 138 /*
139 * Register the sysctl to set/clear powersave_nap. 139 * Register the sysctl to set/clear powersave_nap.
140 */ 140 */
141 static ctl_table powersave_nap_ctl_table[]={ 141 static ctl_table powersave_nap_ctl_table[]={
142 { 142 {
143 .procname = "powersave-nap", 143 .procname = "powersave-nap",
144 .data = &powersave_nap, 144 .data = &powersave_nap,
145 .maxlen = sizeof(int), 145 .maxlen = sizeof(int),
146 .mode = 0644, 146 .mode = 0644,
147 .proc_handler = proc_dointvec, 147 .proc_handler = proc_dointvec,
148 }, 148 },
149 {} 149 {}
150 }; 150 };
151 static ctl_table powersave_nap_sysctl_root[] = { 151 static ctl_table powersave_nap_sysctl_root[] = {
152 { 152 {
153 .procname = "kernel", 153 .procname = "kernel",
154 .mode = 0555, 154 .mode = 0555,
155 .child = powersave_nap_ctl_table, 155 .child = powersave_nap_ctl_table,
156 }, 156 },
157 {} 157 {}
158 }; 158 };
159 159
160 static int __init 160 static int __init
161 register_powersave_nap_sysctl(void) 161 register_powersave_nap_sysctl(void)
162 { 162 {
163 register_sysctl_table(powersave_nap_sysctl_root); 163 register_sysctl_table(powersave_nap_sysctl_root);
164 164
165 return 0; 165 return 0;
166 } 166 }
167 __initcall(register_powersave_nap_sysctl); 167 __initcall(register_powersave_nap_sysctl);
168 #endif 168 #endif
169 169
arch/sparc/kernel/process_64.c
1 /* arch/sparc64/kernel/process.c 1 /* arch/sparc64/kernel/process.c
2 * 2 *
3 * Copyright (C) 1995, 1996, 2008 David S. Miller (davem@davemloft.net) 3 * Copyright (C) 1995, 1996, 2008 David S. Miller (davem@davemloft.net)
4 * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be) 4 * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be)
5 * Copyright (C) 1997, 1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) 5 * Copyright (C) 1997, 1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz)
6 */ 6 */
7 7
8 /* 8 /*
9 * This file handles the architecture-dependent parts of process handling.. 9 * This file handles the architecture-dependent parts of process handling..
10 */ 10 */
11 11
12 #include <stdarg.h> 12 #include <stdarg.h>
13 13
14 #include <linux/errno.h> 14 #include <linux/errno.h>
15 #include <linux/export.h> 15 #include <linux/export.h>
16 #include <linux/sched.h> 16 #include <linux/sched.h>
17 #include <linux/kernel.h> 17 #include <linux/kernel.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/fs.h> 19 #include <linux/fs.h>
20 #include <linux/smp.h> 20 #include <linux/smp.h>
21 #include <linux/stddef.h> 21 #include <linux/stddef.h>
22 #include <linux/ptrace.h> 22 #include <linux/ptrace.h>
23 #include <linux/slab.h> 23 #include <linux/slab.h>
24 #include <linux/user.h> 24 #include <linux/user.h>
25 #include <linux/delay.h> 25 #include <linux/delay.h>
26 #include <linux/compat.h> 26 #include <linux/compat.h>
27 #include <linux/tick.h> 27 #include <linux/tick.h>
28 #include <linux/init.h> 28 #include <linux/init.h>
29 #include <linux/cpu.h> 29 #include <linux/cpu.h>
30 #include <linux/elfcore.h> 30 #include <linux/elfcore.h>
31 #include <linux/sysrq.h> 31 #include <linux/sysrq.h>
32 #include <linux/nmi.h> 32 #include <linux/nmi.h>
33 33
34 #include <asm/uaccess.h> 34 #include <asm/uaccess.h>
35 #include <asm/system.h> 35 #include <asm/system.h>
36 #include <asm/page.h> 36 #include <asm/page.h>
37 #include <asm/pgalloc.h> 37 #include <asm/pgalloc.h>
38 #include <asm/pgtable.h> 38 #include <asm/pgtable.h>
39 #include <asm/processor.h> 39 #include <asm/processor.h>
40 #include <asm/pstate.h> 40 #include <asm/pstate.h>
41 #include <asm/elf.h> 41 #include <asm/elf.h>
42 #include <asm/fpumacro.h> 42 #include <asm/fpumacro.h>
43 #include <asm/head.h> 43 #include <asm/head.h>
44 #include <asm/cpudata.h> 44 #include <asm/cpudata.h>
45 #include <asm/mmu_context.h> 45 #include <asm/mmu_context.h>
46 #include <asm/unistd.h> 46 #include <asm/unistd.h>
47 #include <asm/hypervisor.h> 47 #include <asm/hypervisor.h>
48 #include <asm/syscalls.h> 48 #include <asm/syscalls.h>
49 #include <asm/irq_regs.h> 49 #include <asm/irq_regs.h>
50 #include <asm/smp.h> 50 #include <asm/smp.h>
51 51
52 #include "kstack.h" 52 #include "kstack.h"
53 53
54 static void sparc64_yield(int cpu) 54 static void sparc64_yield(int cpu)
55 { 55 {
56 if (tlb_type != hypervisor) { 56 if (tlb_type != hypervisor) {
57 touch_nmi_watchdog(); 57 touch_nmi_watchdog();
58 return; 58 return;
59 } 59 }
60 60
61 clear_thread_flag(TIF_POLLING_NRFLAG); 61 clear_thread_flag(TIF_POLLING_NRFLAG);
62 smp_mb__after_clear_bit(); 62 smp_mb__after_clear_bit();
63 63
64 while (!need_resched() && !cpu_is_offline(cpu)) { 64 while (!need_resched() && !cpu_is_offline(cpu)) {
65 unsigned long pstate; 65 unsigned long pstate;
66 66
67 /* Disable interrupts. */ 67 /* Disable interrupts. */
68 __asm__ __volatile__( 68 __asm__ __volatile__(
69 "rdpr %%pstate, %0\n\t" 69 "rdpr %%pstate, %0\n\t"
70 "andn %0, %1, %0\n\t" 70 "andn %0, %1, %0\n\t"
71 "wrpr %0, %%g0, %%pstate" 71 "wrpr %0, %%g0, %%pstate"
72 : "=&r" (pstate) 72 : "=&r" (pstate)
73 : "i" (PSTATE_IE)); 73 : "i" (PSTATE_IE));
74 74
75 if (!need_resched() && !cpu_is_offline(cpu)) 75 if (!need_resched() && !cpu_is_offline(cpu))
76 sun4v_cpu_yield(); 76 sun4v_cpu_yield();
77 77
78 /* Re-enable interrupts. */ 78 /* Re-enable interrupts. */
79 __asm__ __volatile__( 79 __asm__ __volatile__(
80 "rdpr %%pstate, %0\n\t" 80 "rdpr %%pstate, %0\n\t"
81 "or %0, %1, %0\n\t" 81 "or %0, %1, %0\n\t"
82 "wrpr %0, %%g0, %%pstate" 82 "wrpr %0, %%g0, %%pstate"
83 : "=&r" (pstate) 83 : "=&r" (pstate)
84 : "i" (PSTATE_IE)); 84 : "i" (PSTATE_IE));
85 } 85 }
86 86
87 set_thread_flag(TIF_POLLING_NRFLAG); 87 set_thread_flag(TIF_POLLING_NRFLAG);
88 } 88 }
89 89
90 /* The idle loop on sparc64. */ 90 /* The idle loop on sparc64. */
91 void cpu_idle(void) 91 void cpu_idle(void)
92 { 92 {
93 int cpu = smp_processor_id(); 93 int cpu = smp_processor_id();
94 94
95 set_thread_flag(TIF_POLLING_NRFLAG); 95 set_thread_flag(TIF_POLLING_NRFLAG);
96 96
97 while(1) { 97 while(1) {
98 tick_nohz_idle_enter(); 98 tick_nohz_idle_enter();
99 rcu_idle_enter(); 99 rcu_idle_enter();
100 100
101 while (!need_resched() && !cpu_is_offline(cpu)) 101 while (!need_resched() && !cpu_is_offline(cpu))
102 sparc64_yield(cpu); 102 sparc64_yield(cpu);
103 103
104 rcu_idle_exit(); 104 rcu_idle_exit();
105 tick_nohz_idle_exit(); 105 tick_nohz_idle_exit();
106 106
107 #ifdef CONFIG_HOTPLUG_CPU 107 #ifdef CONFIG_HOTPLUG_CPU
108 if (cpu_is_offline(cpu)) { 108 if (cpu_is_offline(cpu)) {
109 preempt_enable_no_resched(); 109 sched_preempt_enable_no_resched();
110 cpu_play_dead(); 110 cpu_play_dead();
111 } 111 }
112 #endif 112 #endif
113 schedule_preempt_disabled(); 113 schedule_preempt_disabled();
114 } 114 }
115 } 115 }
116 116
117 #ifdef CONFIG_COMPAT 117 #ifdef CONFIG_COMPAT
118 static void show_regwindow32(struct pt_regs *regs) 118 static void show_regwindow32(struct pt_regs *regs)
119 { 119 {
120 struct reg_window32 __user *rw; 120 struct reg_window32 __user *rw;
121 struct reg_window32 r_w; 121 struct reg_window32 r_w;
122 mm_segment_t old_fs; 122 mm_segment_t old_fs;
123 123
124 __asm__ __volatile__ ("flushw"); 124 __asm__ __volatile__ ("flushw");
125 rw = compat_ptr((unsigned)regs->u_regs[14]); 125 rw = compat_ptr((unsigned)regs->u_regs[14]);
126 old_fs = get_fs(); 126 old_fs = get_fs();
127 set_fs (USER_DS); 127 set_fs (USER_DS);
128 if (copy_from_user (&r_w, rw, sizeof(r_w))) { 128 if (copy_from_user (&r_w, rw, sizeof(r_w))) {
129 set_fs (old_fs); 129 set_fs (old_fs);
130 return; 130 return;
131 } 131 }
132 132
133 set_fs (old_fs); 133 set_fs (old_fs);
134 printk("l0: %08x l1: %08x l2: %08x l3: %08x " 134 printk("l0: %08x l1: %08x l2: %08x l3: %08x "
135 "l4: %08x l5: %08x l6: %08x l7: %08x\n", 135 "l4: %08x l5: %08x l6: %08x l7: %08x\n",
136 r_w.locals[0], r_w.locals[1], r_w.locals[2], r_w.locals[3], 136 r_w.locals[0], r_w.locals[1], r_w.locals[2], r_w.locals[3],
137 r_w.locals[4], r_w.locals[5], r_w.locals[6], r_w.locals[7]); 137 r_w.locals[4], r_w.locals[5], r_w.locals[6], r_w.locals[7]);
138 printk("i0: %08x i1: %08x i2: %08x i3: %08x " 138 printk("i0: %08x i1: %08x i2: %08x i3: %08x "
139 "i4: %08x i5: %08x i6: %08x i7: %08x\n", 139 "i4: %08x i5: %08x i6: %08x i7: %08x\n",
140 r_w.ins[0], r_w.ins[1], r_w.ins[2], r_w.ins[3], 140 r_w.ins[0], r_w.ins[1], r_w.ins[2], r_w.ins[3],
141 r_w.ins[4], r_w.ins[5], r_w.ins[6], r_w.ins[7]); 141 r_w.ins[4], r_w.ins[5], r_w.ins[6], r_w.ins[7]);
142 } 142 }
143 #else 143 #else
144 #define show_regwindow32(regs) do { } while (0) 144 #define show_regwindow32(regs) do { } while (0)
145 #endif 145 #endif
146 146
147 static void show_regwindow(struct pt_regs *regs) 147 static void show_regwindow(struct pt_regs *regs)
148 { 148 {
149 struct reg_window __user *rw; 149 struct reg_window __user *rw;
150 struct reg_window *rwk; 150 struct reg_window *rwk;
151 struct reg_window r_w; 151 struct reg_window r_w;
152 mm_segment_t old_fs; 152 mm_segment_t old_fs;
153 153
154 if ((regs->tstate & TSTATE_PRIV) || !(test_thread_flag(TIF_32BIT))) { 154 if ((regs->tstate & TSTATE_PRIV) || !(test_thread_flag(TIF_32BIT))) {
155 __asm__ __volatile__ ("flushw"); 155 __asm__ __volatile__ ("flushw");
156 rw = (struct reg_window __user *) 156 rw = (struct reg_window __user *)
157 (regs->u_regs[14] + STACK_BIAS); 157 (regs->u_regs[14] + STACK_BIAS);
158 rwk = (struct reg_window *) 158 rwk = (struct reg_window *)
159 (regs->u_regs[14] + STACK_BIAS); 159 (regs->u_regs[14] + STACK_BIAS);
160 if (!(regs->tstate & TSTATE_PRIV)) { 160 if (!(regs->tstate & TSTATE_PRIV)) {
161 old_fs = get_fs(); 161 old_fs = get_fs();
162 set_fs (USER_DS); 162 set_fs (USER_DS);
163 if (copy_from_user (&r_w, rw, sizeof(r_w))) { 163 if (copy_from_user (&r_w, rw, sizeof(r_w))) {
164 set_fs (old_fs); 164 set_fs (old_fs);
165 return; 165 return;
166 } 166 }
167 rwk = &r_w; 167 rwk = &r_w;
168 set_fs (old_fs); 168 set_fs (old_fs);
169 } 169 }
170 } else { 170 } else {
171 show_regwindow32(regs); 171 show_regwindow32(regs);
172 return; 172 return;
173 } 173 }
174 printk("l0: %016lx l1: %016lx l2: %016lx l3: %016lx\n", 174 printk("l0: %016lx l1: %016lx l2: %016lx l3: %016lx\n",
175 rwk->locals[0], rwk->locals[1], rwk->locals[2], rwk->locals[3]); 175 rwk->locals[0], rwk->locals[1], rwk->locals[2], rwk->locals[3]);
176 printk("l4: %016lx l5: %016lx l6: %016lx l7: %016lx\n", 176 printk("l4: %016lx l5: %016lx l6: %016lx l7: %016lx\n",
177 rwk->locals[4], rwk->locals[5], rwk->locals[6], rwk->locals[7]); 177 rwk->locals[4], rwk->locals[5], rwk->locals[6], rwk->locals[7]);
178 printk("i0: %016lx i1: %016lx i2: %016lx i3: %016lx\n", 178 printk("i0: %016lx i1: %016lx i2: %016lx i3: %016lx\n",
179 rwk->ins[0], rwk->ins[1], rwk->ins[2], rwk->ins[3]); 179 rwk->ins[0], rwk->ins[1], rwk->ins[2], rwk->ins[3]);
180 printk("i4: %016lx i5: %016lx i6: %016lx i7: %016lx\n", 180 printk("i4: %016lx i5: %016lx i6: %016lx i7: %016lx\n",
181 rwk->ins[4], rwk->ins[5], rwk->ins[6], rwk->ins[7]); 181 rwk->ins[4], rwk->ins[5], rwk->ins[6], rwk->ins[7]);
182 if (regs->tstate & TSTATE_PRIV) 182 if (regs->tstate & TSTATE_PRIV)
183 printk("I7: <%pS>\n", (void *) rwk->ins[7]); 183 printk("I7: <%pS>\n", (void *) rwk->ins[7]);
184 } 184 }
185 185
186 void show_regs(struct pt_regs *regs) 186 void show_regs(struct pt_regs *regs)
187 { 187 {
188 printk("TSTATE: %016lx TPC: %016lx TNPC: %016lx Y: %08x %s\n", regs->tstate, 188 printk("TSTATE: %016lx TPC: %016lx TNPC: %016lx Y: %08x %s\n", regs->tstate,
189 regs->tpc, regs->tnpc, regs->y, print_tainted()); 189 regs->tpc, regs->tnpc, regs->y, print_tainted());
190 printk("TPC: <%pS>\n", (void *) regs->tpc); 190 printk("TPC: <%pS>\n", (void *) regs->tpc);
191 printk("g0: %016lx g1: %016lx g2: %016lx g3: %016lx\n", 191 printk("g0: %016lx g1: %016lx g2: %016lx g3: %016lx\n",
192 regs->u_regs[0], regs->u_regs[1], regs->u_regs[2], 192 regs->u_regs[0], regs->u_regs[1], regs->u_regs[2],
193 regs->u_regs[3]); 193 regs->u_regs[3]);
194 printk("g4: %016lx g5: %016lx g6: %016lx g7: %016lx\n", 194 printk("g4: %016lx g5: %016lx g6: %016lx g7: %016lx\n",
195 regs->u_regs[4], regs->u_regs[5], regs->u_regs[6], 195 regs->u_regs[4], regs->u_regs[5], regs->u_regs[6],
196 regs->u_regs[7]); 196 regs->u_regs[7]);
197 printk("o0: %016lx o1: %016lx o2: %016lx o3: %016lx\n", 197 printk("o0: %016lx o1: %016lx o2: %016lx o3: %016lx\n",
198 regs->u_regs[8], regs->u_regs[9], regs->u_regs[10], 198 regs->u_regs[8], regs->u_regs[9], regs->u_regs[10],
199 regs->u_regs[11]); 199 regs->u_regs[11]);
200 printk("o4: %016lx o5: %016lx sp: %016lx ret_pc: %016lx\n", 200 printk("o4: %016lx o5: %016lx sp: %016lx ret_pc: %016lx\n",
201 regs->u_regs[12], regs->u_regs[13], regs->u_regs[14], 201 regs->u_regs[12], regs->u_regs[13], regs->u_regs[14],
202 regs->u_regs[15]); 202 regs->u_regs[15]);
203 printk("RPC: <%pS>\n", (void *) regs->u_regs[15]); 203 printk("RPC: <%pS>\n", (void *) regs->u_regs[15]);
204 show_regwindow(regs); 204 show_regwindow(regs);
205 show_stack(current, (unsigned long *) regs->u_regs[UREG_FP]); 205 show_stack(current, (unsigned long *) regs->u_regs[UREG_FP]);
206 } 206 }
207 207
208 struct global_reg_snapshot global_reg_snapshot[NR_CPUS]; 208 struct global_reg_snapshot global_reg_snapshot[NR_CPUS];
209 static DEFINE_SPINLOCK(global_reg_snapshot_lock); 209 static DEFINE_SPINLOCK(global_reg_snapshot_lock);
210 210
211 static void __global_reg_self(struct thread_info *tp, struct pt_regs *regs, 211 static void __global_reg_self(struct thread_info *tp, struct pt_regs *regs,
212 int this_cpu) 212 int this_cpu)
213 { 213 {
214 flushw_all(); 214 flushw_all();
215 215
216 global_reg_snapshot[this_cpu].tstate = regs->tstate; 216 global_reg_snapshot[this_cpu].tstate = regs->tstate;
217 global_reg_snapshot[this_cpu].tpc = regs->tpc; 217 global_reg_snapshot[this_cpu].tpc = regs->tpc;
218 global_reg_snapshot[this_cpu].tnpc = regs->tnpc; 218 global_reg_snapshot[this_cpu].tnpc = regs->tnpc;
219 global_reg_snapshot[this_cpu].o7 = regs->u_regs[UREG_I7]; 219 global_reg_snapshot[this_cpu].o7 = regs->u_regs[UREG_I7];
220 220
221 if (regs->tstate & TSTATE_PRIV) { 221 if (regs->tstate & TSTATE_PRIV) {
222 struct reg_window *rw; 222 struct reg_window *rw;
223 223
224 rw = (struct reg_window *) 224 rw = (struct reg_window *)
225 (regs->u_regs[UREG_FP] + STACK_BIAS); 225 (regs->u_regs[UREG_FP] + STACK_BIAS);
226 if (kstack_valid(tp, (unsigned long) rw)) { 226 if (kstack_valid(tp, (unsigned long) rw)) {
227 global_reg_snapshot[this_cpu].i7 = rw->ins[7]; 227 global_reg_snapshot[this_cpu].i7 = rw->ins[7];
228 rw = (struct reg_window *) 228 rw = (struct reg_window *)
229 (rw->ins[6] + STACK_BIAS); 229 (rw->ins[6] + STACK_BIAS);
230 if (kstack_valid(tp, (unsigned long) rw)) 230 if (kstack_valid(tp, (unsigned long) rw))
231 global_reg_snapshot[this_cpu].rpc = rw->ins[7]; 231 global_reg_snapshot[this_cpu].rpc = rw->ins[7];
232 } 232 }
233 } else { 233 } else {
234 global_reg_snapshot[this_cpu].i7 = 0; 234 global_reg_snapshot[this_cpu].i7 = 0;
235 global_reg_snapshot[this_cpu].rpc = 0; 235 global_reg_snapshot[this_cpu].rpc = 0;
236 } 236 }
237 global_reg_snapshot[this_cpu].thread = tp; 237 global_reg_snapshot[this_cpu].thread = tp;
238 } 238 }
239 239
240 /* In order to avoid hangs we do not try to synchronize with the 240 /* In order to avoid hangs we do not try to synchronize with the
241 * global register dump client cpus. The last store they make is to 241 * global register dump client cpus. The last store they make is to
242 * the thread pointer, so do a short poll waiting for that to become 242 * the thread pointer, so do a short poll waiting for that to become
243 * non-NULL. 243 * non-NULL.
244 */ 244 */
245 static void __global_reg_poll(struct global_reg_snapshot *gp) 245 static void __global_reg_poll(struct global_reg_snapshot *gp)
246 { 246 {
247 int limit = 0; 247 int limit = 0;
248 248
249 while (!gp->thread && ++limit < 100) { 249 while (!gp->thread && ++limit < 100) {
250 barrier(); 250 barrier();
251 udelay(1); 251 udelay(1);
252 } 252 }
253 } 253 }
254 254
255 void arch_trigger_all_cpu_backtrace(void) 255 void arch_trigger_all_cpu_backtrace(void)
256 { 256 {
257 struct thread_info *tp = current_thread_info(); 257 struct thread_info *tp = current_thread_info();
258 struct pt_regs *regs = get_irq_regs(); 258 struct pt_regs *regs = get_irq_regs();
259 unsigned long flags; 259 unsigned long flags;
260 int this_cpu, cpu; 260 int this_cpu, cpu;
261 261
262 if (!regs) 262 if (!regs)
263 regs = tp->kregs; 263 regs = tp->kregs;
264 264
265 spin_lock_irqsave(&global_reg_snapshot_lock, flags); 265 spin_lock_irqsave(&global_reg_snapshot_lock, flags);
266 266
267 memset(global_reg_snapshot, 0, sizeof(global_reg_snapshot)); 267 memset(global_reg_snapshot, 0, sizeof(global_reg_snapshot));
268 268
269 this_cpu = raw_smp_processor_id(); 269 this_cpu = raw_smp_processor_id();
270 270
271 __global_reg_self(tp, regs, this_cpu); 271 __global_reg_self(tp, regs, this_cpu);
272 272
273 smp_fetch_global_regs(); 273 smp_fetch_global_regs();
274 274
275 for_each_online_cpu(cpu) { 275 for_each_online_cpu(cpu) {
276 struct global_reg_snapshot *gp = &global_reg_snapshot[cpu]; 276 struct global_reg_snapshot *gp = &global_reg_snapshot[cpu];
277 277
278 __global_reg_poll(gp); 278 __global_reg_poll(gp);
279 279
280 tp = gp->thread; 280 tp = gp->thread;
281 printk("%c CPU[%3d]: TSTATE[%016lx] TPC[%016lx] TNPC[%016lx] TASK[%s:%d]\n", 281 printk("%c CPU[%3d]: TSTATE[%016lx] TPC[%016lx] TNPC[%016lx] TASK[%s:%d]\n",
282 (cpu == this_cpu ? '*' : ' '), cpu, 282 (cpu == this_cpu ? '*' : ' '), cpu,
283 gp->tstate, gp->tpc, gp->tnpc, 283 gp->tstate, gp->tpc, gp->tnpc,
284 ((tp && tp->task) ? tp->task->comm : "NULL"), 284 ((tp && tp->task) ? tp->task->comm : "NULL"),
285 ((tp && tp->task) ? tp->task->pid : -1)); 285 ((tp && tp->task) ? tp->task->pid : -1));
286 286
287 if (gp->tstate & TSTATE_PRIV) { 287 if (gp->tstate & TSTATE_PRIV) {
288 printk(" TPC[%pS] O7[%pS] I7[%pS] RPC[%pS]\n", 288 printk(" TPC[%pS] O7[%pS] I7[%pS] RPC[%pS]\n",
289 (void *) gp->tpc, 289 (void *) gp->tpc,
290 (void *) gp->o7, 290 (void *) gp->o7,
291 (void *) gp->i7, 291 (void *) gp->i7,
292 (void *) gp->rpc); 292 (void *) gp->rpc);
293 } else { 293 } else {
294 printk(" TPC[%lx] O7[%lx] I7[%lx] RPC[%lx]\n", 294 printk(" TPC[%lx] O7[%lx] I7[%lx] RPC[%lx]\n",
295 gp->tpc, gp->o7, gp->i7, gp->rpc); 295 gp->tpc, gp->o7, gp->i7, gp->rpc);
296 } 296 }
297 } 297 }
298 298
299 memset(global_reg_snapshot, 0, sizeof(global_reg_snapshot)); 299 memset(global_reg_snapshot, 0, sizeof(global_reg_snapshot));
300 300
301 spin_unlock_irqrestore(&global_reg_snapshot_lock, flags); 301 spin_unlock_irqrestore(&global_reg_snapshot_lock, flags);
302 } 302 }
303 303
304 #ifdef CONFIG_MAGIC_SYSRQ 304 #ifdef CONFIG_MAGIC_SYSRQ
305 305
306 static void sysrq_handle_globreg(int key) 306 static void sysrq_handle_globreg(int key)
307 { 307 {
308 arch_trigger_all_cpu_backtrace(); 308 arch_trigger_all_cpu_backtrace();
309 } 309 }
310 310
311 static struct sysrq_key_op sparc_globalreg_op = { 311 static struct sysrq_key_op sparc_globalreg_op = {
312 .handler = sysrq_handle_globreg, 312 .handler = sysrq_handle_globreg,
313 .help_msg = "Globalregs", 313 .help_msg = "Globalregs",
314 .action_msg = "Show Global CPU Regs", 314 .action_msg = "Show Global CPU Regs",
315 }; 315 };
316 316
317 static int __init sparc_globreg_init(void) 317 static int __init sparc_globreg_init(void)
318 { 318 {
319 return register_sysrq_key('y', &sparc_globalreg_op); 319 return register_sysrq_key('y', &sparc_globalreg_op);
320 } 320 }
321 321
322 core_initcall(sparc_globreg_init); 322 core_initcall(sparc_globreg_init);
323 323
324 #endif 324 #endif
325 325
326 unsigned long thread_saved_pc(struct task_struct *tsk) 326 unsigned long thread_saved_pc(struct task_struct *tsk)
327 { 327 {
328 struct thread_info *ti = task_thread_info(tsk); 328 struct thread_info *ti = task_thread_info(tsk);
329 unsigned long ret = 0xdeadbeefUL; 329 unsigned long ret = 0xdeadbeefUL;
330 330
331 if (ti && ti->ksp) { 331 if (ti && ti->ksp) {
332 unsigned long *sp; 332 unsigned long *sp;
333 sp = (unsigned long *)(ti->ksp + STACK_BIAS); 333 sp = (unsigned long *)(ti->ksp + STACK_BIAS);
334 if (((unsigned long)sp & (sizeof(long) - 1)) == 0UL && 334 if (((unsigned long)sp & (sizeof(long) - 1)) == 0UL &&
335 sp[14]) { 335 sp[14]) {
336 unsigned long *fp; 336 unsigned long *fp;
337 fp = (unsigned long *)(sp[14] + STACK_BIAS); 337 fp = (unsigned long *)(sp[14] + STACK_BIAS);
338 if (((unsigned long)fp & (sizeof(long) - 1)) == 0UL) 338 if (((unsigned long)fp & (sizeof(long) - 1)) == 0UL)
339 ret = fp[15]; 339 ret = fp[15];
340 } 340 }
341 } 341 }
342 return ret; 342 return ret;
343 } 343 }
344 344
345 /* Free current thread data structures etc.. */ 345 /* Free current thread data structures etc.. */
346 void exit_thread(void) 346 void exit_thread(void)
347 { 347 {
348 struct thread_info *t = current_thread_info(); 348 struct thread_info *t = current_thread_info();
349 349
350 if (t->utraps) { 350 if (t->utraps) {
351 if (t->utraps[0] < 2) 351 if (t->utraps[0] < 2)
352 kfree (t->utraps); 352 kfree (t->utraps);
353 else 353 else
354 t->utraps[0]--; 354 t->utraps[0]--;
355 } 355 }
356 } 356 }
357 357
358 void flush_thread(void) 358 void flush_thread(void)
359 { 359 {
360 struct thread_info *t = current_thread_info(); 360 struct thread_info *t = current_thread_info();
361 struct mm_struct *mm; 361 struct mm_struct *mm;
362 362
363 mm = t->task->mm; 363 mm = t->task->mm;
364 if (mm) 364 if (mm)
365 tsb_context_switch(mm); 365 tsb_context_switch(mm);
366 366
367 set_thread_wsaved(0); 367 set_thread_wsaved(0);
368 368
369 /* Clear FPU register state. */ 369 /* Clear FPU register state. */
370 t->fpsaved[0] = 0; 370 t->fpsaved[0] = 0;
371 } 371 }
372 372
373 /* It's a bit more tricky when 64-bit tasks are involved... */ 373 /* It's a bit more tricky when 64-bit tasks are involved... */
374 static unsigned long clone_stackframe(unsigned long csp, unsigned long psp) 374 static unsigned long clone_stackframe(unsigned long csp, unsigned long psp)
375 { 375 {
376 unsigned long fp, distance, rval; 376 unsigned long fp, distance, rval;
377 377
378 if (!(test_thread_flag(TIF_32BIT))) { 378 if (!(test_thread_flag(TIF_32BIT))) {
379 csp += STACK_BIAS; 379 csp += STACK_BIAS;
380 psp += STACK_BIAS; 380 psp += STACK_BIAS;
381 __get_user(fp, &(((struct reg_window __user *)psp)->ins[6])); 381 __get_user(fp, &(((struct reg_window __user *)psp)->ins[6]));
382 fp += STACK_BIAS; 382 fp += STACK_BIAS;
383 } else 383 } else
384 __get_user(fp, &(((struct reg_window32 __user *)psp)->ins[6])); 384 __get_user(fp, &(((struct reg_window32 __user *)psp)->ins[6]));
385 385
386 /* Now align the stack as this is mandatory in the Sparc ABI 386 /* Now align the stack as this is mandatory in the Sparc ABI
387 * due to how register windows work. This hides the 387 * due to how register windows work. This hides the
388 * restriction from thread libraries etc. 388 * restriction from thread libraries etc.
389 */ 389 */
390 csp &= ~15UL; 390 csp &= ~15UL;
391 391
392 distance = fp - psp; 392 distance = fp - psp;
393 rval = (csp - distance); 393 rval = (csp - distance);
394 if (copy_in_user((void __user *) rval, (void __user *) psp, distance)) 394 if (copy_in_user((void __user *) rval, (void __user *) psp, distance))
395 rval = 0; 395 rval = 0;
396 else if (test_thread_flag(TIF_32BIT)) { 396 else if (test_thread_flag(TIF_32BIT)) {
397 if (put_user(((u32)csp), 397 if (put_user(((u32)csp),
398 &(((struct reg_window32 __user *)rval)->ins[6]))) 398 &(((struct reg_window32 __user *)rval)->ins[6])))
399 rval = 0; 399 rval = 0;
400 } else { 400 } else {
401 if (put_user(((u64)csp - STACK_BIAS), 401 if (put_user(((u64)csp - STACK_BIAS),
402 &(((struct reg_window __user *)rval)->ins[6]))) 402 &(((struct reg_window __user *)rval)->ins[6])))
403 rval = 0; 403 rval = 0;
404 else 404 else
405 rval = rval - STACK_BIAS; 405 rval = rval - STACK_BIAS;
406 } 406 }
407 407
408 return rval; 408 return rval;
409 } 409 }
410 410
411 /* Standard stuff. */ 411 /* Standard stuff. */
412 static inline void shift_window_buffer(int first_win, int last_win, 412 static inline void shift_window_buffer(int first_win, int last_win,
413 struct thread_info *t) 413 struct thread_info *t)
414 { 414 {
415 int i; 415 int i;
416 416
417 for (i = first_win; i < last_win; i++) { 417 for (i = first_win; i < last_win; i++) {
418 t->rwbuf_stkptrs[i] = t->rwbuf_stkptrs[i+1]; 418 t->rwbuf_stkptrs[i] = t->rwbuf_stkptrs[i+1];
419 memcpy(&t->reg_window[i], &t->reg_window[i+1], 419 memcpy(&t->reg_window[i], &t->reg_window[i+1],
420 sizeof(struct reg_window)); 420 sizeof(struct reg_window));
421 } 421 }
422 } 422 }
423 423
424 void synchronize_user_stack(void) 424 void synchronize_user_stack(void)
425 { 425 {
426 struct thread_info *t = current_thread_info(); 426 struct thread_info *t = current_thread_info();
427 unsigned long window; 427 unsigned long window;
428 428
429 flush_user_windows(); 429 flush_user_windows();
430 if ((window = get_thread_wsaved()) != 0) { 430 if ((window = get_thread_wsaved()) != 0) {
431 int winsize = sizeof(struct reg_window); 431 int winsize = sizeof(struct reg_window);
432 int bias = 0; 432 int bias = 0;
433 433
434 if (test_thread_flag(TIF_32BIT)) 434 if (test_thread_flag(TIF_32BIT))
435 winsize = sizeof(struct reg_window32); 435 winsize = sizeof(struct reg_window32);
436 else 436 else
437 bias = STACK_BIAS; 437 bias = STACK_BIAS;
438 438
439 window -= 1; 439 window -= 1;
440 do { 440 do {
441 unsigned long sp = (t->rwbuf_stkptrs[window] + bias); 441 unsigned long sp = (t->rwbuf_stkptrs[window] + bias);
442 struct reg_window *rwin = &t->reg_window[window]; 442 struct reg_window *rwin = &t->reg_window[window];
443 443
444 if (!copy_to_user((char __user *)sp, rwin, winsize)) { 444 if (!copy_to_user((char __user *)sp, rwin, winsize)) {
445 shift_window_buffer(window, get_thread_wsaved() - 1, t); 445 shift_window_buffer(window, get_thread_wsaved() - 1, t);
446 set_thread_wsaved(get_thread_wsaved() - 1); 446 set_thread_wsaved(get_thread_wsaved() - 1);
447 } 447 }
448 } while (window--); 448 } while (window--);
449 } 449 }
450 } 450 }
451 451
452 static void stack_unaligned(unsigned long sp) 452 static void stack_unaligned(unsigned long sp)
453 { 453 {
454 siginfo_t info; 454 siginfo_t info;
455 455
456 info.si_signo = SIGBUS; 456 info.si_signo = SIGBUS;
457 info.si_errno = 0; 457 info.si_errno = 0;
458 info.si_code = BUS_ADRALN; 458 info.si_code = BUS_ADRALN;
459 info.si_addr = (void __user *) sp; 459 info.si_addr = (void __user *) sp;
460 info.si_trapno = 0; 460 info.si_trapno = 0;
461 force_sig_info(SIGBUS, &info, current); 461 force_sig_info(SIGBUS, &info, current);
462 } 462 }
463 463
464 void fault_in_user_windows(void) 464 void fault_in_user_windows(void)
465 { 465 {
466 struct thread_info *t = current_thread_info(); 466 struct thread_info *t = current_thread_info();
467 unsigned long window; 467 unsigned long window;
468 int winsize = sizeof(struct reg_window); 468 int winsize = sizeof(struct reg_window);
469 int bias = 0; 469 int bias = 0;
470 470
471 if (test_thread_flag(TIF_32BIT)) 471 if (test_thread_flag(TIF_32BIT))
472 winsize = sizeof(struct reg_window32); 472 winsize = sizeof(struct reg_window32);
473 else 473 else
474 bias = STACK_BIAS; 474 bias = STACK_BIAS;
475 475
476 flush_user_windows(); 476 flush_user_windows();
477 window = get_thread_wsaved(); 477 window = get_thread_wsaved();
478 478
479 if (likely(window != 0)) { 479 if (likely(window != 0)) {
480 window -= 1; 480 window -= 1;
481 do { 481 do {
482 unsigned long sp = (t->rwbuf_stkptrs[window] + bias); 482 unsigned long sp = (t->rwbuf_stkptrs[window] + bias);
483 struct reg_window *rwin = &t->reg_window[window]; 483 struct reg_window *rwin = &t->reg_window[window];
484 484
485 if (unlikely(sp & 0x7UL)) 485 if (unlikely(sp & 0x7UL))
486 stack_unaligned(sp); 486 stack_unaligned(sp);
487 487
488 if (unlikely(copy_to_user((char __user *)sp, 488 if (unlikely(copy_to_user((char __user *)sp,
489 rwin, winsize))) 489 rwin, winsize)))
490 goto barf; 490 goto barf;
491 } while (window--); 491 } while (window--);
492 } 492 }
493 set_thread_wsaved(0); 493 set_thread_wsaved(0);
494 return; 494 return;
495 495
496 barf: 496 barf:
497 set_thread_wsaved(window + 1); 497 set_thread_wsaved(window + 1);
498 do_exit(SIGILL); 498 do_exit(SIGILL);
499 } 499 }
500 500
501 asmlinkage long sparc_do_fork(unsigned long clone_flags, 501 asmlinkage long sparc_do_fork(unsigned long clone_flags,
502 unsigned long stack_start, 502 unsigned long stack_start,
503 struct pt_regs *regs, 503 struct pt_regs *regs,
504 unsigned long stack_size) 504 unsigned long stack_size)
505 { 505 {
506 int __user *parent_tid_ptr, *child_tid_ptr; 506 int __user *parent_tid_ptr, *child_tid_ptr;
507 unsigned long orig_i1 = regs->u_regs[UREG_I1]; 507 unsigned long orig_i1 = regs->u_regs[UREG_I1];
508 long ret; 508 long ret;
509 509
510 #ifdef CONFIG_COMPAT 510 #ifdef CONFIG_COMPAT
511 if (test_thread_flag(TIF_32BIT)) { 511 if (test_thread_flag(TIF_32BIT)) {
512 parent_tid_ptr = compat_ptr(regs->u_regs[UREG_I2]); 512 parent_tid_ptr = compat_ptr(regs->u_regs[UREG_I2]);
513 child_tid_ptr = compat_ptr(regs->u_regs[UREG_I4]); 513 child_tid_ptr = compat_ptr(regs->u_regs[UREG_I4]);
514 } else 514 } else
515 #endif 515 #endif
516 { 516 {
517 parent_tid_ptr = (int __user *) regs->u_regs[UREG_I2]; 517 parent_tid_ptr = (int __user *) regs->u_regs[UREG_I2];
518 child_tid_ptr = (int __user *) regs->u_regs[UREG_I4]; 518 child_tid_ptr = (int __user *) regs->u_regs[UREG_I4];
519 } 519 }
520 520
521 ret = do_fork(clone_flags, stack_start, 521 ret = do_fork(clone_flags, stack_start,
522 regs, stack_size, 522 regs, stack_size,
523 parent_tid_ptr, child_tid_ptr); 523 parent_tid_ptr, child_tid_ptr);
524 524
525 /* If we get an error and potentially restart the system 525 /* If we get an error and potentially restart the system
526 * call, we're screwed because copy_thread() clobbered 526 * call, we're screwed because copy_thread() clobbered
527 * the parent's %o1. So detect that case and restore it 527 * the parent's %o1. So detect that case and restore it
528 * here. 528 * here.
529 */ 529 */
530 if ((unsigned long)ret >= -ERESTART_RESTARTBLOCK) 530 if ((unsigned long)ret >= -ERESTART_RESTARTBLOCK)
531 regs->u_regs[UREG_I1] = orig_i1; 531 regs->u_regs[UREG_I1] = orig_i1;
532 532
533 return ret; 533 return ret;
534 } 534 }
535 535
536 /* Copy a Sparc thread. The fork() return value conventions 536 /* Copy a Sparc thread. The fork() return value conventions
537 * under SunOS are nothing short of bletcherous: 537 * under SunOS are nothing short of bletcherous:
538 * Parent --> %o0 == childs pid, %o1 == 0 538 * Parent --> %o0 == childs pid, %o1 == 0
539 * Child --> %o0 == parents pid, %o1 == 1 539 * Child --> %o0 == parents pid, %o1 == 1
540 */ 540 */
541 int copy_thread(unsigned long clone_flags, unsigned long sp, 541 int copy_thread(unsigned long clone_flags, unsigned long sp,
542 unsigned long unused, 542 unsigned long unused,
543 struct task_struct *p, struct pt_regs *regs) 543 struct task_struct *p, struct pt_regs *regs)
544 { 544 {
545 struct thread_info *t = task_thread_info(p); 545 struct thread_info *t = task_thread_info(p);
546 struct sparc_stackf *parent_sf; 546 struct sparc_stackf *parent_sf;
547 unsigned long child_stack_sz; 547 unsigned long child_stack_sz;
548 char *child_trap_frame; 548 char *child_trap_frame;
549 int kernel_thread; 549 int kernel_thread;
550 550
551 kernel_thread = (regs->tstate & TSTATE_PRIV) ? 1 : 0; 551 kernel_thread = (regs->tstate & TSTATE_PRIV) ? 1 : 0;
552 parent_sf = ((struct sparc_stackf *) regs) - 1; 552 parent_sf = ((struct sparc_stackf *) regs) - 1;
553 553
554 /* Calculate offset to stack_frame & pt_regs */ 554 /* Calculate offset to stack_frame & pt_regs */
555 child_stack_sz = ((STACKFRAME_SZ + TRACEREG_SZ) + 555 child_stack_sz = ((STACKFRAME_SZ + TRACEREG_SZ) +
556 (kernel_thread ? STACKFRAME_SZ : 0)); 556 (kernel_thread ? STACKFRAME_SZ : 0));
557 child_trap_frame = (task_stack_page(p) + 557 child_trap_frame = (task_stack_page(p) +
558 (THREAD_SIZE - child_stack_sz)); 558 (THREAD_SIZE - child_stack_sz));
559 memcpy(child_trap_frame, parent_sf, child_stack_sz); 559 memcpy(child_trap_frame, parent_sf, child_stack_sz);
560 560
561 t->flags = (t->flags & ~((0xffUL << TI_FLAG_CWP_SHIFT) | 561 t->flags = (t->flags & ~((0xffUL << TI_FLAG_CWP_SHIFT) |
562 (0xffUL << TI_FLAG_CURRENT_DS_SHIFT))) | 562 (0xffUL << TI_FLAG_CURRENT_DS_SHIFT))) |
563 (((regs->tstate + 1) & TSTATE_CWP) << TI_FLAG_CWP_SHIFT); 563 (((regs->tstate + 1) & TSTATE_CWP) << TI_FLAG_CWP_SHIFT);
564 t->new_child = 1; 564 t->new_child = 1;
565 t->ksp = ((unsigned long) child_trap_frame) - STACK_BIAS; 565 t->ksp = ((unsigned long) child_trap_frame) - STACK_BIAS;
566 t->kregs = (struct pt_regs *) (child_trap_frame + 566 t->kregs = (struct pt_regs *) (child_trap_frame +
567 sizeof(struct sparc_stackf)); 567 sizeof(struct sparc_stackf));
568 t->fpsaved[0] = 0; 568 t->fpsaved[0] = 0;
569 569
570 if (kernel_thread) { 570 if (kernel_thread) {
571 struct sparc_stackf *child_sf = (struct sparc_stackf *) 571 struct sparc_stackf *child_sf = (struct sparc_stackf *)
572 (child_trap_frame + (STACKFRAME_SZ + TRACEREG_SZ)); 572 (child_trap_frame + (STACKFRAME_SZ + TRACEREG_SZ));
573 573
574 /* Zero terminate the stack backtrace. */ 574 /* Zero terminate the stack backtrace. */
575 child_sf->fp = NULL; 575 child_sf->fp = NULL;
576 t->kregs->u_regs[UREG_FP] = 576 t->kregs->u_regs[UREG_FP] =
577 ((unsigned long) child_sf) - STACK_BIAS; 577 ((unsigned long) child_sf) - STACK_BIAS;
578 578
579 t->flags |= ((long)ASI_P << TI_FLAG_CURRENT_DS_SHIFT); 579 t->flags |= ((long)ASI_P << TI_FLAG_CURRENT_DS_SHIFT);
580 t->kregs->u_regs[UREG_G6] = (unsigned long) t; 580 t->kregs->u_regs[UREG_G6] = (unsigned long) t;
581 t->kregs->u_regs[UREG_G4] = (unsigned long) t->task; 581 t->kregs->u_regs[UREG_G4] = (unsigned long) t->task;
582 } else { 582 } else {
583 if (t->flags & _TIF_32BIT) { 583 if (t->flags & _TIF_32BIT) {
584 sp &= 0x00000000ffffffffUL; 584 sp &= 0x00000000ffffffffUL;
585 regs->u_regs[UREG_FP] &= 0x00000000ffffffffUL; 585 regs->u_regs[UREG_FP] &= 0x00000000ffffffffUL;
586 } 586 }
587 t->kregs->u_regs[UREG_FP] = sp; 587 t->kregs->u_regs[UREG_FP] = sp;
588 t->flags |= ((long)ASI_AIUS << TI_FLAG_CURRENT_DS_SHIFT); 588 t->flags |= ((long)ASI_AIUS << TI_FLAG_CURRENT_DS_SHIFT);
589 if (sp != regs->u_regs[UREG_FP]) { 589 if (sp != regs->u_regs[UREG_FP]) {
590 unsigned long csp; 590 unsigned long csp;
591 591
592 csp = clone_stackframe(sp, regs->u_regs[UREG_FP]); 592 csp = clone_stackframe(sp, regs->u_regs[UREG_FP]);
593 if (!csp) 593 if (!csp)
594 return -EFAULT; 594 return -EFAULT;
595 t->kregs->u_regs[UREG_FP] = csp; 595 t->kregs->u_regs[UREG_FP] = csp;
596 } 596 }
597 if (t->utraps) 597 if (t->utraps)
598 t->utraps[0]++; 598 t->utraps[0]++;
599 } 599 }
600 600
601 /* Set the return value for the child. */ 601 /* Set the return value for the child. */
602 t->kregs->u_regs[UREG_I0] = current->pid; 602 t->kregs->u_regs[UREG_I0] = current->pid;
603 t->kregs->u_regs[UREG_I1] = 1; 603 t->kregs->u_regs[UREG_I1] = 1;
604 604
605 /* Set the second return value for the parent. */ 605 /* Set the second return value for the parent. */
606 regs->u_regs[UREG_I1] = 0; 606 regs->u_regs[UREG_I1] = 0;
607 607
608 if (clone_flags & CLONE_SETTLS) 608 if (clone_flags & CLONE_SETTLS)
609 t->kregs->u_regs[UREG_G7] = regs->u_regs[UREG_I3]; 609 t->kregs->u_regs[UREG_G7] = regs->u_regs[UREG_I3];
610 610
611 return 0; 611 return 0;
612 } 612 }
613 613
614 /* 614 /*
615 * This is the mechanism for creating a new kernel thread. 615 * This is the mechanism for creating a new kernel thread.
616 * 616 *
617 * NOTE! Only a kernel-only process(ie the swapper or direct descendants 617 * NOTE! Only a kernel-only process(ie the swapper or direct descendants
618 * who haven't done an "execve()") should use this: it will work within 618 * who haven't done an "execve()") should use this: it will work within
619 * a system call from a "real" process, but the process memory space will 619 * a system call from a "real" process, but the process memory space will
620 * not be freed until both the parent and the child have exited. 620 * not be freed until both the parent and the child have exited.
621 */ 621 */
622 pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) 622 pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
623 { 623 {
624 long retval; 624 long retval;
625 625
626 /* If the parent runs before fn(arg) is called by the child, 626 /* If the parent runs before fn(arg) is called by the child,
627 * the input registers of this function can be clobbered. 627 * the input registers of this function can be clobbered.
628 * So we stash 'fn' and 'arg' into global registers which 628 * So we stash 'fn' and 'arg' into global registers which
629 * will not be modified by the parent. 629 * will not be modified by the parent.
630 */ 630 */
631 __asm__ __volatile__("mov %4, %%g2\n\t" /* Save FN into global */ 631 __asm__ __volatile__("mov %4, %%g2\n\t" /* Save FN into global */
632 "mov %5, %%g3\n\t" /* Save ARG into global */ 632 "mov %5, %%g3\n\t" /* Save ARG into global */
633 "mov %1, %%g1\n\t" /* Clone syscall nr. */ 633 "mov %1, %%g1\n\t" /* Clone syscall nr. */
634 "mov %2, %%o0\n\t" /* Clone flags. */ 634 "mov %2, %%o0\n\t" /* Clone flags. */
635 "mov 0, %%o1\n\t" /* usp arg == 0 */ 635 "mov 0, %%o1\n\t" /* usp arg == 0 */
636 "t 0x6d\n\t" /* Linux/Sparc clone(). */ 636 "t 0x6d\n\t" /* Linux/Sparc clone(). */
637 "brz,a,pn %%o1, 1f\n\t" /* Parent, just return. */ 637 "brz,a,pn %%o1, 1f\n\t" /* Parent, just return. */
638 " mov %%o0, %0\n\t" 638 " mov %%o0, %0\n\t"
639 "jmpl %%g2, %%o7\n\t" /* Call the function. */ 639 "jmpl %%g2, %%o7\n\t" /* Call the function. */
640 " mov %%g3, %%o0\n\t" /* Set arg in delay. */ 640 " mov %%g3, %%o0\n\t" /* Set arg in delay. */
641 "mov %3, %%g1\n\t" 641 "mov %3, %%g1\n\t"
642 "t 0x6d\n\t" /* Linux/Sparc exit(). */ 642 "t 0x6d\n\t" /* Linux/Sparc exit(). */
643 /* Notreached by child. */ 643 /* Notreached by child. */
644 "1:" : 644 "1:" :
645 "=r" (retval) : 645 "=r" (retval) :
646 "i" (__NR_clone), "r" (flags | CLONE_VM | CLONE_UNTRACED), 646 "i" (__NR_clone), "r" (flags | CLONE_VM | CLONE_UNTRACED),
647 "i" (__NR_exit), "r" (fn), "r" (arg) : 647 "i" (__NR_exit), "r" (fn), "r" (arg) :
648 "g1", "g2", "g3", "o0", "o1", "memory", "cc"); 648 "g1", "g2", "g3", "o0", "o1", "memory", "cc");
649 return retval; 649 return retval;
650 } 650 }
651 EXPORT_SYMBOL(kernel_thread); 651 EXPORT_SYMBOL(kernel_thread);
652 652
653 typedef struct { 653 typedef struct {
654 union { 654 union {
655 unsigned int pr_regs[32]; 655 unsigned int pr_regs[32];
656 unsigned long pr_dregs[16]; 656 unsigned long pr_dregs[16];
657 } pr_fr; 657 } pr_fr;
658 unsigned int __unused; 658 unsigned int __unused;
659 unsigned int pr_fsr; 659 unsigned int pr_fsr;
660 unsigned char pr_qcnt; 660 unsigned char pr_qcnt;
661 unsigned char pr_q_entrysize; 661 unsigned char pr_q_entrysize;
662 unsigned char pr_en; 662 unsigned char pr_en;
663 unsigned int pr_q[64]; 663 unsigned int pr_q[64];
664 } elf_fpregset_t32; 664 } elf_fpregset_t32;
665 665
666 /* 666 /*
667 * fill in the fpu structure for a core dump. 667 * fill in the fpu structure for a core dump.
668 */ 668 */
669 int dump_fpu (struct pt_regs * regs, elf_fpregset_t * fpregs) 669 int dump_fpu (struct pt_regs * regs, elf_fpregset_t * fpregs)
670 { 670 {
671 unsigned long *kfpregs = current_thread_info()->fpregs; 671 unsigned long *kfpregs = current_thread_info()->fpregs;
672 unsigned long fprs = current_thread_info()->fpsaved[0]; 672 unsigned long fprs = current_thread_info()->fpsaved[0];
673 673
674 if (test_thread_flag(TIF_32BIT)) { 674 if (test_thread_flag(TIF_32BIT)) {
675 elf_fpregset_t32 *fpregs32 = (elf_fpregset_t32 *)fpregs; 675 elf_fpregset_t32 *fpregs32 = (elf_fpregset_t32 *)fpregs;
676 676
677 if (fprs & FPRS_DL) 677 if (fprs & FPRS_DL)
678 memcpy(&fpregs32->pr_fr.pr_regs[0], kfpregs, 678 memcpy(&fpregs32->pr_fr.pr_regs[0], kfpregs,
679 sizeof(unsigned int) * 32); 679 sizeof(unsigned int) * 32);
680 else 680 else
681 memset(&fpregs32->pr_fr.pr_regs[0], 0, 681 memset(&fpregs32->pr_fr.pr_regs[0], 0,
682 sizeof(unsigned int) * 32); 682 sizeof(unsigned int) * 32);
683 fpregs32->pr_qcnt = 0; 683 fpregs32->pr_qcnt = 0;
684 fpregs32->pr_q_entrysize = 8; 684 fpregs32->pr_q_entrysize = 8;
685 memset(&fpregs32->pr_q[0], 0, 685 memset(&fpregs32->pr_q[0], 0,
686 (sizeof(unsigned int) * 64)); 686 (sizeof(unsigned int) * 64));
687 if (fprs & FPRS_FEF) { 687 if (fprs & FPRS_FEF) {
688 fpregs32->pr_fsr = (unsigned int) current_thread_info()->xfsr[0]; 688 fpregs32->pr_fsr = (unsigned int) current_thread_info()->xfsr[0];
689 fpregs32->pr_en = 1; 689 fpregs32->pr_en = 1;
690 } else { 690 } else {
691 fpregs32->pr_fsr = 0; 691 fpregs32->pr_fsr = 0;
692 fpregs32->pr_en = 0; 692 fpregs32->pr_en = 0;
693 } 693 }
694 } else { 694 } else {
695 if(fprs & FPRS_DL) 695 if(fprs & FPRS_DL)
696 memcpy(&fpregs->pr_regs[0], kfpregs, 696 memcpy(&fpregs->pr_regs[0], kfpregs,
697 sizeof(unsigned int) * 32); 697 sizeof(unsigned int) * 32);
698 else 698 else
699 memset(&fpregs->pr_regs[0], 0, 699 memset(&fpregs->pr_regs[0], 0,
700 sizeof(unsigned int) * 32); 700 sizeof(unsigned int) * 32);
701 if(fprs & FPRS_DU) 701 if(fprs & FPRS_DU)
702 memcpy(&fpregs->pr_regs[16], kfpregs+16, 702 memcpy(&fpregs->pr_regs[16], kfpregs+16,
703 sizeof(unsigned int) * 32); 703 sizeof(unsigned int) * 32);
704 else 704 else
705 memset(&fpregs->pr_regs[16], 0, 705 memset(&fpregs->pr_regs[16], 0,
706 sizeof(unsigned int) * 32); 706 sizeof(unsigned int) * 32);
707 if(fprs & FPRS_FEF) { 707 if(fprs & FPRS_FEF) {
708 fpregs->pr_fsr = current_thread_info()->xfsr[0]; 708 fpregs->pr_fsr = current_thread_info()->xfsr[0];
709 fpregs->pr_gsr = current_thread_info()->gsr[0]; 709 fpregs->pr_gsr = current_thread_info()->gsr[0];
710 } else { 710 } else {
711 fpregs->pr_fsr = fpregs->pr_gsr = 0; 711 fpregs->pr_fsr = fpregs->pr_gsr = 0;
712 } 712 }
713 fpregs->pr_fprs = fprs; 713 fpregs->pr_fprs = fprs;
714 } 714 }
715 return 1; 715 return 1;
716 } 716 }
717 EXPORT_SYMBOL(dump_fpu); 717 EXPORT_SYMBOL(dump_fpu);
718 718
719 /* 719 /*
720 * sparc_execve() executes a new program after the asm stub has set 720 * sparc_execve() executes a new program after the asm stub has set
721 * things up for us. This should basically do what I want it to. 721 * things up for us. This should basically do what I want it to.
722 */ 722 */
723 asmlinkage int sparc_execve(struct pt_regs *regs) 723 asmlinkage int sparc_execve(struct pt_regs *regs)
724 { 724 {
725 int error, base = 0; 725 int error, base = 0;
726 char *filename; 726 char *filename;
727 727
728 /* User register window flush is done by entry.S */ 728 /* User register window flush is done by entry.S */
729 729
730 /* Check for indirect call. */ 730 /* Check for indirect call. */
731 if (regs->u_regs[UREG_G1] == 0) 731 if (regs->u_regs[UREG_G1] == 0)
732 base = 1; 732 base = 1;
733 733
734 filename = getname((char __user *)regs->u_regs[base + UREG_I0]); 734 filename = getname((char __user *)regs->u_regs[base + UREG_I0]);
735 error = PTR_ERR(filename); 735 error = PTR_ERR(filename);
736 if (IS_ERR(filename)) 736 if (IS_ERR(filename))
737 goto out; 737 goto out;
738 error = do_execve(filename, 738 error = do_execve(filename,
739 (const char __user *const __user *) 739 (const char __user *const __user *)
740 regs->u_regs[base + UREG_I1], 740 regs->u_regs[base + UREG_I1],
741 (const char __user *const __user *) 741 (const char __user *const __user *)
742 regs->u_regs[base + UREG_I2], regs); 742 regs->u_regs[base + UREG_I2], regs);
743 putname(filename); 743 putname(filename);
744 if (!error) { 744 if (!error) {
745 fprs_write(0); 745 fprs_write(0);
746 current_thread_info()->xfsr[0] = 0; 746 current_thread_info()->xfsr[0] = 0;
747 current_thread_info()->fpsaved[0] = 0; 747 current_thread_info()->fpsaved[0] = 0;
748 regs->tstate &= ~TSTATE_PEF; 748 regs->tstate &= ~TSTATE_PEF;
749 } 749 }
750 out: 750 out:
751 return error; 751 return error;
752 } 752 }
753 753
754 unsigned long get_wchan(struct task_struct *task) 754 unsigned long get_wchan(struct task_struct *task)
755 { 755 {
756 unsigned long pc, fp, bias = 0; 756 unsigned long pc, fp, bias = 0;
757 struct thread_info *tp; 757 struct thread_info *tp;
758 struct reg_window *rw; 758 struct reg_window *rw;
759 unsigned long ret = 0; 759 unsigned long ret = 0;
760 int count = 0; 760 int count = 0;
761 761
762 if (!task || task == current || 762 if (!task || task == current ||
763 task->state == TASK_RUNNING) 763 task->state == TASK_RUNNING)
764 goto out; 764 goto out;
765 765
766 tp = task_thread_info(task); 766 tp = task_thread_info(task);
767 bias = STACK_BIAS; 767 bias = STACK_BIAS;
768 fp = task_thread_info(task)->ksp + bias; 768 fp = task_thread_info(task)->ksp + bias;
769 769
770 do { 770 do {
771 if (!kstack_valid(tp, fp)) 771 if (!kstack_valid(tp, fp))
772 break; 772 break;
773 rw = (struct reg_window *) fp; 773 rw = (struct reg_window *) fp;
774 pc = rw->ins[7]; 774 pc = rw->ins[7];
775 if (!in_sched_functions(pc)) { 775 if (!in_sched_functions(pc)) {
776 ret = pc; 776 ret = pc;
777 goto out; 777 goto out;
778 } 778 }
779 fp = rw->ins[6] + bias; 779 fp = rw->ins[6] + bias;
780 } while (++count < 16); 780 } while (++count < 16);
781 781
782 out: 782 out:
783 return ret; 783 return ret;
784 } 784 }
785 785
include/linux/preempt.h
1 #ifndef __LINUX_PREEMPT_H 1 #ifndef __LINUX_PREEMPT_H
2 #define __LINUX_PREEMPT_H 2 #define __LINUX_PREEMPT_H
3 3
4 /* 4 /*
5 * include/linux/preempt.h - macros for accessing and manipulating 5 * include/linux/preempt.h - macros for accessing and manipulating
6 * preempt_count (used for kernel preemption, interrupt count, etc.) 6 * preempt_count (used for kernel preemption, interrupt count, etc.)
7 */ 7 */
8 8
9 #include <linux/thread_info.h> 9 #include <linux/thread_info.h>
10 #include <linux/linkage.h> 10 #include <linux/linkage.h>
11 #include <linux/list.h> 11 #include <linux/list.h>
12 12
13 #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) 13 #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
14 extern void add_preempt_count(int val); 14 extern void add_preempt_count(int val);
15 extern void sub_preempt_count(int val); 15 extern void sub_preempt_count(int val);
16 #else 16 #else
17 # define add_preempt_count(val) do { preempt_count() += (val); } while (0) 17 # define add_preempt_count(val) do { preempt_count() += (val); } while (0)
18 # define sub_preempt_count(val) do { preempt_count() -= (val); } while (0) 18 # define sub_preempt_count(val) do { preempt_count() -= (val); } while (0)
19 #endif 19 #endif
20 20
21 #define inc_preempt_count() add_preempt_count(1) 21 #define inc_preempt_count() add_preempt_count(1)
22 #define dec_preempt_count() sub_preempt_count(1) 22 #define dec_preempt_count() sub_preempt_count(1)
23 23
24 #define preempt_count() (current_thread_info()->preempt_count) 24 #define preempt_count() (current_thread_info()->preempt_count)
25 25
26 #ifdef CONFIG_PREEMPT 26 #ifdef CONFIG_PREEMPT
27 27
28 asmlinkage void preempt_schedule(void); 28 asmlinkage void preempt_schedule(void);
29 29
30 #define preempt_check_resched() \ 30 #define preempt_check_resched() \
31 do { \ 31 do { \
32 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ 32 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
33 preempt_schedule(); \ 33 preempt_schedule(); \
34 } while (0) 34 } while (0)
35 35
36 #else /* !CONFIG_PREEMPT */ 36 #else /* !CONFIG_PREEMPT */
37 37
38 #define preempt_check_resched() do { } while (0) 38 #define preempt_check_resched() do { } while (0)
39 39
40 #endif /* CONFIG_PREEMPT */ 40 #endif /* CONFIG_PREEMPT */
41 41
42 42
43 #ifdef CONFIG_PREEMPT_COUNT 43 #ifdef CONFIG_PREEMPT_COUNT
44 44
45 #define preempt_disable() \ 45 #define preempt_disable() \
46 do { \ 46 do { \
47 inc_preempt_count(); \ 47 inc_preempt_count(); \
48 barrier(); \ 48 barrier(); \
49 } while (0) 49 } while (0)
50 50
51 #define preempt_enable_no_resched() \ 51 #define sched_preempt_enable_no_resched() \
52 do { \ 52 do { \
53 barrier(); \ 53 barrier(); \
54 dec_preempt_count(); \ 54 dec_preempt_count(); \
55 } while (0) 55 } while (0)
56 56
57 #define preempt_enable_no_resched() sched_preempt_enable_no_resched()
58
57 #define preempt_enable() \ 59 #define preempt_enable() \
58 do { \ 60 do { \
59 preempt_enable_no_resched(); \ 61 preempt_enable_no_resched(); \
60 barrier(); \ 62 barrier(); \
61 preempt_check_resched(); \ 63 preempt_check_resched(); \
62 } while (0) 64 } while (0)
63 65
64 /* For debugging and tracer internals only! */ 66 /* For debugging and tracer internals only! */
65 #define add_preempt_count_notrace(val) \ 67 #define add_preempt_count_notrace(val) \
66 do { preempt_count() += (val); } while (0) 68 do { preempt_count() += (val); } while (0)
67 #define sub_preempt_count_notrace(val) \ 69 #define sub_preempt_count_notrace(val) \
68 do { preempt_count() -= (val); } while (0) 70 do { preempt_count() -= (val); } while (0)
69 #define inc_preempt_count_notrace() add_preempt_count_notrace(1) 71 #define inc_preempt_count_notrace() add_preempt_count_notrace(1)
70 #define dec_preempt_count_notrace() sub_preempt_count_notrace(1) 72 #define dec_preempt_count_notrace() sub_preempt_count_notrace(1)
71 73
72 #define preempt_disable_notrace() \ 74 #define preempt_disable_notrace() \
73 do { \ 75 do { \
74 inc_preempt_count_notrace(); \ 76 inc_preempt_count_notrace(); \
75 barrier(); \ 77 barrier(); \
76 } while (0) 78 } while (0)
77 79
78 #define preempt_enable_no_resched_notrace() \ 80 #define preempt_enable_no_resched_notrace() \
79 do { \ 81 do { \
80 barrier(); \ 82 barrier(); \
81 dec_preempt_count_notrace(); \ 83 dec_preempt_count_notrace(); \
82 } while (0) 84 } while (0)
83 85
84 /* preempt_check_resched is OK to trace */ 86 /* preempt_check_resched is OK to trace */
85 #define preempt_enable_notrace() \ 87 #define preempt_enable_notrace() \
86 do { \ 88 do { \
87 preempt_enable_no_resched_notrace(); \ 89 preempt_enable_no_resched_notrace(); \
88 barrier(); \ 90 barrier(); \
89 preempt_check_resched(); \ 91 preempt_check_resched(); \
90 } while (0) 92 } while (0)
91 93
92 #else /* !CONFIG_PREEMPT_COUNT */ 94 #else /* !CONFIG_PREEMPT_COUNT */
93 95
94 #define preempt_disable() do { } while (0) 96 #define preempt_disable() do { } while (0)
97 #define sched_preempt_enable_no_resched() do { } while (0)
95 #define preempt_enable_no_resched() do { } while (0) 98 #define preempt_enable_no_resched() do { } while (0)
96 #define preempt_enable() do { } while (0) 99 #define preempt_enable() do { } while (0)
97 100
98 #define preempt_disable_notrace() do { } while (0) 101 #define preempt_disable_notrace() do { } while (0)
99 #define preempt_enable_no_resched_notrace() do { } while (0) 102 #define preempt_enable_no_resched_notrace() do { } while (0)
100 #define preempt_enable_notrace() do { } while (0) 103 #define preempt_enable_notrace() do { } while (0)
101 104
102 #endif /* CONFIG_PREEMPT_COUNT */ 105 #endif /* CONFIG_PREEMPT_COUNT */
103 106
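The hunk above is the substantive change in this header: the old preempt_enable_no_resched() body is renamed to sched_preempt_enable_no_resched(), and preempt_enable_no_resched() is kept as an alias that expands to it (the !CONFIG_PREEMPT_COUNT branch gains a matching no-op stub). Nothing changes functionally here; the new spelling only marks scheduler-internal call sites that drop the preempt count without re-checking TIF_NEED_RESCHED, typically because schedule() is about to be invoked anyway. A rough sketch of how the two spellings are intended to split across call sites (both example functions are illustrative, not part of this patch):

#include <linux/preempt.h>
#include <linux/sched.h>

/*
 * Scheduler-internal pattern: the preempt count is dropped without the
 * usual resched check because schedule() runs immediately afterwards,
 * so re-checking TIF_NEED_RESCHED here would be redundant.
 */
static void example_sched_side(void)
{
	preempt_disable();
	/* ... scheduler bookkeeping ... */
	sched_preempt_enable_no_resched();
	schedule();
}

/*
 * Everything else keeps the old spelling; it still expands to the same
 * macro here, but stays visually distinct from the scheduler sites.
 */
static void example_other_side(void)
{
	preempt_disable();
	/* ... short section that must not reschedule on the way out ... */
	preempt_enable_no_resched();
}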
104 #ifdef CONFIG_PREEMPT_NOTIFIERS 107 #ifdef CONFIG_PREEMPT_NOTIFIERS
105 108
106 struct preempt_notifier; 109 struct preempt_notifier;
107 110
108 /** 111 /**
109 * preempt_ops - notifiers called when a task is preempted and rescheduled 112 * preempt_ops - notifiers called when a task is preempted and rescheduled
110 * @sched_in: we're about to be rescheduled: 113 * @sched_in: we're about to be rescheduled:
111 * notifier: struct preempt_notifier for the task being scheduled 114 * notifier: struct preempt_notifier for the task being scheduled
112 * cpu: cpu we're scheduled on 115 * cpu: cpu we're scheduled on
113 * @sched_out: we've just been preempted 116 * @sched_out: we've just been preempted
114 * notifier: struct preempt_notifier for the task being preempted 117 * notifier: struct preempt_notifier for the task being preempted
115 * next: the task that's kicking us out 118 * next: the task that's kicking us out
116 * 119 *
117 * Please note that sched_in and out are called under different 120 * Please note that sched_in and out are called under different
118 * contexts. sched_out is called with rq lock held and irq disabled 121 * contexts. sched_out is called with rq lock held and irq disabled
119 * while sched_in is called without rq lock and irq enabled. This 122 * while sched_in is called without rq lock and irq enabled. This
120 * difference is intentional and depended upon by its users. 123 * difference is intentional and depended upon by its users.
121 */ 124 */
122 struct preempt_ops { 125 struct preempt_ops {
123 void (*sched_in)(struct preempt_notifier *notifier, int cpu); 126 void (*sched_in)(struct preempt_notifier *notifier, int cpu);
124 void (*sched_out)(struct preempt_notifier *notifier, 127 void (*sched_out)(struct preempt_notifier *notifier,
125 struct task_struct *next); 128 struct task_struct *next);
126 }; 129 };
127 130
128 /** 131 /**
129 * preempt_notifier - key for installing preemption notifiers 132 * preempt_notifier - key for installing preemption notifiers
130 * @link: internal use 133 * @link: internal use
131 * @ops: defines the notifier functions to be called 134 * @ops: defines the notifier functions to be called
132 * 135 *
133 * Usually used in conjunction with container_of(). 136 * Usually used in conjunction with container_of().
134 */ 137 */
135 struct preempt_notifier { 138 struct preempt_notifier {
136 struct hlist_node link; 139 struct hlist_node link;
137 struct preempt_ops *ops; 140 struct preempt_ops *ops;
138 }; 141 };
139 142
140 void preempt_notifier_register(struct preempt_notifier *notifier); 143 void preempt_notifier_register(struct preempt_notifier *notifier);
141 void preempt_notifier_unregister(struct preempt_notifier *notifier); 144 void preempt_notifier_unregister(struct preempt_notifier *notifier);
142 145
143 static inline void preempt_notifier_init(struct preempt_notifier *notifier, 146 static inline void preempt_notifier_init(struct preempt_notifier *notifier,
144 struct preempt_ops *ops) 147 struct preempt_ops *ops)
145 { 148 {
146 INIT_HLIST_NODE(&notifier->link); 149 INIT_HLIST_NODE(&notifier->link);
147 notifier->ops = ops; 150 notifier->ops = ops;
148 } 151 }
149 152
150 #endif 153 #endif
151 154
152 #endif /* __LINUX_PREEMPT_H */ 155 #endif /* __LINUX_PREEMPT_H */
153 156
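For completeness, the preempt-notifier API declared above (untouched by this patch) is normally used by embedding a struct preempt_notifier inside a larger per-task object and recovering that object with container_of() in the callbacks, as the kernel-doc suggests. A minimal sketch, where my_ctx and both callback bodies are illustrative only:

#include <linux/kernel.h>
#include <linux/preempt.h>
#include <linux/sched.h>

struct my_ctx {
	struct preempt_notifier	notifier;
	unsigned long		times_preempted;	/* illustrative bookkeeping */
};

/* About to run again on @cpu; called without the rq lock, irqs enabled. */
static void my_sched_in(struct preempt_notifier *pn, int cpu)
{
	struct my_ctx *ctx = container_of(pn, struct my_ctx, notifier);

	(void)ctx;	/* e.g. reload per-cpu state for this task */
}

/* Just preempted in favour of @next; called with the rq lock held, irqs off. */
static void my_sched_out(struct preempt_notifier *pn, struct task_struct *next)
{
	struct my_ctx *ctx = container_of(pn, struct my_ctx, notifier);

	ctx->times_preempted++;
}

static struct preempt_ops my_preempt_ops = {
	.sched_in	= my_sched_in,
	.sched_out	= my_sched_out,
};

/* Attach to the current task; undone later with preempt_notifier_unregister(). */
static void my_ctx_attach(struct my_ctx *ctx)
{
	preempt_notifier_init(&ctx->notifier, &my_preempt_ops);
	preempt_notifier_register(&ctx->notifier);
}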
1 /* 1 /*
2 * kernel/sched/core.c 2 * kernel/sched/core.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
6 * Copyright (C) 1991-2002 Linus Torvalds 6 * Copyright (C) 1991-2002 Linus Torvalds
7 * 7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and 8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe 9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff 10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli 11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: 12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with 13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices 14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions 15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a 19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas. 20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements 21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams 22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith 23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, 25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
26 * Thomas Gleixner, Mike Kravetz 26 * Thomas Gleixner, Mike Kravetz
27 */ 27 */
28 28
29 #include <linux/mm.h> 29 #include <linux/mm.h>
30 #include <linux/module.h> 30 #include <linux/module.h>
31 #include <linux/nmi.h> 31 #include <linux/nmi.h>
32 #include <linux/init.h> 32 #include <linux/init.h>
33 #include <linux/uaccess.h> 33 #include <linux/uaccess.h>
34 #include <linux/highmem.h> 34 #include <linux/highmem.h>
35 #include <asm/mmu_context.h> 35 #include <asm/mmu_context.h>
36 #include <linux/interrupt.h> 36 #include <linux/interrupt.h>
37 #include <linux/capability.h> 37 #include <linux/capability.h>
38 #include <linux/completion.h> 38 #include <linux/completion.h>
39 #include <linux/kernel_stat.h> 39 #include <linux/kernel_stat.h>
40 #include <linux/debug_locks.h> 40 #include <linux/debug_locks.h>
41 #include <linux/perf_event.h> 41 #include <linux/perf_event.h>
42 #include <linux/security.h> 42 #include <linux/security.h>
43 #include <linux/notifier.h> 43 #include <linux/notifier.h>
44 #include <linux/profile.h> 44 #include <linux/profile.h>
45 #include <linux/freezer.h> 45 #include <linux/freezer.h>
46 #include <linux/vmalloc.h> 46 #include <linux/vmalloc.h>
47 #include <linux/blkdev.h> 47 #include <linux/blkdev.h>
48 #include <linux/delay.h> 48 #include <linux/delay.h>
49 #include <linux/pid_namespace.h> 49 #include <linux/pid_namespace.h>
50 #include <linux/smp.h> 50 #include <linux/smp.h>
51 #include <linux/threads.h> 51 #include <linux/threads.h>
52 #include <linux/timer.h> 52 #include <linux/timer.h>
53 #include <linux/rcupdate.h> 53 #include <linux/rcupdate.h>
54 #include <linux/cpu.h> 54 #include <linux/cpu.h>
55 #include <linux/cpuset.h> 55 #include <linux/cpuset.h>
56 #include <linux/percpu.h> 56 #include <linux/percpu.h>
57 #include <linux/proc_fs.h> 57 #include <linux/proc_fs.h>
58 #include <linux/seq_file.h> 58 #include <linux/seq_file.h>
59 #include <linux/sysctl.h> 59 #include <linux/sysctl.h>
60 #include <linux/syscalls.h> 60 #include <linux/syscalls.h>
61 #include <linux/times.h> 61 #include <linux/times.h>
62 #include <linux/tsacct_kern.h> 62 #include <linux/tsacct_kern.h>
63 #include <linux/kprobes.h> 63 #include <linux/kprobes.h>
64 #include <linux/delayacct.h> 64 #include <linux/delayacct.h>
65 #include <linux/unistd.h> 65 #include <linux/unistd.h>
66 #include <linux/pagemap.h> 66 #include <linux/pagemap.h>
67 #include <linux/hrtimer.h> 67 #include <linux/hrtimer.h>
68 #include <linux/tick.h> 68 #include <linux/tick.h>
69 #include <linux/debugfs.h> 69 #include <linux/debugfs.h>
70 #include <linux/ctype.h> 70 #include <linux/ctype.h>
71 #include <linux/ftrace.h> 71 #include <linux/ftrace.h>
72 #include <linux/slab.h> 72 #include <linux/slab.h>
73 #include <linux/init_task.h> 73 #include <linux/init_task.h>
74 74
75 #include <asm/tlb.h> 75 #include <asm/tlb.h>
76 #include <asm/irq_regs.h> 76 #include <asm/irq_regs.h>
77 #include <asm/mutex.h> 77 #include <asm/mutex.h>
78 #ifdef CONFIG_PARAVIRT 78 #ifdef CONFIG_PARAVIRT
79 #include <asm/paravirt.h> 79 #include <asm/paravirt.h>
80 #endif 80 #endif
81 81
82 #include "sched.h" 82 #include "sched.h"
83 #include "../workqueue_sched.h" 83 #include "../workqueue_sched.h"
84 84
85 #define CREATE_TRACE_POINTS 85 #define CREATE_TRACE_POINTS
86 #include <trace/events/sched.h> 86 #include <trace/events/sched.h>
87 87
88 void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) 88 void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
89 { 89 {
90 unsigned long delta; 90 unsigned long delta;
91 ktime_t soft, hard, now; 91 ktime_t soft, hard, now;
92 92
93 for (;;) { 93 for (;;) {
94 if (hrtimer_active(period_timer)) 94 if (hrtimer_active(period_timer))
95 break; 95 break;
96 96
97 now = hrtimer_cb_get_time(period_timer); 97 now = hrtimer_cb_get_time(period_timer);
98 hrtimer_forward(period_timer, now, period); 98 hrtimer_forward(period_timer, now, period);
99 99
100 soft = hrtimer_get_softexpires(period_timer); 100 soft = hrtimer_get_softexpires(period_timer);
101 hard = hrtimer_get_expires(period_timer); 101 hard = hrtimer_get_expires(period_timer);
102 delta = ktime_to_ns(ktime_sub(hard, soft)); 102 delta = ktime_to_ns(ktime_sub(hard, soft));
103 __hrtimer_start_range_ns(period_timer, soft, delta, 103 __hrtimer_start_range_ns(period_timer, soft, delta,
104 HRTIMER_MODE_ABS_PINNED, 0); 104 HRTIMER_MODE_ABS_PINNED, 0);
105 } 105 }
106 } 106 }
107 107
108 DEFINE_MUTEX(sched_domains_mutex); 108 DEFINE_MUTEX(sched_domains_mutex);
109 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 109 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
110 110
111 static void update_rq_clock_task(struct rq *rq, s64 delta); 111 static void update_rq_clock_task(struct rq *rq, s64 delta);
112 112
113 void update_rq_clock(struct rq *rq) 113 void update_rq_clock(struct rq *rq)
114 { 114 {
115 s64 delta; 115 s64 delta;
116 116
117 if (rq->skip_clock_update > 0) 117 if (rq->skip_clock_update > 0)
118 return; 118 return;
119 119
120 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 120 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
121 rq->clock += delta; 121 rq->clock += delta;
122 update_rq_clock_task(rq, delta); 122 update_rq_clock_task(rq, delta);
123 } 123 }
124 124
125 /* 125 /*
126 * Debugging: various feature bits 126 * Debugging: various feature bits
127 */ 127 */
128 128
129 #define SCHED_FEAT(name, enabled) \ 129 #define SCHED_FEAT(name, enabled) \
130 (1UL << __SCHED_FEAT_##name) * enabled | 130 (1UL << __SCHED_FEAT_##name) * enabled |
131 131
132 const_debug unsigned int sysctl_sched_features = 132 const_debug unsigned int sysctl_sched_features =
133 #include "features.h" 133 #include "features.h"
134 0; 134 0;
135 135
136 #undef SCHED_FEAT 136 #undef SCHED_FEAT
137 137
138 #ifdef CONFIG_SCHED_DEBUG 138 #ifdef CONFIG_SCHED_DEBUG
139 #define SCHED_FEAT(name, enabled) \ 139 #define SCHED_FEAT(name, enabled) \
140 #name , 140 #name ,
141 141
142 static __read_mostly char *sched_feat_names[] = { 142 static __read_mostly char *sched_feat_names[] = {
143 #include "features.h" 143 #include "features.h"
144 NULL 144 NULL
145 }; 145 };
146 146
147 #undef SCHED_FEAT 147 #undef SCHED_FEAT
148 148
149 static int sched_feat_show(struct seq_file *m, void *v) 149 static int sched_feat_show(struct seq_file *m, void *v)
150 { 150 {
151 int i; 151 int i;
152 152
153 for (i = 0; i < __SCHED_FEAT_NR; i++) { 153 for (i = 0; i < __SCHED_FEAT_NR; i++) {
154 if (!(sysctl_sched_features & (1UL << i))) 154 if (!(sysctl_sched_features & (1UL << i)))
155 seq_puts(m, "NO_"); 155 seq_puts(m, "NO_");
156 seq_printf(m, "%s ", sched_feat_names[i]); 156 seq_printf(m, "%s ", sched_feat_names[i]);
157 } 157 }
158 seq_puts(m, "\n"); 158 seq_puts(m, "\n");
159 159
160 return 0; 160 return 0;
161 } 161 }
162 162
163 #ifdef HAVE_JUMP_LABEL 163 #ifdef HAVE_JUMP_LABEL
164 164
165 #define jump_label_key__true jump_label_key_enabled 165 #define jump_label_key__true jump_label_key_enabled
166 #define jump_label_key__false jump_label_key_disabled 166 #define jump_label_key__false jump_label_key_disabled
167 167
168 #define SCHED_FEAT(name, enabled) \ 168 #define SCHED_FEAT(name, enabled) \
169 jump_label_key__##enabled , 169 jump_label_key__##enabled ,
170 170
171 struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { 171 struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
172 #include "features.h" 172 #include "features.h"
173 }; 173 };
174 174
175 #undef SCHED_FEAT 175 #undef SCHED_FEAT
176 176
177 static void sched_feat_disable(int i) 177 static void sched_feat_disable(int i)
178 { 178 {
179 if (jump_label_enabled(&sched_feat_keys[i])) 179 if (jump_label_enabled(&sched_feat_keys[i]))
180 jump_label_dec(&sched_feat_keys[i]); 180 jump_label_dec(&sched_feat_keys[i]);
181 } 181 }
182 182
183 static void sched_feat_enable(int i) 183 static void sched_feat_enable(int i)
184 { 184 {
185 if (!jump_label_enabled(&sched_feat_keys[i])) 185 if (!jump_label_enabled(&sched_feat_keys[i]))
186 jump_label_inc(&sched_feat_keys[i]); 186 jump_label_inc(&sched_feat_keys[i]);
187 } 187 }
188 #else 188 #else
189 static void sched_feat_disable(int i) { }; 189 static void sched_feat_disable(int i) { };
190 static void sched_feat_enable(int i) { }; 190 static void sched_feat_enable(int i) { };
191 #endif /* HAVE_JUMP_LABEL */ 191 #endif /* HAVE_JUMP_LABEL */
192 192
193 static ssize_t 193 static ssize_t
194 sched_feat_write(struct file *filp, const char __user *ubuf, 194 sched_feat_write(struct file *filp, const char __user *ubuf,
195 size_t cnt, loff_t *ppos) 195 size_t cnt, loff_t *ppos)
196 { 196 {
197 char buf[64]; 197 char buf[64];
198 char *cmp; 198 char *cmp;
199 int neg = 0; 199 int neg = 0;
200 int i; 200 int i;
201 201
202 if (cnt > 63) 202 if (cnt > 63)
203 cnt = 63; 203 cnt = 63;
204 204
205 if (copy_from_user(&buf, ubuf, cnt)) 205 if (copy_from_user(&buf, ubuf, cnt))
206 return -EFAULT; 206 return -EFAULT;
207 207
208 buf[cnt] = 0; 208 buf[cnt] = 0;
209 cmp = strstrip(buf); 209 cmp = strstrip(buf);
210 210
211 if (strncmp(cmp, "NO_", 3) == 0) { 211 if (strncmp(cmp, "NO_", 3) == 0) {
212 neg = 1; 212 neg = 1;
213 cmp += 3; 213 cmp += 3;
214 } 214 }
215 215
216 for (i = 0; i < __SCHED_FEAT_NR; i++) { 216 for (i = 0; i < __SCHED_FEAT_NR; i++) {
217 if (strcmp(cmp, sched_feat_names[i]) == 0) { 217 if (strcmp(cmp, sched_feat_names[i]) == 0) {
218 if (neg) { 218 if (neg) {
219 sysctl_sched_features &= ~(1UL << i); 219 sysctl_sched_features &= ~(1UL << i);
220 sched_feat_disable(i); 220 sched_feat_disable(i);
221 } else { 221 } else {
222 sysctl_sched_features |= (1UL << i); 222 sysctl_sched_features |= (1UL << i);
223 sched_feat_enable(i); 223 sched_feat_enable(i);
224 } 224 }
225 break; 225 break;
226 } 226 }
227 } 227 }
228 228
229 if (i == __SCHED_FEAT_NR) 229 if (i == __SCHED_FEAT_NR)
230 return -EINVAL; 230 return -EINVAL;
231 231
232 *ppos += cnt; 232 *ppos += cnt;
233 233
234 return cnt; 234 return cnt;
235 } 235 }
236 236
237 static int sched_feat_open(struct inode *inode, struct file *filp) 237 static int sched_feat_open(struct inode *inode, struct file *filp)
238 { 238 {
239 return single_open(filp, sched_feat_show, NULL); 239 return single_open(filp, sched_feat_show, NULL);
240 } 240 }
241 241
242 static const struct file_operations sched_feat_fops = { 242 static const struct file_operations sched_feat_fops = {
243 .open = sched_feat_open, 243 .open = sched_feat_open,
244 .write = sched_feat_write, 244 .write = sched_feat_write,
245 .read = seq_read, 245 .read = seq_read,
246 .llseek = seq_lseek, 246 .llseek = seq_lseek,
247 .release = single_release, 247 .release = single_release,
248 }; 248 };
249 249
250 static __init int sched_init_debug(void) 250 static __init int sched_init_debug(void)
251 { 251 {
252 debugfs_create_file("sched_features", 0644, NULL, NULL, 252 debugfs_create_file("sched_features", 0644, NULL, NULL,
253 &sched_feat_fops); 253 &sched_feat_fops);
254 254
255 return 0; 255 return 0;
256 } 256 }
257 late_initcall(sched_init_debug); 257 late_initcall(sched_init_debug);
258 #endif /* CONFIG_SCHED_DEBUG */ 258 #endif /* CONFIG_SCHED_DEBUG */
259 259
260 /* 260 /*
261 * Number of tasks to iterate in a single balance run. 261 * Number of tasks to iterate in a single balance run.
262 * Limited because this is done with IRQs disabled. 262 * Limited because this is done with IRQs disabled.
263 */ 263 */
264 const_debug unsigned int sysctl_sched_nr_migrate = 32; 264 const_debug unsigned int sysctl_sched_nr_migrate = 32;
265 265
266 /* 266 /*
267 * period over which we average the RT time consumption, measured 267 * period over which we average the RT time consumption, measured
268 * in ms. 268 * in ms.
269 * 269 *
270 * default: 1s 270 * default: 1s
271 */ 271 */
272 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; 272 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
273 273
274 /* 274 /*
275 * period over which we measure -rt task cpu usage in us. 275 * period over which we measure -rt task cpu usage in us.
276 * default: 1s 276 * default: 1s
277 */ 277 */
278 unsigned int sysctl_sched_rt_period = 1000000; 278 unsigned int sysctl_sched_rt_period = 1000000;
279 279
280 __read_mostly int scheduler_running; 280 __read_mostly int scheduler_running;
281 281
282 /* 282 /*
283 * part of the period that we allow rt tasks to run in us. 283 * part of the period that we allow rt tasks to run in us.
284 * default: 0.95s 284 * default: 0.95s
285 */ 285 */
286 int sysctl_sched_rt_runtime = 950000; 286 int sysctl_sched_rt_runtime = 950000;
287 287
288 288
289 289
290 /* 290 /*
291 * __task_rq_lock - lock the rq @p resides on. 291 * __task_rq_lock - lock the rq @p resides on.
292 */ 292 */
293 static inline struct rq *__task_rq_lock(struct task_struct *p) 293 static inline struct rq *__task_rq_lock(struct task_struct *p)
294 __acquires(rq->lock) 294 __acquires(rq->lock)
295 { 295 {
296 struct rq *rq; 296 struct rq *rq;
297 297
298 lockdep_assert_held(&p->pi_lock); 298 lockdep_assert_held(&p->pi_lock);
299 299
300 for (;;) { 300 for (;;) {
301 rq = task_rq(p); 301 rq = task_rq(p);
302 raw_spin_lock(&rq->lock); 302 raw_spin_lock(&rq->lock);
303 if (likely(rq == task_rq(p))) 303 if (likely(rq == task_rq(p)))
304 return rq; 304 return rq;
305 raw_spin_unlock(&rq->lock); 305 raw_spin_unlock(&rq->lock);
306 } 306 }
307 } 307 }
308 308
309 /* 309 /*
310 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. 310 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
311 */ 311 */
312 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 312 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
313 __acquires(p->pi_lock) 313 __acquires(p->pi_lock)
314 __acquires(rq->lock) 314 __acquires(rq->lock)
315 { 315 {
316 struct rq *rq; 316 struct rq *rq;
317 317
318 for (;;) { 318 for (;;) {
319 raw_spin_lock_irqsave(&p->pi_lock, *flags); 319 raw_spin_lock_irqsave(&p->pi_lock, *flags);
320 rq = task_rq(p); 320 rq = task_rq(p);
321 raw_spin_lock(&rq->lock); 321 raw_spin_lock(&rq->lock);
322 if (likely(rq == task_rq(p))) 322 if (likely(rq == task_rq(p)))
323 return rq; 323 return rq;
324 raw_spin_unlock(&rq->lock); 324 raw_spin_unlock(&rq->lock);
325 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 325 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
326 } 326 }
327 } 327 }
328 328
329 static void __task_rq_unlock(struct rq *rq) 329 static void __task_rq_unlock(struct rq *rq)
330 __releases(rq->lock) 330 __releases(rq->lock)
331 { 331 {
332 raw_spin_unlock(&rq->lock); 332 raw_spin_unlock(&rq->lock);
333 } 333 }
334 334
335 static inline void 335 static inline void
336 task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) 336 task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
337 __releases(rq->lock) 337 __releases(rq->lock)
338 __releases(p->pi_lock) 338 __releases(p->pi_lock)
339 { 339 {
340 raw_spin_unlock(&rq->lock); 340 raw_spin_unlock(&rq->lock);
341 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 341 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
342 } 342 }
343 343
344 /* 344 /*
345 * this_rq_lock - lock this runqueue and disable interrupts. 345 * this_rq_lock - lock this runqueue and disable interrupts.
346 */ 346 */
347 static struct rq *this_rq_lock(void) 347 static struct rq *this_rq_lock(void)
348 __acquires(rq->lock) 348 __acquires(rq->lock)
349 { 349 {
350 struct rq *rq; 350 struct rq *rq;
351 351
352 local_irq_disable(); 352 local_irq_disable();
353 rq = this_rq(); 353 rq = this_rq();
354 raw_spin_lock(&rq->lock); 354 raw_spin_lock(&rq->lock);
355 355
356 return rq; 356 return rq;
357 } 357 }
358 358
359 #ifdef CONFIG_SCHED_HRTICK 359 #ifdef CONFIG_SCHED_HRTICK
360 /* 360 /*
361 * Use HR-timers to deliver accurate preemption points. 361 * Use HR-timers to deliver accurate preemption points.
362 * 362 *
363 * It's all a bit involved since we cannot program an hrt while holding the 363 * It's all a bit involved since we cannot program an hrt while holding the
364 * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a 364 * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
365 * reschedule event. 365 * reschedule event.
366 * 366 *
367 * When we get rescheduled we reprogram the hrtick_timer outside of the 367 * When we get rescheduled we reprogram the hrtick_timer outside of the
368 * rq->lock. 368 * rq->lock.
369 */ 369 */
370 370
371 static void hrtick_clear(struct rq *rq) 371 static void hrtick_clear(struct rq *rq)
372 { 372 {
373 if (hrtimer_active(&rq->hrtick_timer)) 373 if (hrtimer_active(&rq->hrtick_timer))
374 hrtimer_cancel(&rq->hrtick_timer); 374 hrtimer_cancel(&rq->hrtick_timer);
375 } 375 }
376 376
377 /* 377 /*
378 * High-resolution timer tick. 378 * High-resolution timer tick.
379 * Runs from hardirq context with interrupts disabled. 379 * Runs from hardirq context with interrupts disabled.
380 */ 380 */
381 static enum hrtimer_restart hrtick(struct hrtimer *timer) 381 static enum hrtimer_restart hrtick(struct hrtimer *timer)
382 { 382 {
383 struct rq *rq = container_of(timer, struct rq, hrtick_timer); 383 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
384 384
385 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 385 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
386 386
387 raw_spin_lock(&rq->lock); 387 raw_spin_lock(&rq->lock);
388 update_rq_clock(rq); 388 update_rq_clock(rq);
389 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 389 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
390 raw_spin_unlock(&rq->lock); 390 raw_spin_unlock(&rq->lock);
391 391
392 return HRTIMER_NORESTART; 392 return HRTIMER_NORESTART;
393 } 393 }
394 394
395 #ifdef CONFIG_SMP 395 #ifdef CONFIG_SMP
396 /* 396 /*
397 * called from hardirq (IPI) context 397 * called from hardirq (IPI) context
398 */ 398 */
399 static void __hrtick_start(void *arg) 399 static void __hrtick_start(void *arg)
400 { 400 {
401 struct rq *rq = arg; 401 struct rq *rq = arg;
402 402
403 raw_spin_lock(&rq->lock); 403 raw_spin_lock(&rq->lock);
404 hrtimer_restart(&rq->hrtick_timer); 404 hrtimer_restart(&rq->hrtick_timer);
405 rq->hrtick_csd_pending = 0; 405 rq->hrtick_csd_pending = 0;
406 raw_spin_unlock(&rq->lock); 406 raw_spin_unlock(&rq->lock);
407 } 407 }
408 408
409 /* 409 /*
410 * Called to set the hrtick timer state. 410 * Called to set the hrtick timer state.
411 * 411 *
412 * called with rq->lock held and irqs disabled 412 * called with rq->lock held and irqs disabled
413 */ 413 */
414 void hrtick_start(struct rq *rq, u64 delay) 414 void hrtick_start(struct rq *rq, u64 delay)
415 { 415 {
416 struct hrtimer *timer = &rq->hrtick_timer; 416 struct hrtimer *timer = &rq->hrtick_timer;
417 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 417 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
418 418
419 hrtimer_set_expires(timer, time); 419 hrtimer_set_expires(timer, time);
420 420
421 if (rq == this_rq()) { 421 if (rq == this_rq()) {
422 hrtimer_restart(timer); 422 hrtimer_restart(timer);
423 } else if (!rq->hrtick_csd_pending) { 423 } else if (!rq->hrtick_csd_pending) {
424 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); 424 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
425 rq->hrtick_csd_pending = 1; 425 rq->hrtick_csd_pending = 1;
426 } 426 }
427 } 427 }
428 428
429 static int 429 static int
430 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) 430 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
431 { 431 {
432 int cpu = (int)(long)hcpu; 432 int cpu = (int)(long)hcpu;
433 433
434 switch (action) { 434 switch (action) {
435 case CPU_UP_CANCELED: 435 case CPU_UP_CANCELED:
436 case CPU_UP_CANCELED_FROZEN: 436 case CPU_UP_CANCELED_FROZEN:
437 case CPU_DOWN_PREPARE: 437 case CPU_DOWN_PREPARE:
438 case CPU_DOWN_PREPARE_FROZEN: 438 case CPU_DOWN_PREPARE_FROZEN:
439 case CPU_DEAD: 439 case CPU_DEAD:
440 case CPU_DEAD_FROZEN: 440 case CPU_DEAD_FROZEN:
441 hrtick_clear(cpu_rq(cpu)); 441 hrtick_clear(cpu_rq(cpu));
442 return NOTIFY_OK; 442 return NOTIFY_OK;
443 } 443 }
444 444
445 return NOTIFY_DONE; 445 return NOTIFY_DONE;
446 } 446 }
447 447
448 static __init void init_hrtick(void) 448 static __init void init_hrtick(void)
449 { 449 {
450 hotcpu_notifier(hotplug_hrtick, 0); 450 hotcpu_notifier(hotplug_hrtick, 0);
451 } 451 }
452 #else 452 #else
453 /* 453 /*
454 * Called to set the hrtick timer state. 454 * Called to set the hrtick timer state.
455 * 455 *
456 * called with rq->lock held and irqs disabled 456 * called with rq->lock held and irqs disabled
457 */ 457 */
458 void hrtick_start(struct rq *rq, u64 delay) 458 void hrtick_start(struct rq *rq, u64 delay)
459 { 459 {
460 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 460 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
461 HRTIMER_MODE_REL_PINNED, 0); 461 HRTIMER_MODE_REL_PINNED, 0);
462 } 462 }
463 463
464 static inline void init_hrtick(void) 464 static inline void init_hrtick(void)
465 { 465 {
466 } 466 }
467 #endif /* CONFIG_SMP */ 467 #endif /* CONFIG_SMP */
468 468
469 static void init_rq_hrtick(struct rq *rq) 469 static void init_rq_hrtick(struct rq *rq)
470 { 470 {
471 #ifdef CONFIG_SMP 471 #ifdef CONFIG_SMP
472 rq->hrtick_csd_pending = 0; 472 rq->hrtick_csd_pending = 0;
473 473
474 rq->hrtick_csd.flags = 0; 474 rq->hrtick_csd.flags = 0;
475 rq->hrtick_csd.func = __hrtick_start; 475 rq->hrtick_csd.func = __hrtick_start;
476 rq->hrtick_csd.info = rq; 476 rq->hrtick_csd.info = rq;
477 #endif 477 #endif
478 478
479 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 479 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
480 rq->hrtick_timer.function = hrtick; 480 rq->hrtick_timer.function = hrtick;
481 } 481 }
482 #else /* CONFIG_SCHED_HRTICK */ 482 #else /* CONFIG_SCHED_HRTICK */
483 static inline void hrtick_clear(struct rq *rq) 483 static inline void hrtick_clear(struct rq *rq)
484 { 484 {
485 } 485 }
486 486
487 static inline void init_rq_hrtick(struct rq *rq) 487 static inline void init_rq_hrtick(struct rq *rq)
488 { 488 {
489 } 489 }
490 490
491 static inline void init_hrtick(void) 491 static inline void init_hrtick(void)
492 { 492 {
493 } 493 }
494 #endif /* CONFIG_SCHED_HRTICK */ 494 #endif /* CONFIG_SCHED_HRTICK */
495 495
496 /* 496 /*
497 * resched_task - mark a task 'to be rescheduled now'. 497 * resched_task - mark a task 'to be rescheduled now'.
498 * 498 *
499 * On UP this means the setting of the need_resched flag, on SMP it 499 * On UP this means the setting of the need_resched flag, on SMP it
500 * might also involve a cross-CPU call to trigger the scheduler on 500 * might also involve a cross-CPU call to trigger the scheduler on
501 * the target CPU. 501 * the target CPU.
502 */ 502 */
503 #ifdef CONFIG_SMP 503 #ifdef CONFIG_SMP
504 504
505 #ifndef tsk_is_polling 505 #ifndef tsk_is_polling
506 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 506 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
507 #endif 507 #endif
508 508
509 void resched_task(struct task_struct *p) 509 void resched_task(struct task_struct *p)
510 { 510 {
511 int cpu; 511 int cpu;
512 512
513 assert_raw_spin_locked(&task_rq(p)->lock); 513 assert_raw_spin_locked(&task_rq(p)->lock);
514 514
515 if (test_tsk_need_resched(p)) 515 if (test_tsk_need_resched(p))
516 return; 516 return;
517 517
518 set_tsk_need_resched(p); 518 set_tsk_need_resched(p);
519 519
520 cpu = task_cpu(p); 520 cpu = task_cpu(p);
521 if (cpu == smp_processor_id()) 521 if (cpu == smp_processor_id())
522 return; 522 return;
523 523
524 /* NEED_RESCHED must be visible before we test polling */ 524 /* NEED_RESCHED must be visible before we test polling */
525 smp_mb(); 525 smp_mb();
526 if (!tsk_is_polling(p)) 526 if (!tsk_is_polling(p))
527 smp_send_reschedule(cpu); 527 smp_send_reschedule(cpu);
528 } 528 }
529 529
530 void resched_cpu(int cpu) 530 void resched_cpu(int cpu)
531 { 531 {
532 struct rq *rq = cpu_rq(cpu); 532 struct rq *rq = cpu_rq(cpu);
533 unsigned long flags; 533 unsigned long flags;
534 534
535 if (!raw_spin_trylock_irqsave(&rq->lock, flags)) 535 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
536 return; 536 return;
537 resched_task(cpu_curr(cpu)); 537 resched_task(cpu_curr(cpu));
538 raw_spin_unlock_irqrestore(&rq->lock, flags); 538 raw_spin_unlock_irqrestore(&rq->lock, flags);
539 } 539 }
540 540
541 #ifdef CONFIG_NO_HZ 541 #ifdef CONFIG_NO_HZ
542 /* 542 /*
543 * In the semi idle case, use the nearest busy cpu for migrating timers 543 * In the semi idle case, use the nearest busy cpu for migrating timers
544 * from an idle cpu. This is good for power-savings. 544 * from an idle cpu. This is good for power-savings.
545 * 545 *
546 * We don't do similar optimization for completely idle system, as 546 * We don't do similar optimization for completely idle system, as
547 * selecting an idle cpu will add more delays to the timers than intended 547 * selecting an idle cpu will add more delays to the timers than intended
548 * (as that cpu's timer base may not be up to date wrt jiffies etc). 548 * (as that cpu's timer base may not be up to date wrt jiffies etc).
549 */ 549 */
550 int get_nohz_timer_target(void) 550 int get_nohz_timer_target(void)
551 { 551 {
552 int cpu = smp_processor_id(); 552 int cpu = smp_processor_id();
553 int i; 553 int i;
554 struct sched_domain *sd; 554 struct sched_domain *sd;
555 555
556 rcu_read_lock(); 556 rcu_read_lock();
557 for_each_domain(cpu, sd) { 557 for_each_domain(cpu, sd) {
558 for_each_cpu(i, sched_domain_span(sd)) { 558 for_each_cpu(i, sched_domain_span(sd)) {
559 if (!idle_cpu(i)) { 559 if (!idle_cpu(i)) {
560 cpu = i; 560 cpu = i;
561 goto unlock; 561 goto unlock;
562 } 562 }
563 } 563 }
564 } 564 }
565 unlock: 565 unlock:
566 rcu_read_unlock(); 566 rcu_read_unlock();
567 return cpu; 567 return cpu;
568 } 568 }
569 /* 569 /*
570 * When add_timer_on() enqueues a timer into the timer wheel of an 570 * When add_timer_on() enqueues a timer into the timer wheel of an
571 * idle CPU then this timer might expire before the next timer event 571 * idle CPU then this timer might expire before the next timer event
572 * which is scheduled to wake up that CPU. In case of a completely 572 * which is scheduled to wake up that CPU. In case of a completely
573 * idle system the next event might even be infinite time into the 573 * idle system the next event might even be infinite time into the
574 * future. wake_up_idle_cpu() ensures that the CPU is woken up and 574 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
575 * leaves the inner idle loop so the newly added timer is taken into 575 * leaves the inner idle loop so the newly added timer is taken into
576 * account when the CPU goes back to idle and evaluates the timer 576 * account when the CPU goes back to idle and evaluates the timer
577 * wheel for the next timer event. 577 * wheel for the next timer event.
578 */ 578 */
579 void wake_up_idle_cpu(int cpu) 579 void wake_up_idle_cpu(int cpu)
580 { 580 {
581 struct rq *rq = cpu_rq(cpu); 581 struct rq *rq = cpu_rq(cpu);
582 582
583 if (cpu == smp_processor_id()) 583 if (cpu == smp_processor_id())
584 return; 584 return;
585 585
586 /* 586 /*
587 * This is safe, as this function is called with the timer 587 * This is safe, as this function is called with the timer
588 * wheel base lock of (cpu) held. When the CPU is on the way 588 * wheel base lock of (cpu) held. When the CPU is on the way
589 * to idle and has not yet set rq->curr to idle then it will 589 * to idle and has not yet set rq->curr to idle then it will
590 * be serialized on the timer wheel base lock and take the new 590 * be serialized on the timer wheel base lock and take the new
591 * timer into account automatically. 591 * timer into account automatically.
592 */ 592 */
593 if (rq->curr != rq->idle) 593 if (rq->curr != rq->idle)
594 return; 594 return;
595 595
596 /* 596 /*
597 * We can set TIF_RESCHED on the idle task of the other CPU 597 * We can set TIF_RESCHED on the idle task of the other CPU
598 * lockless. The worst case is that the other CPU runs the 598 * lockless. The worst case is that the other CPU runs the
599 * idle task through an additional NOOP schedule() 599 * idle task through an additional NOOP schedule()
600 */ 600 */
601 set_tsk_need_resched(rq->idle); 601 set_tsk_need_resched(rq->idle);
602 602
603 /* NEED_RESCHED must be visible before we test polling */ 603 /* NEED_RESCHED must be visible before we test polling */
604 smp_mb(); 604 smp_mb();
605 if (!tsk_is_polling(rq->idle)) 605 if (!tsk_is_polling(rq->idle))
606 smp_send_reschedule(cpu); 606 smp_send_reschedule(cpu);
607 } 607 }
608 608
609 static inline bool got_nohz_idle_kick(void) 609 static inline bool got_nohz_idle_kick(void)
610 { 610 {
611 int cpu = smp_processor_id(); 611 int cpu = smp_processor_id();
612 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); 612 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
613 } 613 }
614 614
615 #else /* CONFIG_NO_HZ */ 615 #else /* CONFIG_NO_HZ */
616 616
617 static inline bool got_nohz_idle_kick(void) 617 static inline bool got_nohz_idle_kick(void)
618 { 618 {
619 return false; 619 return false;
620 } 620 }
621 621
622 #endif /* CONFIG_NO_HZ */ 622 #endif /* CONFIG_NO_HZ */
623 623
624 void sched_avg_update(struct rq *rq) 624 void sched_avg_update(struct rq *rq)
625 { 625 {
626 s64 period = sched_avg_period(); 626 s64 period = sched_avg_period();
627 627
628 while ((s64)(rq->clock - rq->age_stamp) > period) { 628 while ((s64)(rq->clock - rq->age_stamp) > period) {
629 /* 629 /*
630 * Inline assembly required to prevent the compiler 630 * Inline assembly required to prevent the compiler
631 * optimising this loop into a divmod call. 631 * optimising this loop into a divmod call.
632 * See __iter_div_u64_rem() for another example of this. 632 * See __iter_div_u64_rem() for another example of this.
633 */ 633 */
634 asm("" : "+rm" (rq->age_stamp)); 634 asm("" : "+rm" (rq->age_stamp));
635 rq->age_stamp += period; 635 rq->age_stamp += period;
636 rq->rt_avg /= 2; 636 rq->rt_avg /= 2;
637 } 637 }
638 } 638 }
639 639
640 #else /* !CONFIG_SMP */ 640 #else /* !CONFIG_SMP */
641 void resched_task(struct task_struct *p) 641 void resched_task(struct task_struct *p)
642 { 642 {
643 assert_raw_spin_locked(&task_rq(p)->lock); 643 assert_raw_spin_locked(&task_rq(p)->lock);
644 set_tsk_need_resched(p); 644 set_tsk_need_resched(p);
645 } 645 }
646 #endif /* CONFIG_SMP */ 646 #endif /* CONFIG_SMP */
647 647
648 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 648 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
649 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) 649 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
650 /* 650 /*
651 * Iterate task_group tree rooted at *from, calling @down when first entering a 651 * Iterate task_group tree rooted at *from, calling @down when first entering a
652 * node and @up when leaving it for the final time. 652 * node and @up when leaving it for the final time.
653 * 653 *
654 * Caller must hold rcu_lock or sufficient equivalent. 654 * Caller must hold rcu_lock or sufficient equivalent.
655 */ 655 */
656 int walk_tg_tree_from(struct task_group *from, 656 int walk_tg_tree_from(struct task_group *from,
657 tg_visitor down, tg_visitor up, void *data) 657 tg_visitor down, tg_visitor up, void *data)
658 { 658 {
659 struct task_group *parent, *child; 659 struct task_group *parent, *child;
660 int ret; 660 int ret;
661 661
662 parent = from; 662 parent = from;
663 663
664 down: 664 down:
665 ret = (*down)(parent, data); 665 ret = (*down)(parent, data);
666 if (ret) 666 if (ret)
667 goto out; 667 goto out;
668 list_for_each_entry_rcu(child, &parent->children, siblings) { 668 list_for_each_entry_rcu(child, &parent->children, siblings) {
669 parent = child; 669 parent = child;
670 goto down; 670 goto down;
671 671
672 up: 672 up:
673 continue; 673 continue;
674 } 674 }
675 ret = (*up)(parent, data); 675 ret = (*up)(parent, data);
676 if (ret || parent == from) 676 if (ret || parent == from)
677 goto out; 677 goto out;
678 678
679 child = parent; 679 child = parent;
680 parent = parent->parent; 680 parent = parent->parent;
681 if (parent) 681 if (parent)
682 goto up; 682 goto up;
683 out: 683 out:
684 return ret; 684 return ret;
685 } 685 }
686 686
687 int tg_nop(struct task_group *tg, void *data) 687 int tg_nop(struct task_group *tg, void *data)
688 { 688 {
689 return 0; 689 return 0;
690 } 690 }
691 #endif 691 #endif
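walk_tg_tree_from() above is a non-recursive depth-first walk: @down runs when a group is first reached, @up when it is left for the last time, and a non-zero return from either visitor aborts the walk; tg_nop() is the stock do-nothing visitor. A sketch of a caller under those rules (count_group()/count_groups_below() are illustrative and assume the scheduler-internal declarations from "sched.h"):

#include <linux/rcupdate.h>
#include "sched.h"	/* struct task_group, walk_tg_tree_from(), tg_nop() */

/* Down-visitor: count each group once, on the way down. */
static int count_group(struct task_group *tg, void *data)
{
	(*(int *)data)++;
	return 0;	/* returning non-zero would abort the walk */
}

/* Count @from and every group below it. */
static int count_groups_below(struct task_group *from)
{
	int count = 0;

	rcu_read_lock();	/* the walk requires rcu_lock or equivalent */
	walk_tg_tree_from(from, count_group, tg_nop, &count);
	rcu_read_unlock();

	return count;
}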
692 692
693 void update_cpu_load(struct rq *this_rq); 693 void update_cpu_load(struct rq *this_rq);
694 694
695 static void set_load_weight(struct task_struct *p) 695 static void set_load_weight(struct task_struct *p)
696 { 696 {
697 int prio = p->static_prio - MAX_RT_PRIO; 697 int prio = p->static_prio - MAX_RT_PRIO;
698 struct load_weight *load = &p->se.load; 698 struct load_weight *load = &p->se.load;
699 699
700 /* 700 /*
701 * SCHED_IDLE tasks get minimal weight: 701 * SCHED_IDLE tasks get minimal weight:
702 */ 702 */
703 if (p->policy == SCHED_IDLE) { 703 if (p->policy == SCHED_IDLE) {
704 load->weight = scale_load(WEIGHT_IDLEPRIO); 704 load->weight = scale_load(WEIGHT_IDLEPRIO);
705 load->inv_weight = WMULT_IDLEPRIO; 705 load->inv_weight = WMULT_IDLEPRIO;
706 return; 706 return;
707 } 707 }
708 708
709 load->weight = scale_load(prio_to_weight[prio]); 709 load->weight = scale_load(prio_to_weight[prio]);
710 load->inv_weight = prio_to_wmult[prio]; 710 load->inv_weight = prio_to_wmult[prio];
711 } 711 }
712 712
713 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 713 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
714 { 714 {
715 update_rq_clock(rq); 715 update_rq_clock(rq);
716 sched_info_queued(p); 716 sched_info_queued(p);
717 p->sched_class->enqueue_task(rq, p, flags); 717 p->sched_class->enqueue_task(rq, p, flags);
718 } 718 }
719 719
720 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 720 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
721 { 721 {
722 update_rq_clock(rq); 722 update_rq_clock(rq);
723 sched_info_dequeued(p); 723 sched_info_dequeued(p);
724 p->sched_class->dequeue_task(rq, p, flags); 724 p->sched_class->dequeue_task(rq, p, flags);
725 } 725 }
726 726
727 void activate_task(struct rq *rq, struct task_struct *p, int flags) 727 void activate_task(struct rq *rq, struct task_struct *p, int flags)
728 { 728 {
729 if (task_contributes_to_load(p)) 729 if (task_contributes_to_load(p))
730 rq->nr_uninterruptible--; 730 rq->nr_uninterruptible--;
731 731
732 enqueue_task(rq, p, flags); 732 enqueue_task(rq, p, flags);
733 } 733 }
734 734
735 void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 735 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
736 { 736 {
737 if (task_contributes_to_load(p)) 737 if (task_contributes_to_load(p))
738 rq->nr_uninterruptible++; 738 rq->nr_uninterruptible++;
739 739
740 dequeue_task(rq, p, flags); 740 dequeue_task(rq, p, flags);
741 } 741 }
742 742
743 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 743 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
744 744
745 /* 745 /*
746 * There are no locks covering percpu hardirq/softirq time. 746 * There are no locks covering percpu hardirq/softirq time.
747 * They are only modified in account_system_vtime, on corresponding CPU 747 * They are only modified in account_system_vtime, on corresponding CPU
748 * with interrupts disabled. So, writes are safe. 748 * with interrupts disabled. So, writes are safe.
749 * They are read and saved off onto struct rq in update_rq_clock(). 749 * They are read and saved off onto struct rq in update_rq_clock().
750 * This may result in another CPU reading this CPU's irq time and can 750 * This may result in another CPU reading this CPU's irq time and can
751 * race with irq/account_system_vtime on this CPU. We would either get old 751 * race with irq/account_system_vtime on this CPU. We would either get old
752 * or new value with a side effect of accounting a slice of irq time to wrong 752 * or new value with a side effect of accounting a slice of irq time to wrong
753 * task when irq is in progress while we read rq->clock. That is a worthy 753 * task when irq is in progress while we read rq->clock. That is a worthy
754 * compromise in place of having locks on each irq in account_system_time. 754 * compromise in place of having locks on each irq in account_system_time.
755 */ 755 */
756 static DEFINE_PER_CPU(u64, cpu_hardirq_time); 756 static DEFINE_PER_CPU(u64, cpu_hardirq_time);
757 static DEFINE_PER_CPU(u64, cpu_softirq_time); 757 static DEFINE_PER_CPU(u64, cpu_softirq_time);
758 758
759 static DEFINE_PER_CPU(u64, irq_start_time); 759 static DEFINE_PER_CPU(u64, irq_start_time);
760 static int sched_clock_irqtime; 760 static int sched_clock_irqtime;
761 761
762 void enable_sched_clock_irqtime(void) 762 void enable_sched_clock_irqtime(void)
763 { 763 {
764 sched_clock_irqtime = 1; 764 sched_clock_irqtime = 1;
765 } 765 }
766 766
767 void disable_sched_clock_irqtime(void) 767 void disable_sched_clock_irqtime(void)
768 { 768 {
769 sched_clock_irqtime = 0; 769 sched_clock_irqtime = 0;
770 } 770 }
771 771
772 #ifndef CONFIG_64BIT 772 #ifndef CONFIG_64BIT
773 static DEFINE_PER_CPU(seqcount_t, irq_time_seq); 773 static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
774 774
775 static inline void irq_time_write_begin(void) 775 static inline void irq_time_write_begin(void)
776 { 776 {
777 __this_cpu_inc(irq_time_seq.sequence); 777 __this_cpu_inc(irq_time_seq.sequence);
778 smp_wmb(); 778 smp_wmb();
779 } 779 }
780 780
781 static inline void irq_time_write_end(void) 781 static inline void irq_time_write_end(void)
782 { 782 {
783 smp_wmb(); 783 smp_wmb();
784 __this_cpu_inc(irq_time_seq.sequence); 784 __this_cpu_inc(irq_time_seq.sequence);
785 } 785 }
786 786
787 static inline u64 irq_time_read(int cpu) 787 static inline u64 irq_time_read(int cpu)
788 { 788 {
789 u64 irq_time; 789 u64 irq_time;
790 unsigned seq; 790 unsigned seq;
791 791
792 do { 792 do {
793 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); 793 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
794 irq_time = per_cpu(cpu_softirq_time, cpu) + 794 irq_time = per_cpu(cpu_softirq_time, cpu) +
795 per_cpu(cpu_hardirq_time, cpu); 795 per_cpu(cpu_hardirq_time, cpu);
796 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); 796 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
797 797
798 return irq_time; 798 return irq_time;
799 } 799 }
800 #else /* CONFIG_64BIT */ 800 #else /* CONFIG_64BIT */
801 static inline void irq_time_write_begin(void) 801 static inline void irq_time_write_begin(void)
802 { 802 {
803 } 803 }
804 804
805 static inline void irq_time_write_end(void) 805 static inline void irq_time_write_end(void)
806 { 806 {
807 } 807 }
808 808
809 static inline u64 irq_time_read(int cpu) 809 static inline u64 irq_time_read(int cpu)
810 { 810 {
811 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); 811 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
812 } 812 }
813 #endif /* CONFIG_64BIT */ 813 #endif /* CONFIG_64BIT */
814 814
815 /* 815 /*
816 * Called before incrementing preempt_count on {soft,}irq_enter 816 * Called before incrementing preempt_count on {soft,}irq_enter
817 * and before decrementing preempt_count on {soft,}irq_exit. 817 * and before decrementing preempt_count on {soft,}irq_exit.
818 */ 818 */
819 void account_system_vtime(struct task_struct *curr) 819 void account_system_vtime(struct task_struct *curr)
820 { 820 {
821 unsigned long flags; 821 unsigned long flags;
822 s64 delta; 822 s64 delta;
823 int cpu; 823 int cpu;
824 824
825 if (!sched_clock_irqtime) 825 if (!sched_clock_irqtime)
826 return; 826 return;
827 827
828 local_irq_save(flags); 828 local_irq_save(flags);
829 829
830 cpu = smp_processor_id(); 830 cpu = smp_processor_id();
831 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); 831 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
832 __this_cpu_add(irq_start_time, delta); 832 __this_cpu_add(irq_start_time, delta);
833 833
834 irq_time_write_begin(); 834 irq_time_write_begin();
835 /* 835 /*
836 * We do not account for softirq time from ksoftirqd here. 836 * We do not account for softirq time from ksoftirqd here.
837 * We want to continue accounting softirq time to ksoftirqd thread 837 * We want to continue accounting softirq time to ksoftirqd thread
838 * in that case, so as not to confuse scheduler with a special task 838 * in that case, so as not to confuse scheduler with a special task
839 * that do not consume any time, but still wants to run. 839 * that do not consume any time, but still wants to run.
840 */ 840 */
841 if (hardirq_count()) 841 if (hardirq_count())
842 __this_cpu_add(cpu_hardirq_time, delta); 842 __this_cpu_add(cpu_hardirq_time, delta);
843 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) 843 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
844 __this_cpu_add(cpu_softirq_time, delta); 844 __this_cpu_add(cpu_softirq_time, delta);
845 845
846 irq_time_write_end(); 846 irq_time_write_end();
847 local_irq_restore(flags); 847 local_irq_restore(flags);
848 } 848 }
849 EXPORT_SYMBOL_GPL(account_system_vtime); 849 EXPORT_SYMBOL_GPL(account_system_vtime);
850 850
851 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 851 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
852 852
853 #ifdef CONFIG_PARAVIRT 853 #ifdef CONFIG_PARAVIRT
854 static inline u64 steal_ticks(u64 steal) 854 static inline u64 steal_ticks(u64 steal)
855 { 855 {
856 if (unlikely(steal > NSEC_PER_SEC)) 856 if (unlikely(steal > NSEC_PER_SEC))
857 return div_u64(steal, TICK_NSEC); 857 return div_u64(steal, TICK_NSEC);
858 858
859 return __iter_div_u64_rem(steal, TICK_NSEC, &steal); 859 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
860 } 860 }
861 #endif 861 #endif
862 862
863 static void update_rq_clock_task(struct rq *rq, s64 delta) 863 static void update_rq_clock_task(struct rq *rq, s64 delta)
864 { 864 {
865 /* 865 /*
866 * In theory, the compiler should just see 0 here, and optimize out the call 866 * In theory, the compiler should just see 0 here, and optimize out the call
867 * to sched_rt_avg_update. But I don't trust it... 867 * to sched_rt_avg_update. But I don't trust it...
868 */ 868 */
869 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) 869 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
870 s64 steal = 0, irq_delta = 0; 870 s64 steal = 0, irq_delta = 0;
871 #endif 871 #endif
872 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 872 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
873 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; 873 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
874 874
875 /* 875 /*
876 * Since irq_time is only updated on {soft,}irq_exit, we might run into 876 * Since irq_time is only updated on {soft,}irq_exit, we might run into
877 * this case when a previous update_rq_clock() happened inside a 877 * this case when a previous update_rq_clock() happened inside a
878 * {soft,}irq region. 878 * {soft,}irq region.
879 * 879 *
880 * When this happens, we stop ->clock_task and only update the 880 * When this happens, we stop ->clock_task and only update the
881 * prev_irq_time stamp to account for the part that fit, so that a next 881 * prev_irq_time stamp to account for the part that fit, so that a next
882 * update will consume the rest. This ensures ->clock_task is 882 * update will consume the rest. This ensures ->clock_task is
883 * monotonic. 883 * monotonic.
884 * 884 *
885 * It does however cause some slight mis-attribution of {soft,}irq 885 * It does however cause some slight mis-attribution of {soft,}irq
886 * time; a more accurate solution would be to update the irq_time using 886 * time; a more accurate solution would be to update the irq_time using
887 * the current rq->clock timestamp, except that would require using 887 * the current rq->clock timestamp, except that would require using
888 * atomic ops. 888 * atomic ops.
889 */ 889 */
890 if (irq_delta > delta) 890 if (irq_delta > delta)
891 irq_delta = delta; 891 irq_delta = delta;
892 892
893 rq->prev_irq_time += irq_delta; 893 rq->prev_irq_time += irq_delta;
894 delta -= irq_delta; 894 delta -= irq_delta;
895 #endif 895 #endif
896 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING 896 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
897 if (static_branch((&paravirt_steal_rq_enabled))) { 897 if (static_branch((&paravirt_steal_rq_enabled))) {
898 u64 st; 898 u64 st;
899 899
900 steal = paravirt_steal_clock(cpu_of(rq)); 900 steal = paravirt_steal_clock(cpu_of(rq));
901 steal -= rq->prev_steal_time_rq; 901 steal -= rq->prev_steal_time_rq;
902 902
903 if (unlikely(steal > delta)) 903 if (unlikely(steal > delta))
904 steal = delta; 904 steal = delta;
905 905
906 st = steal_ticks(steal); 906 st = steal_ticks(steal);
907 steal = st * TICK_NSEC; 907 steal = st * TICK_NSEC;
908 908
909 rq->prev_steal_time_rq += steal; 909 rq->prev_steal_time_rq += steal;
910 910
911 delta -= steal; 911 delta -= steal;
912 } 912 }
913 #endif 913 #endif
914 914
915 rq->clock_task += delta; 915 rq->clock_task += delta;
916 916
917 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) 917 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
918 if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) 918 if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
919 sched_rt_avg_update(rq, irq_delta + steal); 919 sched_rt_avg_update(rq, irq_delta + steal);
920 #endif 920 #endif
921 } 921 }
922 922
923 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 923 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
924 static int irqtime_account_hi_update(void) 924 static int irqtime_account_hi_update(void)
925 { 925 {
926 u64 *cpustat = kcpustat_this_cpu->cpustat; 926 u64 *cpustat = kcpustat_this_cpu->cpustat;
927 unsigned long flags; 927 unsigned long flags;
928 u64 latest_ns; 928 u64 latest_ns;
929 int ret = 0; 929 int ret = 0;
930 930
931 local_irq_save(flags); 931 local_irq_save(flags);
932 latest_ns = this_cpu_read(cpu_hardirq_time); 932 latest_ns = this_cpu_read(cpu_hardirq_time);
933 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) 933 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
934 ret = 1; 934 ret = 1;
935 local_irq_restore(flags); 935 local_irq_restore(flags);
936 return ret; 936 return ret;
937 } 937 }
938 938
939 static int irqtime_account_si_update(void) 939 static int irqtime_account_si_update(void)
940 { 940 {
941 u64 *cpustat = kcpustat_this_cpu->cpustat; 941 u64 *cpustat = kcpustat_this_cpu->cpustat;
942 unsigned long flags; 942 unsigned long flags;
943 u64 latest_ns; 943 u64 latest_ns;
944 int ret = 0; 944 int ret = 0;
945 945
946 local_irq_save(flags); 946 local_irq_save(flags);
947 latest_ns = this_cpu_read(cpu_softirq_time); 947 latest_ns = this_cpu_read(cpu_softirq_time);
948 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) 948 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
949 ret = 1; 949 ret = 1;
950 local_irq_restore(flags); 950 local_irq_restore(flags);
951 return ret; 951 return ret;
952 } 952 }
953 953
954 #else /* CONFIG_IRQ_TIME_ACCOUNTING */ 954 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
955 955
956 #define sched_clock_irqtime (0) 956 #define sched_clock_irqtime (0)
957 957
958 #endif 958 #endif
959 959
960 void sched_set_stop_task(int cpu, struct task_struct *stop) 960 void sched_set_stop_task(int cpu, struct task_struct *stop)
961 { 961 {
962 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 962 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
963 struct task_struct *old_stop = cpu_rq(cpu)->stop; 963 struct task_struct *old_stop = cpu_rq(cpu)->stop;
964 964
965 if (stop) { 965 if (stop) {
966 /* 966 /*
967 * Make it appear like a SCHED_FIFO task, it's something 967 * Make it appear like a SCHED_FIFO task, it's something
968 * userspace knows about and won't get confused about. 968 * userspace knows about and won't get confused about.
969 * 969 *
970 * Also, it will make PI more or less work without too 970 * Also, it will make PI more or less work without too
971 * much confusion -- but then, stop work should not 971 * much confusion -- but then, stop work should not
972 * rely on PI working anyway. 972 * rely on PI working anyway.
973 */ 973 */
974 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param); 974 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
975 975
976 stop->sched_class = &stop_sched_class; 976 stop->sched_class = &stop_sched_class;
977 } 977 }
978 978
979 cpu_rq(cpu)->stop = stop; 979 cpu_rq(cpu)->stop = stop;
980 980
981 if (old_stop) { 981 if (old_stop) {
982 /* 982 /*
983 * Reset it back to a normal scheduling class so that 983 * Reset it back to a normal scheduling class so that
984 * it can die in pieces. 984 * it can die in pieces.
985 */ 985 */
986 old_stop->sched_class = &rt_sched_class; 986 old_stop->sched_class = &rt_sched_class;
987 } 987 }
988 } 988 }
989 989
990 /* 990 /*
991 * __normal_prio - return the priority that is based on the static prio 991 * __normal_prio - return the priority that is based on the static prio
992 */ 992 */
993 static inline int __normal_prio(struct task_struct *p) 993 static inline int __normal_prio(struct task_struct *p)
994 { 994 {
995 return p->static_prio; 995 return p->static_prio;
996 } 996 }
997 997
998 /* 998 /*
999 * Calculate the expected normal priority: i.e. priority 999 * Calculate the expected normal priority: i.e. priority
1000 * without taking RT-inheritance into account. Might be 1000 * without taking RT-inheritance into account. Might be
1001 * boosted by interactivity modifiers. Changes upon fork, 1001 * boosted by interactivity modifiers. Changes upon fork,
1002 * setprio syscalls, and whenever the interactivity 1002 * setprio syscalls, and whenever the interactivity
1003 * estimator recalculates. 1003 * estimator recalculates.
1004 */ 1004 */
1005 static inline int normal_prio(struct task_struct *p) 1005 static inline int normal_prio(struct task_struct *p)
1006 { 1006 {
1007 int prio; 1007 int prio;
1008 1008
1009 if (task_has_rt_policy(p)) 1009 if (task_has_rt_policy(p))
1010 prio = MAX_RT_PRIO-1 - p->rt_priority; 1010 prio = MAX_RT_PRIO-1 - p->rt_priority;
1011 else 1011 else
1012 prio = __normal_prio(p); 1012 prio = __normal_prio(p);
1013 return prio; 1013 return prio;
1014 } 1014 }
1015 1015
1016 /* 1016 /*
1017 * Calculate the current priority, i.e. the priority 1017 * Calculate the current priority, i.e. the priority
1018 * taken into account by the scheduler. This value might 1018 * taken into account by the scheduler. This value might
1019 * be boosted by RT tasks, or might be boosted by 1019 * be boosted by RT tasks, or might be boosted by
1020 * interactivity modifiers. Will be RT if the task got 1020 * interactivity modifiers. Will be RT if the task got
1021 * RT-boosted. If not, it returns p->normal_prio. 1021 * RT-boosted. If not, it returns p->normal_prio.
1022 */ 1022 */
1023 static int effective_prio(struct task_struct *p) 1023 static int effective_prio(struct task_struct *p)
1024 { 1024 {
1025 p->normal_prio = normal_prio(p); 1025 p->normal_prio = normal_prio(p);
1026 /* 1026 /*
1027 * If we are an RT task or were boosted to RT priority, 1027 * If we are an RT task or were boosted to RT priority,
1028 * keep the priority unchanged. Otherwise, update priority 1028 * keep the priority unchanged. Otherwise, update priority
1029 * to the normal priority: 1029 * to the normal priority:
1030 */ 1030 */
1031 if (!rt_prio(p->prio)) 1031 if (!rt_prio(p->prio))
1032 return p->normal_prio; 1032 return p->normal_prio;
1033 return p->prio; 1033 return p->prio;
1034 } 1034 }
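For reference, the mapping implemented by __normal_prio()/normal_prio() can be sketched as a small standalone program. The MAX_RT_PRIO and nice-offset constants below are assumptions matching the conventional kernel values, not taken from this file.

/* Editor's sketch of the priority mapping above: RT tasks map to 0..MAX_RT_PRIO-1
 * (lower number = higher priority), everything else keeps its nice-derived
 * static_prio. Constants are assumed, not read from this source. */
#include <stdio.h>

#define MAX_RT_PRIO		100		/* assumed conventional value */
#define NICE_TO_PRIO(nice)	((nice) + 120)	/* assumed: MAX_RT_PRIO + nice + 20 */

static int sketch_normal_prio(int has_rt_policy, int rt_priority, int nice)
{
	if (has_rt_policy)
		return MAX_RT_PRIO - 1 - rt_priority;	/* e.g. rt_priority 50 -> 49 */
	return NICE_TO_PRIO(nice);			/* e.g. nice 0 -> 120 */
}

int main(void)
{
	printf("FIFO/50 -> %d, nice 0 -> %d\n",
	       sketch_normal_prio(1, 50, 0), sketch_normal_prio(0, 0, 0));
	return 0;
}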
1035 1035
1036 /** 1036 /**
1037 * task_curr - is this task currently executing on a CPU? 1037 * task_curr - is this task currently executing on a CPU?
1038 * @p: the task in question. 1038 * @p: the task in question.
1039 */ 1039 */
1040 inline int task_curr(const struct task_struct *p) 1040 inline int task_curr(const struct task_struct *p)
1041 { 1041 {
1042 return cpu_curr(task_cpu(p)) == p; 1042 return cpu_curr(task_cpu(p)) == p;
1043 } 1043 }
1044 1044
1045 static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1045 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1046 const struct sched_class *prev_class, 1046 const struct sched_class *prev_class,
1047 int oldprio) 1047 int oldprio)
1048 { 1048 {
1049 if (prev_class != p->sched_class) { 1049 if (prev_class != p->sched_class) {
1050 if (prev_class->switched_from) 1050 if (prev_class->switched_from)
1051 prev_class->switched_from(rq, p); 1051 prev_class->switched_from(rq, p);
1052 p->sched_class->switched_to(rq, p); 1052 p->sched_class->switched_to(rq, p);
1053 } else if (oldprio != p->prio) 1053 } else if (oldprio != p->prio)
1054 p->sched_class->prio_changed(rq, p, oldprio); 1054 p->sched_class->prio_changed(rq, p, oldprio);
1055 } 1055 }
1056 1056
1057 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 1057 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1058 { 1058 {
1059 const struct sched_class *class; 1059 const struct sched_class *class;
1060 1060
1061 if (p->sched_class == rq->curr->sched_class) { 1061 if (p->sched_class == rq->curr->sched_class) {
1062 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 1062 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
1063 } else { 1063 } else {
1064 for_each_class(class) { 1064 for_each_class(class) {
1065 if (class == rq->curr->sched_class) 1065 if (class == rq->curr->sched_class)
1066 break; 1066 break;
1067 if (class == p->sched_class) { 1067 if (class == p->sched_class) {
1068 resched_task(rq->curr); 1068 resched_task(rq->curr);
1069 break; 1069 break;
1070 } 1070 }
1071 } 1071 }
1072 } 1072 }
1073 1073
1074 /* 1074 /*
1075 * A queue event has occurred, and we're going to schedule. In 1075 * A queue event has occurred, and we're going to schedule. In
1076 * this case, we can save a useless back-to-back clock update. 1076 * this case, we can save a useless back-to-back clock update.
1077 */ 1077 */
1078 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) 1078 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
1079 rq->skip_clock_update = 1; 1079 rq->skip_clock_update = 1;
1080 } 1080 }
1081 1081
1082 #ifdef CONFIG_SMP 1082 #ifdef CONFIG_SMP
1083 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1083 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1084 { 1084 {
1085 #ifdef CONFIG_SCHED_DEBUG 1085 #ifdef CONFIG_SCHED_DEBUG
1086 /* 1086 /*
1087 * We should never call set_task_cpu() on a blocked task, 1087 * We should never call set_task_cpu() on a blocked task,
1088 * ttwu() will sort out the placement. 1088 * ttwu() will sort out the placement.
1089 */ 1089 */
1090 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 1090 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
1091 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 1091 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
1092 1092
1093 #ifdef CONFIG_LOCKDEP 1093 #ifdef CONFIG_LOCKDEP
1094 /* 1094 /*
1095 * The caller should hold either p->pi_lock or rq->lock, when changing 1095 * The caller should hold either p->pi_lock or rq->lock, when changing
1096 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. 1096 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
1097 * 1097 *
1098 * sched_move_task() holds both and thus holding either pins the cgroup, 1098 * sched_move_task() holds both and thus holding either pins the cgroup,
1099 * see set_task_rq(). 1099 * see set_task_rq().
1100 * 1100 *
1101 * Furthermore, all task_rq users should acquire both locks, see 1101 * Furthermore, all task_rq users should acquire both locks, see
1102 * task_rq_lock(). 1102 * task_rq_lock().
1103 */ 1103 */
1104 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || 1104 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
1105 lockdep_is_held(&task_rq(p)->lock))); 1105 lockdep_is_held(&task_rq(p)->lock)));
1106 #endif 1106 #endif
1107 #endif 1107 #endif
1108 1108
1109 trace_sched_migrate_task(p, new_cpu); 1109 trace_sched_migrate_task(p, new_cpu);
1110 1110
1111 if (task_cpu(p) != new_cpu) { 1111 if (task_cpu(p) != new_cpu) {
1112 p->se.nr_migrations++; 1112 p->se.nr_migrations++;
1113 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); 1113 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
1114 } 1114 }
1115 1115
1116 __set_task_cpu(p, new_cpu); 1116 __set_task_cpu(p, new_cpu);
1117 } 1117 }
1118 1118
1119 struct migration_arg { 1119 struct migration_arg {
1120 struct task_struct *task; 1120 struct task_struct *task;
1121 int dest_cpu; 1121 int dest_cpu;
1122 }; 1122 };
1123 1123
1124 static int migration_cpu_stop(void *data); 1124 static int migration_cpu_stop(void *data);
1125 1125
1126 /* 1126 /*
1127 * wait_task_inactive - wait for a thread to unschedule. 1127 * wait_task_inactive - wait for a thread to unschedule.
1128 * 1128 *
1129 * If @match_state is nonzero, it's the @p->state value just checked and 1129 * If @match_state is nonzero, it's the @p->state value just checked and
1130 * not expected to change. If it changes, i.e. @p might have woken up, 1130 * not expected to change. If it changes, i.e. @p might have woken up,
1131 * then return zero. When we succeed in waiting for @p to be off its CPU, 1131 * then return zero. When we succeed in waiting for @p to be off its CPU,
1132 * we return a positive number (its total switch count). If a second call 1132 * we return a positive number (its total switch count). If a second call
1133 * a short while later returns the same number, the caller can be sure that 1133 * a short while later returns the same number, the caller can be sure that
1134 * @p has remained unscheduled the whole time. 1134 * @p has remained unscheduled the whole time.
1135 * 1135 *
1136 * The caller must ensure that the task *will* unschedule sometime soon, 1136 * The caller must ensure that the task *will* unschedule sometime soon,
1137 * else this function might spin for a *long* time. This function can't 1137 * else this function might spin for a *long* time. This function can't
1138 * be called with interrupts off, or it may introduce deadlock with 1138 * be called with interrupts off, or it may introduce deadlock with
1139 * smp_call_function() if an IPI is sent by the same process we are 1139 * smp_call_function() if an IPI is sent by the same process we are
1140 * waiting to become inactive. 1140 * waiting to become inactive.
1141 */ 1141 */
1142 unsigned long wait_task_inactive(struct task_struct *p, long match_state) 1142 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1143 { 1143 {
1144 unsigned long flags; 1144 unsigned long flags;
1145 int running, on_rq; 1145 int running, on_rq;
1146 unsigned long ncsw; 1146 unsigned long ncsw;
1147 struct rq *rq; 1147 struct rq *rq;
1148 1148
1149 for (;;) { 1149 for (;;) {
1150 /* 1150 /*
1151 * We do the initial early heuristics without holding 1151 * We do the initial early heuristics without holding
1152 * any task-queue locks at all. We'll only try to get 1152 * any task-queue locks at all. We'll only try to get
1153 * the runqueue lock when things look like they will 1153 * the runqueue lock when things look like they will
1154 * work out! 1154 * work out!
1155 */ 1155 */
1156 rq = task_rq(p); 1156 rq = task_rq(p);
1157 1157
1158 /* 1158 /*
1159 * If the task is actively running on another CPU 1159 * If the task is actively running on another CPU
1160 * still, just relax and busy-wait without holding 1160 * still, just relax and busy-wait without holding
1161 * any locks. 1161 * any locks.
1162 * 1162 *
1163 * NOTE! Since we don't hold any locks, it's not 1163 * NOTE! Since we don't hold any locks, it's not
1164 * even certain that "rq" is still the right runqueue! 1164 * even certain that "rq" is still the right runqueue!
1165 * But we don't care, since "task_running()" will 1165 * But we don't care, since "task_running()" will
1166 * return false if the runqueue has changed and p 1166 * return false if the runqueue has changed and p
1167 * is actually now running somewhere else! 1167 * is actually now running somewhere else!
1168 */ 1168 */
1169 while (task_running(rq, p)) { 1169 while (task_running(rq, p)) {
1170 if (match_state && unlikely(p->state != match_state)) 1170 if (match_state && unlikely(p->state != match_state))
1171 return 0; 1171 return 0;
1172 cpu_relax(); 1172 cpu_relax();
1173 } 1173 }
1174 1174
1175 /* 1175 /*
1176 * Ok, time to look more closely! We need the rq 1176 * Ok, time to look more closely! We need the rq
1177 * lock now, to be *sure*. If we're wrong, we'll 1177 * lock now, to be *sure*. If we're wrong, we'll
1178 * just go back and repeat. 1178 * just go back and repeat.
1179 */ 1179 */
1180 rq = task_rq_lock(p, &flags); 1180 rq = task_rq_lock(p, &flags);
1181 trace_sched_wait_task(p); 1181 trace_sched_wait_task(p);
1182 running = task_running(rq, p); 1182 running = task_running(rq, p);
1183 on_rq = p->on_rq; 1183 on_rq = p->on_rq;
1184 ncsw = 0; 1184 ncsw = 0;
1185 if (!match_state || p->state == match_state) 1185 if (!match_state || p->state == match_state)
1186 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 1186 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
1187 task_rq_unlock(rq, p, &flags); 1187 task_rq_unlock(rq, p, &flags);
1188 1188
1189 /* 1189 /*
1190 * If it changed from the expected state, bail out now. 1190 * If it changed from the expected state, bail out now.
1191 */ 1191 */
1192 if (unlikely(!ncsw)) 1192 if (unlikely(!ncsw))
1193 break; 1193 break;
1194 1194
1195 /* 1195 /*
1196 * Was it really running after all now that we 1196 * Was it really running after all now that we
1197 * checked with the proper locks actually held? 1197 * checked with the proper locks actually held?
1198 * 1198 *
1199 * Oops. Go back and try again.. 1199 * Oops. Go back and try again..
1200 */ 1200 */
1201 if (unlikely(running)) { 1201 if (unlikely(running)) {
1202 cpu_relax(); 1202 cpu_relax();
1203 continue; 1203 continue;
1204 } 1204 }
1205 1205
1206 /* 1206 /*
1207 * It's not enough that it's not actively running, 1207 * It's not enough that it's not actively running,
1208 * it must be off the runqueue _entirely_, and not 1208 * it must be off the runqueue _entirely_, and not
1209 * preempted! 1209 * preempted!
1210 * 1210 *
1211 * So if it was still runnable (but just not actively 1211 * So if it was still runnable (but just not actively
1212 * running right now), it's preempted, and we should 1212 * running right now), it's preempted, and we should
1213 * yield - it could be a while. 1213 * yield - it could be a while.
1214 */ 1214 */
1215 if (unlikely(on_rq)) { 1215 if (unlikely(on_rq)) {
1216 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); 1216 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1217 1217
1218 set_current_state(TASK_UNINTERRUPTIBLE); 1218 set_current_state(TASK_UNINTERRUPTIBLE);
1219 schedule_hrtimeout(&to, HRTIMER_MODE_REL); 1219 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1220 continue; 1220 continue;
1221 } 1221 }
1222 1222
1223 /* 1223 /*
1224 * Ahh, all good. It wasn't running, and it wasn't 1224 * Ahh, all good. It wasn't running, and it wasn't
1225 * runnable, which means that it will never become 1225 * runnable, which means that it will never become
1226 * running in the future either. We're all done! 1226 * running in the future either. We're all done!
1227 */ 1227 */
1228 break; 1228 break;
1229 } 1229 }
1230 1230
1231 return ncsw; 1231 return ncsw;
1232 } 1232 }
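The `p->nvcsw | LONG_MIN` line above guarantees a nonzero return even for a task that has never switched, so callers can reserve 0 for "state changed". A minimal standalone sketch of that convention and of the two-sample caller pattern the comment describes (names here are illustrative, not kernel API):

#include <stdio.h>
#include <limits.h>

/* Editor's sketch: OR-ing in LONG_MIN sets the MSB, so even a switch count of 0
 * is distinguishable from the "state changed, bail out" return value of 0. */
static unsigned long encode_ncsw(unsigned long nvcsw)
{
	return nvcsw | LONG_MIN;
}

int main(void)
{
	unsigned long first = encode_ncsw(0);	/* first wait_task_inactive()-style sample */
	unsigned long again = encode_ncsw(0);	/* second sample a short while later */

	if (!first)
		printf("state changed, task may have woken up\n");
	else if (first == again)
		printf("task stayed unscheduled between the two samples\n");
	return 0;
}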
1233 1233
1234 /*** 1234 /***
1235 * kick_process - kick a running thread to enter/exit the kernel 1235 * kick_process - kick a running thread to enter/exit the kernel
1236 * @p: the to-be-kicked thread 1236 * @p: the to-be-kicked thread
1237 * 1237 *
1238 * Cause a process which is running on another CPU to enter 1238 * Cause a process which is running on another CPU to enter
1239 * kernel-mode, without any delay. (to get signals handled.) 1239 * kernel-mode, without any delay. (to get signals handled.)
1240 * 1240 *
1241 * NOTE: this function doesn't have to take the runqueue lock, 1241 * NOTE: this function doesn't have to take the runqueue lock,
1242 * because all it wants to ensure is that the remote task enters 1242 * because all it wants to ensure is that the remote task enters
1243 * the kernel. If the IPI races and the task has been migrated 1243 * the kernel. If the IPI races and the task has been migrated
1244 * to another CPU then no harm is done and the purpose has been 1244 * to another CPU then no harm is done and the purpose has been
1245 * achieved as well. 1245 * achieved as well.
1246 */ 1246 */
1247 void kick_process(struct task_struct *p) 1247 void kick_process(struct task_struct *p)
1248 { 1248 {
1249 int cpu; 1249 int cpu;
1250 1250
1251 preempt_disable(); 1251 preempt_disable();
1252 cpu = task_cpu(p); 1252 cpu = task_cpu(p);
1253 if ((cpu != smp_processor_id()) && task_curr(p)) 1253 if ((cpu != smp_processor_id()) && task_curr(p))
1254 smp_send_reschedule(cpu); 1254 smp_send_reschedule(cpu);
1255 preempt_enable(); 1255 preempt_enable();
1256 } 1256 }
1257 EXPORT_SYMBOL_GPL(kick_process); 1257 EXPORT_SYMBOL_GPL(kick_process);
1258 #endif /* CONFIG_SMP */ 1258 #endif /* CONFIG_SMP */
1259 1259
1260 #ifdef CONFIG_SMP 1260 #ifdef CONFIG_SMP
1261 /* 1261 /*
1262 * ->cpus_allowed is protected by both rq->lock and p->pi_lock 1262 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
1263 */ 1263 */
1264 static int select_fallback_rq(int cpu, struct task_struct *p) 1264 static int select_fallback_rq(int cpu, struct task_struct *p)
1265 { 1265 {
1266 int dest_cpu; 1266 int dest_cpu;
1267 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); 1267 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
1268 1268
1269 /* Look for allowed, online CPU in same node. */ 1269 /* Look for allowed, online CPU in same node. */
1270 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) 1270 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
1271 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 1271 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1272 return dest_cpu; 1272 return dest_cpu;
1273 1273
1274 /* Any allowed, online CPU? */ 1274 /* Any allowed, online CPU? */
1275 dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); 1275 dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
1276 if (dest_cpu < nr_cpu_ids) 1276 if (dest_cpu < nr_cpu_ids)
1277 return dest_cpu; 1277 return dest_cpu;
1278 1278
1279 /* No more Mr. Nice Guy. */ 1279 /* No more Mr. Nice Guy. */
1280 dest_cpu = cpuset_cpus_allowed_fallback(p); 1280 dest_cpu = cpuset_cpus_allowed_fallback(p);
1281 /* 1281 /*
1282 * Don't tell them about moving exiting tasks or 1282 * Don't tell them about moving exiting tasks or
1283 * kernel threads (both mm NULL), since they never 1283 * kernel threads (both mm NULL), since they never
1284 * leave the kernel. 1284 * leave the kernel.
1285 */ 1285 */
1286 if (p->mm && printk_ratelimit()) { 1286 if (p->mm && printk_ratelimit()) {
1287 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", 1287 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
1288 task_pid_nr(p), p->comm, cpu); 1288 task_pid_nr(p), p->comm, cpu);
1289 } 1289 }
1290 1290
1291 return dest_cpu; 1291 return dest_cpu;
1292 } 1292 }
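A compact way to see the fallback order above is on plain bitmasks: prefer an allowed CPU in the victim's node, then any allowed active CPU, then take whatever is active (the last tier standing in for cpuset_cpus_allowed_fallback()). A hedged sketch, not the kernel cpumask API:

#include <stdio.h>
#include <stdint.h>

static int first_cpu(uint64_t mask)
{
	return mask ? __builtin_ctzll(mask) : -1;	/* lowest set bit, or -1 */
}

static int pick_fallback_cpu(uint64_t allowed, uint64_t active, uint64_t same_node)
{
	int cpu;

	cpu = first_cpu(allowed & active & same_node);	/* allowed, online, same node */
	if (cpu >= 0)
		return cpu;
	cpu = first_cpu(allowed & active);		/* any allowed, online CPU */
	if (cpu >= 0)
		return cpu;
	return first_cpu(active);			/* no more Mr. Nice Guy */
}

int main(void)
{
	/* allowed = {2,3}, active = {0,1,3}, node of the dead CPU = {0,1} -> CPU 3 */
	printf("fallback cpu: %d\n", pick_fallback_cpu(0xc, 0xb, 0x3));
	return 0;
}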
1293 1293
1294 /* 1294 /*
1295 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 1295 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
1296 */ 1296 */
1297 static inline 1297 static inline
1298 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 1298 int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
1299 { 1299 {
1300 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 1300 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
1301 1301
1302 /* 1302 /*
1303 * In order not to call set_task_cpu() on a blocking task we need 1303 * In order not to call set_task_cpu() on a blocking task we need
1304 * to rely on ttwu() to place the task on a valid ->cpus_allowed 1304 * to rely on ttwu() to place the task on a valid ->cpus_allowed
1305 * cpu. 1305 * cpu.
1306 * 1306 *
1307 * Since this is common to all placement strategies, this lives here. 1307 * Since this is common to all placement strategies, this lives here.
1308 * 1308 *
1309 * [ this allows ->select_task_rq() to simply return task_cpu(p) and 1309 * [ this allows ->select_task_rq() to simply return task_cpu(p) and
1310 * not worry about this generic constraint ] 1310 * not worry about this generic constraint ]
1311 */ 1311 */
1312 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || 1312 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
1313 !cpu_online(cpu))) 1313 !cpu_online(cpu)))
1314 cpu = select_fallback_rq(task_cpu(p), p); 1314 cpu = select_fallback_rq(task_cpu(p), p);
1315 1315
1316 return cpu; 1316 return cpu;
1317 } 1317 }
1318 1318
1319 static void update_avg(u64 *avg, u64 sample) 1319 static void update_avg(u64 *avg, u64 sample)
1320 { 1320 {
1321 s64 diff = sample - *avg; 1321 s64 diff = sample - *avg;
1322 *avg += diff >> 3; 1322 *avg += diff >> 3;
1323 } 1323 }
1324 #endif 1324 #endif
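update_avg() above is an exponential moving average with weight 1/8 (the `diff >> 3`); the same arithmetic as a standalone sketch:

#include <stdio.h>
#include <stdint.h>

/* Editor's sketch: avg += (sample - avg) / 8, matching update_avg() above. */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = sample - *avg;

	*avg += diff >> 3;
}

int main(void)
{
	uint64_t avg = 0;
	uint64_t samples[] = { 800, 800, 800, 800, 100 };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		update_avg(&avg, samples[i]);
		printf("sample=%llu avg=%llu\n",
		       (unsigned long long)samples[i], (unsigned long long)avg);
	}
	return 0;
}

This is the same smoothing ttwu_do_wakeup() applies to rq->avg_idle further down, clamped there to 2*sysctl_sched_migration_cost.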
1325 1325
1326 static void 1326 static void
1327 ttwu_stat(struct task_struct *p, int cpu, int wake_flags) 1327 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1328 { 1328 {
1329 #ifdef CONFIG_SCHEDSTATS 1329 #ifdef CONFIG_SCHEDSTATS
1330 struct rq *rq = this_rq(); 1330 struct rq *rq = this_rq();
1331 1331
1332 #ifdef CONFIG_SMP 1332 #ifdef CONFIG_SMP
1333 int this_cpu = smp_processor_id(); 1333 int this_cpu = smp_processor_id();
1334 1334
1335 if (cpu == this_cpu) { 1335 if (cpu == this_cpu) {
1336 schedstat_inc(rq, ttwu_local); 1336 schedstat_inc(rq, ttwu_local);
1337 schedstat_inc(p, se.statistics.nr_wakeups_local); 1337 schedstat_inc(p, se.statistics.nr_wakeups_local);
1338 } else { 1338 } else {
1339 struct sched_domain *sd; 1339 struct sched_domain *sd;
1340 1340
1341 schedstat_inc(p, se.statistics.nr_wakeups_remote); 1341 schedstat_inc(p, se.statistics.nr_wakeups_remote);
1342 rcu_read_lock(); 1342 rcu_read_lock();
1343 for_each_domain(this_cpu, sd) { 1343 for_each_domain(this_cpu, sd) {
1344 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 1344 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1345 schedstat_inc(sd, ttwu_wake_remote); 1345 schedstat_inc(sd, ttwu_wake_remote);
1346 break; 1346 break;
1347 } 1347 }
1348 } 1348 }
1349 rcu_read_unlock(); 1349 rcu_read_unlock();
1350 } 1350 }
1351 1351
1352 if (wake_flags & WF_MIGRATED) 1352 if (wake_flags & WF_MIGRATED)
1353 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 1353 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
1354 1354
1355 #endif /* CONFIG_SMP */ 1355 #endif /* CONFIG_SMP */
1356 1356
1357 schedstat_inc(rq, ttwu_count); 1357 schedstat_inc(rq, ttwu_count);
1358 schedstat_inc(p, se.statistics.nr_wakeups); 1358 schedstat_inc(p, se.statistics.nr_wakeups);
1359 1359
1360 if (wake_flags & WF_SYNC) 1360 if (wake_flags & WF_SYNC)
1361 schedstat_inc(p, se.statistics.nr_wakeups_sync); 1361 schedstat_inc(p, se.statistics.nr_wakeups_sync);
1362 1362
1363 #endif /* CONFIG_SCHEDSTATS */ 1363 #endif /* CONFIG_SCHEDSTATS */
1364 } 1364 }
1365 1365
1366 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 1366 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1367 { 1367 {
1368 activate_task(rq, p, en_flags); 1368 activate_task(rq, p, en_flags);
1369 p->on_rq = 1; 1369 p->on_rq = 1;
1370 1370
1371 /* if a worker is waking up, notify workqueue */ 1371 /* if a worker is waking up, notify workqueue */
1372 if (p->flags & PF_WQ_WORKER) 1372 if (p->flags & PF_WQ_WORKER)
1373 wq_worker_waking_up(p, cpu_of(rq)); 1373 wq_worker_waking_up(p, cpu_of(rq));
1374 } 1374 }
1375 1375
1376 /* 1376 /*
1377 * Mark the task runnable and perform wakeup-preemption. 1377 * Mark the task runnable and perform wakeup-preemption.
1378 */ 1378 */
1379 static void 1379 static void
1380 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 1380 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1381 { 1381 {
1382 trace_sched_wakeup(p, true); 1382 trace_sched_wakeup(p, true);
1383 check_preempt_curr(rq, p, wake_flags); 1383 check_preempt_curr(rq, p, wake_flags);
1384 1384
1385 p->state = TASK_RUNNING; 1385 p->state = TASK_RUNNING;
1386 #ifdef CONFIG_SMP 1386 #ifdef CONFIG_SMP
1387 if (p->sched_class->task_woken) 1387 if (p->sched_class->task_woken)
1388 p->sched_class->task_woken(rq, p); 1388 p->sched_class->task_woken(rq, p);
1389 1389
1390 if (rq->idle_stamp) { 1390 if (rq->idle_stamp) {
1391 u64 delta = rq->clock - rq->idle_stamp; 1391 u64 delta = rq->clock - rq->idle_stamp;
1392 u64 max = 2*sysctl_sched_migration_cost; 1392 u64 max = 2*sysctl_sched_migration_cost;
1393 1393
1394 if (delta > max) 1394 if (delta > max)
1395 rq->avg_idle = max; 1395 rq->avg_idle = max;
1396 else 1396 else
1397 update_avg(&rq->avg_idle, delta); 1397 update_avg(&rq->avg_idle, delta);
1398 rq->idle_stamp = 0; 1398 rq->idle_stamp = 0;
1399 } 1399 }
1400 #endif 1400 #endif
1401 } 1401 }
1402 1402
1403 static void 1403 static void
1404 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) 1404 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
1405 { 1405 {
1406 #ifdef CONFIG_SMP 1406 #ifdef CONFIG_SMP
1407 if (p->sched_contributes_to_load) 1407 if (p->sched_contributes_to_load)
1408 rq->nr_uninterruptible--; 1408 rq->nr_uninterruptible--;
1409 #endif 1409 #endif
1410 1410
1411 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); 1411 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
1412 ttwu_do_wakeup(rq, p, wake_flags); 1412 ttwu_do_wakeup(rq, p, wake_flags);
1413 } 1413 }
1414 1414
1415 /* 1415 /*
1416 * Called in case the task @p isn't fully descheduled from its runqueue; 1416 * Called in case the task @p isn't fully descheduled from its runqueue;
1417 * in this case we must do a remote wakeup. It's a 'light' wakeup though: 1417 * in this case we must do a remote wakeup. It's a 'light' wakeup though:
1418 * all we need to do is flip p->state to TASK_RUNNING, since 1418 * all we need to do is flip p->state to TASK_RUNNING, since
1419 * the task is still ->on_rq. 1419 * the task is still ->on_rq.
1420 */ 1420 */
1421 static int ttwu_remote(struct task_struct *p, int wake_flags) 1421 static int ttwu_remote(struct task_struct *p, int wake_flags)
1422 { 1422 {
1423 struct rq *rq; 1423 struct rq *rq;
1424 int ret = 0; 1424 int ret = 0;
1425 1425
1426 rq = __task_rq_lock(p); 1426 rq = __task_rq_lock(p);
1427 if (p->on_rq) { 1427 if (p->on_rq) {
1428 ttwu_do_wakeup(rq, p, wake_flags); 1428 ttwu_do_wakeup(rq, p, wake_flags);
1429 ret = 1; 1429 ret = 1;
1430 } 1430 }
1431 __task_rq_unlock(rq); 1431 __task_rq_unlock(rq);
1432 1432
1433 return ret; 1433 return ret;
1434 } 1434 }
1435 1435
1436 #ifdef CONFIG_SMP 1436 #ifdef CONFIG_SMP
1437 static void sched_ttwu_pending(void) 1437 static void sched_ttwu_pending(void)
1438 { 1438 {
1439 struct rq *rq = this_rq(); 1439 struct rq *rq = this_rq();
1440 struct llist_node *llist = llist_del_all(&rq->wake_list); 1440 struct llist_node *llist = llist_del_all(&rq->wake_list);
1441 struct task_struct *p; 1441 struct task_struct *p;
1442 1442
1443 raw_spin_lock(&rq->lock); 1443 raw_spin_lock(&rq->lock);
1444 1444
1445 while (llist) { 1445 while (llist) {
1446 p = llist_entry(llist, struct task_struct, wake_entry); 1446 p = llist_entry(llist, struct task_struct, wake_entry);
1447 llist = llist_next(llist); 1447 llist = llist_next(llist);
1448 ttwu_do_activate(rq, p, 0); 1448 ttwu_do_activate(rq, p, 0);
1449 } 1449 }
1450 1450
1451 raw_spin_unlock(&rq->lock); 1451 raw_spin_unlock(&rq->lock);
1452 } 1452 }
1453 1453
1454 void scheduler_ipi(void) 1454 void scheduler_ipi(void)
1455 { 1455 {
1456 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) 1456 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
1457 return; 1457 return;
1458 1458
1459 /* 1459 /*
1460 * Not all reschedule IPI handlers call irq_enter/irq_exit, since 1460 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
1461 * traditionally all their work was done from the interrupt return 1461 * traditionally all their work was done from the interrupt return
1462 * path. Now that we actually do some work, we need to make sure 1462 * path. Now that we actually do some work, we need to make sure
1463 * we do call them. 1463 * we do call them.
1464 * 1464 *
1465 * Some archs already do call them; luckily irq_enter/exit nest 1465 * Some archs already do call them; luckily irq_enter/exit nest
1466 * properly. 1466 * properly.
1467 * 1467 *
1468 * Arguably we should visit all archs and update all handlers, 1468 * Arguably we should visit all archs and update all handlers,
1469 * however, a fair share of IPIs are still resched-only, so this would 1469 * however, a fair share of IPIs are still resched-only, so this would
1470 * somewhat pessimize the simple resched case. 1470 * somewhat pessimize the simple resched case.
1471 */ 1471 */
1472 irq_enter(); 1472 irq_enter();
1473 sched_ttwu_pending(); 1473 sched_ttwu_pending();
1474 1474
1475 /* 1475 /*
1476 * Check if someone kicked us for doing the nohz idle load balance. 1476 * Check if someone kicked us for doing the nohz idle load balance.
1477 */ 1477 */
1478 if (unlikely(got_nohz_idle_kick() && !need_resched())) { 1478 if (unlikely(got_nohz_idle_kick() && !need_resched())) {
1479 this_rq()->idle_balance = 1; 1479 this_rq()->idle_balance = 1;
1480 raise_softirq_irqoff(SCHED_SOFTIRQ); 1480 raise_softirq_irqoff(SCHED_SOFTIRQ);
1481 } 1481 }
1482 irq_exit(); 1482 irq_exit();
1483 } 1483 }
1484 1484
1485 static void ttwu_queue_remote(struct task_struct *p, int cpu) 1485 static void ttwu_queue_remote(struct task_struct *p, int cpu)
1486 { 1486 {
1487 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) 1487 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
1488 smp_send_reschedule(cpu); 1488 smp_send_reschedule(cpu);
1489 } 1489 }
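ttwu_queue_remote() relies on llist_add() returning whether the wake_list was empty, so only the first wakeup of a burst pays for an IPI; the rest are drained by the same sched_ttwu_pending() run. A minimal C11 sketch of that "push and report empty" idea (illustrative names, not the kernel llist API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct wake_node {
	struct wake_node *next;
};

struct wake_list {
	_Atomic(struct wake_node *) first;
};

/* Lock-free push; returns true if the list was empty before this push. */
static bool wake_list_add(struct wake_node *node, struct wake_list *list)
{
	struct wake_node *old = atomic_load(&list->first);

	do {
		node->next = old;
	} while (!atomic_compare_exchange_weak(&list->first, &old, node));

	return old == NULL;
}

int main(void)
{
	struct wake_list list = { ATOMIC_VAR_INIT(NULL) };
	struct wake_node a, b;

	/* Only the push that saw an empty list would send the reschedule IPI. */
	printf("%d %d\n", wake_list_add(&a, &list), wake_list_add(&b, &list));
	return 0;
}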
1490 1490
1491 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 1491 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1492 static int ttwu_activate_remote(struct task_struct *p, int wake_flags) 1492 static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
1493 { 1493 {
1494 struct rq *rq; 1494 struct rq *rq;
1495 int ret = 0; 1495 int ret = 0;
1496 1496
1497 rq = __task_rq_lock(p); 1497 rq = __task_rq_lock(p);
1498 if (p->on_cpu) { 1498 if (p->on_cpu) {
1499 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 1499 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1500 ttwu_do_wakeup(rq, p, wake_flags); 1500 ttwu_do_wakeup(rq, p, wake_flags);
1501 ret = 1; 1501 ret = 1;
1502 } 1502 }
1503 __task_rq_unlock(rq); 1503 __task_rq_unlock(rq);
1504 1504
1505 return ret; 1505 return ret;
1506 1506
1507 } 1507 }
1508 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1508 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1509 1509
1510 bool cpus_share_cache(int this_cpu, int that_cpu) 1510 bool cpus_share_cache(int this_cpu, int that_cpu)
1511 { 1511 {
1512 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 1512 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1513 } 1513 }
1514 #endif /* CONFIG_SMP */ 1514 #endif /* CONFIG_SMP */
1515 1515
1516 static void ttwu_queue(struct task_struct *p, int cpu) 1516 static void ttwu_queue(struct task_struct *p, int cpu)
1517 { 1517 {
1518 struct rq *rq = cpu_rq(cpu); 1518 struct rq *rq = cpu_rq(cpu);
1519 1519
1520 #if defined(CONFIG_SMP) 1520 #if defined(CONFIG_SMP)
1521 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 1521 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1522 sched_clock_cpu(cpu); /* sync clocks x-cpu */ 1522 sched_clock_cpu(cpu); /* sync clocks x-cpu */
1523 ttwu_queue_remote(p, cpu); 1523 ttwu_queue_remote(p, cpu);
1524 return; 1524 return;
1525 } 1525 }
1526 #endif 1526 #endif
1527 1527
1528 raw_spin_lock(&rq->lock); 1528 raw_spin_lock(&rq->lock);
1529 ttwu_do_activate(rq, p, 0); 1529 ttwu_do_activate(rq, p, 0);
1530 raw_spin_unlock(&rq->lock); 1530 raw_spin_unlock(&rq->lock);
1531 } 1531 }
1532 1532
1533 /** 1533 /**
1534 * try_to_wake_up - wake up a thread 1534 * try_to_wake_up - wake up a thread
1535 * @p: the thread to be awakened 1535 * @p: the thread to be awakened
1536 * @state: the mask of task states that can be woken 1536 * @state: the mask of task states that can be woken
1537 * @wake_flags: wake modifier flags (WF_*) 1537 * @wake_flags: wake modifier flags (WF_*)
1538 * 1538 *
1539 * Put it on the run-queue if it's not already there. The "current" 1539 * Put it on the run-queue if it's not already there. The "current"
1540 * thread is always on the run-queue (except when the actual 1540 * thread is always on the run-queue (except when the actual
1541 * re-schedule is in progress), and as such you're allowed to do 1541 * re-schedule is in progress), and as such you're allowed to do
1542 * the simpler "current->state = TASK_RUNNING" to mark yourself 1542 * the simpler "current->state = TASK_RUNNING" to mark yourself
1543 * runnable without the overhead of this. 1543 * runnable without the overhead of this.
1544 * 1544 *
1545 * Returns %true if @p was woken up, %false if it was already running 1545 * Returns %true if @p was woken up, %false if it was already running
1546 * or @state didn't match @p's state. 1546 * or @state didn't match @p's state.
1547 */ 1547 */
1548 static int 1548 static int
1549 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) 1549 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1550 { 1550 {
1551 unsigned long flags; 1551 unsigned long flags;
1552 int cpu, success = 0; 1552 int cpu, success = 0;
1553 1553
1554 smp_wmb(); 1554 smp_wmb();
1555 raw_spin_lock_irqsave(&p->pi_lock, flags); 1555 raw_spin_lock_irqsave(&p->pi_lock, flags);
1556 if (!(p->state & state)) 1556 if (!(p->state & state))
1557 goto out; 1557 goto out;
1558 1558
1559 success = 1; /* we're going to change ->state */ 1559 success = 1; /* we're going to change ->state */
1560 cpu = task_cpu(p); 1560 cpu = task_cpu(p);
1561 1561
1562 if (p->on_rq && ttwu_remote(p, wake_flags)) 1562 if (p->on_rq && ttwu_remote(p, wake_flags))
1563 goto stat; 1563 goto stat;
1564 1564
1565 #ifdef CONFIG_SMP 1565 #ifdef CONFIG_SMP
1566 /* 1566 /*
1567 * If the owning (remote) cpu is still in the middle of schedule() with 1567 * If the owning (remote) cpu is still in the middle of schedule() with
1568 * this task as prev, wait until it's done referencing the task. 1568 * this task as prev, wait until it's done referencing the task.
1569 */ 1569 */
1570 while (p->on_cpu) { 1570 while (p->on_cpu) {
1571 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 1571 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1572 /* 1572 /*
1573 * In case the architecture enables interrupts in 1573 * In case the architecture enables interrupts in
1574 * context_switch(), we cannot busy wait, since that 1574 * context_switch(), we cannot busy wait, since that
1575 * would lead to deadlocks when an interrupt hits and 1575 * would lead to deadlocks when an interrupt hits and
1576 * tries to wake up @prev. So bail and do a complete 1576 * tries to wake up @prev. So bail and do a complete
1577 * remote wakeup. 1577 * remote wakeup.
1578 */ 1578 */
1579 if (ttwu_activate_remote(p, wake_flags)) 1579 if (ttwu_activate_remote(p, wake_flags))
1580 goto stat; 1580 goto stat;
1581 #else 1581 #else
1582 cpu_relax(); 1582 cpu_relax();
1583 #endif 1583 #endif
1584 } 1584 }
1585 /* 1585 /*
1586 * Pairs with the smp_wmb() in finish_lock_switch(). 1586 * Pairs with the smp_wmb() in finish_lock_switch().
1587 */ 1587 */
1588 smp_rmb(); 1588 smp_rmb();
1589 1589
1590 p->sched_contributes_to_load = !!task_contributes_to_load(p); 1590 p->sched_contributes_to_load = !!task_contributes_to_load(p);
1591 p->state = TASK_WAKING; 1591 p->state = TASK_WAKING;
1592 1592
1593 if (p->sched_class->task_waking) 1593 if (p->sched_class->task_waking)
1594 p->sched_class->task_waking(p); 1594 p->sched_class->task_waking(p);
1595 1595
1596 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 1596 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
1597 if (task_cpu(p) != cpu) { 1597 if (task_cpu(p) != cpu) {
1598 wake_flags |= WF_MIGRATED; 1598 wake_flags |= WF_MIGRATED;
1599 set_task_cpu(p, cpu); 1599 set_task_cpu(p, cpu);
1600 } 1600 }
1601 #endif /* CONFIG_SMP */ 1601 #endif /* CONFIG_SMP */
1602 1602
1603 ttwu_queue(p, cpu); 1603 ttwu_queue(p, cpu);
1604 stat: 1604 stat:
1605 ttwu_stat(p, cpu, wake_flags); 1605 ttwu_stat(p, cpu, wake_flags);
1606 out: 1606 out:
1607 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 1607 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1608 1608
1609 return success; 1609 return success;
1610 } 1610 }
1611 1611
1612 /** 1612 /**
1613 * try_to_wake_up_local - try to wake up a local task with rq lock held 1613 * try_to_wake_up_local - try to wake up a local task with rq lock held
1614 * @p: the thread to be awakened 1614 * @p: the thread to be awakened
1615 * 1615 *
1616 * Put @p on the run-queue if it's not already there. The caller must 1616 * Put @p on the run-queue if it's not already there. The caller must
1617 * ensure that this_rq() is locked, @p is bound to this_rq() and not 1617 * ensure that this_rq() is locked, @p is bound to this_rq() and not
1618 * the current task. 1618 * the current task.
1619 */ 1619 */
1620 static void try_to_wake_up_local(struct task_struct *p) 1620 static void try_to_wake_up_local(struct task_struct *p)
1621 { 1621 {
1622 struct rq *rq = task_rq(p); 1622 struct rq *rq = task_rq(p);
1623 1623
1624 BUG_ON(rq != this_rq()); 1624 BUG_ON(rq != this_rq());
1625 BUG_ON(p == current); 1625 BUG_ON(p == current);
1626 lockdep_assert_held(&rq->lock); 1626 lockdep_assert_held(&rq->lock);
1627 1627
1628 if (!raw_spin_trylock(&p->pi_lock)) { 1628 if (!raw_spin_trylock(&p->pi_lock)) {
1629 raw_spin_unlock(&rq->lock); 1629 raw_spin_unlock(&rq->lock);
1630 raw_spin_lock(&p->pi_lock); 1630 raw_spin_lock(&p->pi_lock);
1631 raw_spin_lock(&rq->lock); 1631 raw_spin_lock(&rq->lock);
1632 } 1632 }
1633 1633
1634 if (!(p->state & TASK_NORMAL)) 1634 if (!(p->state & TASK_NORMAL))
1635 goto out; 1635 goto out;
1636 1636
1637 if (!p->on_rq) 1637 if (!p->on_rq)
1638 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 1638 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1639 1639
1640 ttwu_do_wakeup(rq, p, 0); 1640 ttwu_do_wakeup(rq, p, 0);
1641 ttwu_stat(p, smp_processor_id(), 0); 1641 ttwu_stat(p, smp_processor_id(), 0);
1642 out: 1642 out:
1643 raw_spin_unlock(&p->pi_lock); 1643 raw_spin_unlock(&p->pi_lock);
1644 } 1644 }
1645 1645
1646 /** 1646 /**
1647 * wake_up_process - Wake up a specific process 1647 * wake_up_process - Wake up a specific process
1648 * @p: The process to be woken up. 1648 * @p: The process to be woken up.
1649 * 1649 *
1650 * Attempt to wake up the nominated process and move it to the set of runnable 1650 * Attempt to wake up the nominated process and move it to the set of runnable
1651 * processes. Returns 1 if the process was woken up, 0 if it was already 1651 * processes. Returns 1 if the process was woken up, 0 if it was already
1652 * running. 1652 * running.
1653 * 1653 *
1654 * It may be assumed that this function implies a write memory barrier before 1654 * It may be assumed that this function implies a write memory barrier before
1655 * changing the task state if and only if any tasks are woken up. 1655 * changing the task state if and only if any tasks are woken up.
1656 */ 1656 */
1657 int wake_up_process(struct task_struct *p) 1657 int wake_up_process(struct task_struct *p)
1658 { 1658 {
1659 return try_to_wake_up(p, TASK_ALL, 0); 1659 return try_to_wake_up(p, TASK_ALL, 0);
1660 } 1660 }
1661 EXPORT_SYMBOL(wake_up_process); 1661 EXPORT_SYMBOL(wake_up_process);
1662 1662
1663 int wake_up_state(struct task_struct *p, unsigned int state) 1663 int wake_up_state(struct task_struct *p, unsigned int state)
1664 { 1664 {
1665 return try_to_wake_up(p, state, 0); 1665 return try_to_wake_up(p, state, 0);
1666 } 1666 }
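Both helpers funnel into try_to_wake_up(), whose @state argument is a mask: the wakeup only proceeds when `p->state & state` is nonzero, which is why wake_up_process() passes TASK_ALL while wake_up_state() lets the caller narrow it. A small sketch of just that mask check; the bit values and the composition of the "all" mask are simplified assumptions.

#include <stdio.h>

#define TASK_INTERRUPTIBLE	0x0001	/* assumed illustrative values */
#define TASK_UNINTERRUPTIBLE	0x0002
#define TASK_ALL_SKETCH		(TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)

static int would_wake(unsigned int task_state, unsigned int wake_mask)
{
	return (task_state & wake_mask) != 0;	/* the !(p->state & state) check */
}

int main(void)
{
	/* wake_up_state(p, TASK_INTERRUPTIBLE) skips an uninterruptible sleeper... */
	printf("%d\n", would_wake(TASK_UNINTERRUPTIBLE, TASK_INTERRUPTIBLE));
	/* ...while wake_up_process(p), using the full mask, wakes it. */
	printf("%d\n", would_wake(TASK_UNINTERRUPTIBLE, TASK_ALL_SKETCH));
	return 0;
}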
1667 1667
1668 /* 1668 /*
1669 * Perform scheduler related setup for a newly forked process p. 1669 * Perform scheduler related setup for a newly forked process p.
1670 * p is forked by current. 1670 * p is forked by current.
1671 * 1671 *
1672 * __sched_fork() is basic setup used by init_idle() too: 1672 * __sched_fork() is basic setup used by init_idle() too:
1673 */ 1673 */
1674 static void __sched_fork(struct task_struct *p) 1674 static void __sched_fork(struct task_struct *p)
1675 { 1675 {
1676 p->on_rq = 0; 1676 p->on_rq = 0;
1677 1677
1678 p->se.on_rq = 0; 1678 p->se.on_rq = 0;
1679 p->se.exec_start = 0; 1679 p->se.exec_start = 0;
1680 p->se.sum_exec_runtime = 0; 1680 p->se.sum_exec_runtime = 0;
1681 p->se.prev_sum_exec_runtime = 0; 1681 p->se.prev_sum_exec_runtime = 0;
1682 p->se.nr_migrations = 0; 1682 p->se.nr_migrations = 0;
1683 p->se.vruntime = 0; 1683 p->se.vruntime = 0;
1684 INIT_LIST_HEAD(&p->se.group_node); 1684 INIT_LIST_HEAD(&p->se.group_node);
1685 1685
1686 #ifdef CONFIG_SCHEDSTATS 1686 #ifdef CONFIG_SCHEDSTATS
1687 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1687 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1688 #endif 1688 #endif
1689 1689
1690 INIT_LIST_HEAD(&p->rt.run_list); 1690 INIT_LIST_HEAD(&p->rt.run_list);
1691 1691
1692 #ifdef CONFIG_PREEMPT_NOTIFIERS 1692 #ifdef CONFIG_PREEMPT_NOTIFIERS
1693 INIT_HLIST_HEAD(&p->preempt_notifiers); 1693 INIT_HLIST_HEAD(&p->preempt_notifiers);
1694 #endif 1694 #endif
1695 } 1695 }
1696 1696
1697 /* 1697 /*
1698 * fork()/clone()-time setup: 1698 * fork()/clone()-time setup:
1699 */ 1699 */
1700 void sched_fork(struct task_struct *p) 1700 void sched_fork(struct task_struct *p)
1701 { 1701 {
1702 unsigned long flags; 1702 unsigned long flags;
1703 int cpu = get_cpu(); 1703 int cpu = get_cpu();
1704 1704
1705 __sched_fork(p); 1705 __sched_fork(p);
1706 /* 1706 /*
1707 * We mark the process as running here. This guarantees that 1707 * We mark the process as running here. This guarantees that
1708 * nobody will actually run it, and a signal or other external 1708 * nobody will actually run it, and a signal or other external
1709 * event cannot wake it up and insert it on the runqueue either. 1709 * event cannot wake it up and insert it on the runqueue either.
1710 */ 1710 */
1711 p->state = TASK_RUNNING; 1711 p->state = TASK_RUNNING;
1712 1712
1713 /* 1713 /*
1714 * Make sure we do not leak PI boosting priority to the child. 1714 * Make sure we do not leak PI boosting priority to the child.
1715 */ 1715 */
1716 p->prio = current->normal_prio; 1716 p->prio = current->normal_prio;
1717 1717
1718 /* 1718 /*
1719 * Revert to default priority/policy on fork if requested. 1719 * Revert to default priority/policy on fork if requested.
1720 */ 1720 */
1721 if (unlikely(p->sched_reset_on_fork)) { 1721 if (unlikely(p->sched_reset_on_fork)) {
1722 if (task_has_rt_policy(p)) { 1722 if (task_has_rt_policy(p)) {
1723 p->policy = SCHED_NORMAL; 1723 p->policy = SCHED_NORMAL;
1724 p->static_prio = NICE_TO_PRIO(0); 1724 p->static_prio = NICE_TO_PRIO(0);
1725 p->rt_priority = 0; 1725 p->rt_priority = 0;
1726 } else if (PRIO_TO_NICE(p->static_prio) < 0) 1726 } else if (PRIO_TO_NICE(p->static_prio) < 0)
1727 p->static_prio = NICE_TO_PRIO(0); 1727 p->static_prio = NICE_TO_PRIO(0);
1728 1728
1729 p->prio = p->normal_prio = __normal_prio(p); 1729 p->prio = p->normal_prio = __normal_prio(p);
1730 set_load_weight(p); 1730 set_load_weight(p);
1731 1731
1732 /* 1732 /*
1733 * We don't need the reset flag anymore after the fork. It has 1733 * We don't need the reset flag anymore after the fork. It has
1734 * fulfilled its duty: 1734 * fulfilled its duty:
1735 */ 1735 */
1736 p->sched_reset_on_fork = 0; 1736 p->sched_reset_on_fork = 0;
1737 } 1737 }
1738 1738
1739 if (!rt_prio(p->prio)) 1739 if (!rt_prio(p->prio))
1740 p->sched_class = &fair_sched_class; 1740 p->sched_class = &fair_sched_class;
1741 1741
1742 if (p->sched_class->task_fork) 1742 if (p->sched_class->task_fork)
1743 p->sched_class->task_fork(p); 1743 p->sched_class->task_fork(p);
1744 1744
1745 /* 1745 /*
1746 * The child is not yet in the pid-hash so no cgroup attach races, 1746 * The child is not yet in the pid-hash so no cgroup attach races,
1747 * and the cgroup is pinned to this child because cgroup_fork() 1747 * and the cgroup is pinned to this child because cgroup_fork()
1748 * is run before sched_fork(). 1748 * is run before sched_fork().
1749 * 1749 *
1750 * Silence PROVE_RCU. 1750 * Silence PROVE_RCU.
1751 */ 1751 */
1752 raw_spin_lock_irqsave(&p->pi_lock, flags); 1752 raw_spin_lock_irqsave(&p->pi_lock, flags);
1753 set_task_cpu(p, cpu); 1753 set_task_cpu(p, cpu);
1754 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 1754 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1755 1755
1756 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1756 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1757 if (likely(sched_info_on())) 1757 if (likely(sched_info_on()))
1758 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1758 memset(&p->sched_info, 0, sizeof(p->sched_info));
1759 #endif 1759 #endif
1760 #if defined(CONFIG_SMP) 1760 #if defined(CONFIG_SMP)
1761 p->on_cpu = 0; 1761 p->on_cpu = 0;
1762 #endif 1762 #endif
1763 #ifdef CONFIG_PREEMPT_COUNT 1763 #ifdef CONFIG_PREEMPT_COUNT
1764 /* Want to start with kernel preemption disabled. */ 1764 /* Want to start with kernel preemption disabled. */
1765 task_thread_info(p)->preempt_count = 1; 1765 task_thread_info(p)->preempt_count = 1;
1766 #endif 1766 #endif
1767 #ifdef CONFIG_SMP 1767 #ifdef CONFIG_SMP
1768 plist_node_init(&p->pushable_tasks, MAX_PRIO); 1768 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1769 #endif 1769 #endif
1770 1770
1771 put_cpu(); 1771 put_cpu();
1772 } 1772 }
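The sched_reset_on_fork branch above services the SCHED_RESET_ON_FORK flag that userspace can OR into the policy argument of sched_setscheduler(2). A minimal userspace sketch (needs CAP_SYS_NICE or a suitable RLIMIT_RTPRIO to succeed):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>

#ifndef SCHED_RESET_ON_FORK
#define SCHED_RESET_ON_FORK	0x40000000	/* kernel UAPI value */
#endif

int main(void)
{
	struct sched_param param = { .sched_priority = 10 };

	/* The parent runs SCHED_FIFO, but children revert to SCHED_NORMAL/nice 0. */
	if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &param))
		perror("sched_setscheduler");
	else if (fork() == 0)
		printf("child policy: %d\n", sched_getscheduler(0));
	return 0;
}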
1773 1773
1774 /* 1774 /*
1775 * wake_up_new_task - wake up a newly created task for the first time. 1775 * wake_up_new_task - wake up a newly created task for the first time.
1776 * 1776 *
1777 * This function will do some initial scheduler statistics housekeeping 1777 * This function will do some initial scheduler statistics housekeeping
1778 * that must be done for every newly created context, then puts the task 1778 * that must be done for every newly created context, then puts the task
1779 * on the runqueue and wakes it. 1779 * on the runqueue and wakes it.
1780 */ 1780 */
1781 void wake_up_new_task(struct task_struct *p) 1781 void wake_up_new_task(struct task_struct *p)
1782 { 1782 {
1783 unsigned long flags; 1783 unsigned long flags;
1784 struct rq *rq; 1784 struct rq *rq;
1785 1785
1786 raw_spin_lock_irqsave(&p->pi_lock, flags); 1786 raw_spin_lock_irqsave(&p->pi_lock, flags);
1787 #ifdef CONFIG_SMP 1787 #ifdef CONFIG_SMP
1788 /* 1788 /*
1789 * Fork balancing, do it here and not earlier because: 1789 * Fork balancing, do it here and not earlier because:
1790 * - cpus_allowed can change in the fork path 1790 * - cpus_allowed can change in the fork path
1791 * - any previously selected cpu might disappear through hotplug 1791 * - any previously selected cpu might disappear through hotplug
1792 */ 1792 */
1793 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); 1793 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
1794 #endif 1794 #endif
1795 1795
1796 rq = __task_rq_lock(p); 1796 rq = __task_rq_lock(p);
1797 activate_task(rq, p, 0); 1797 activate_task(rq, p, 0);
1798 p->on_rq = 1; 1798 p->on_rq = 1;
1799 trace_sched_wakeup_new(p, true); 1799 trace_sched_wakeup_new(p, true);
1800 check_preempt_curr(rq, p, WF_FORK); 1800 check_preempt_curr(rq, p, WF_FORK);
1801 #ifdef CONFIG_SMP 1801 #ifdef CONFIG_SMP
1802 if (p->sched_class->task_woken) 1802 if (p->sched_class->task_woken)
1803 p->sched_class->task_woken(rq, p); 1803 p->sched_class->task_woken(rq, p);
1804 #endif 1804 #endif
1805 task_rq_unlock(rq, p, &flags); 1805 task_rq_unlock(rq, p, &flags);
1806 } 1806 }
1807 1807
1808 #ifdef CONFIG_PREEMPT_NOTIFIERS 1808 #ifdef CONFIG_PREEMPT_NOTIFIERS
1809 1809
1810 /** 1810 /**
1811 * preempt_notifier_register - tell me when current is being preempted & rescheduled 1811 * preempt_notifier_register - tell me when current is being preempted & rescheduled
1812 * @notifier: notifier struct to register 1812 * @notifier: notifier struct to register
1813 */ 1813 */
1814 void preempt_notifier_register(struct preempt_notifier *notifier) 1814 void preempt_notifier_register(struct preempt_notifier *notifier)
1815 { 1815 {
1816 hlist_add_head(&notifier->link, &current->preempt_notifiers); 1816 hlist_add_head(&notifier->link, &current->preempt_notifiers);
1817 } 1817 }
1818 EXPORT_SYMBOL_GPL(preempt_notifier_register); 1818 EXPORT_SYMBOL_GPL(preempt_notifier_register);
1819 1819
1820 /** 1820 /**
1821 * preempt_notifier_unregister - no longer interested in preemption notifications 1821 * preempt_notifier_unregister - no longer interested in preemption notifications
1822 * @notifier: notifier struct to unregister 1822 * @notifier: notifier struct to unregister
1823 * 1823 *
1824 * This is safe to call from within a preemption notifier. 1824 * This is safe to call from within a preemption notifier.
1825 */ 1825 */
1826 void preempt_notifier_unregister(struct preempt_notifier *notifier) 1826 void preempt_notifier_unregister(struct preempt_notifier *notifier)
1827 { 1827 {
1828 hlist_del(&notifier->link); 1828 hlist_del(&notifier->link);
1829 } 1829 }
1830 EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 1830 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
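For context, a typical consumer of this API (KVM is the main in-tree user) fills in a struct preempt_ops and attaches a notifier to the current task. The following is a hedged kernel-context sketch, not standalone code, assuming the usual declarations and the preempt_notifier_init() helper from <linux/preempt.h>:

#include <linux/preempt.h>
#include <linux/sched.h>

static void my_sched_in(struct preempt_notifier *pn, int cpu)
{
	/* current is being scheduled back in on @cpu: reload per-task hw state */
}

static void my_sched_out(struct preempt_notifier *pn, struct task_struct *next)
{
	/* current is being preempted in favour of @next: stash hw state */
}

static struct preempt_ops my_preempt_ops = {
	.sched_in	= my_sched_in,
	.sched_out	= my_sched_out,
};

static struct preempt_notifier my_notifier;

static void my_attach_notifier(void)
{
	preempt_notifier_init(&my_notifier, &my_preempt_ops);
	preempt_notifier_register(&my_notifier);	/* registers for current only */
}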
1831 1831
1832 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 1832 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1833 { 1833 {
1834 struct preempt_notifier *notifier; 1834 struct preempt_notifier *notifier;
1835 struct hlist_node *node; 1835 struct hlist_node *node;
1836 1836
1837 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 1837 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1838 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 1838 notifier->ops->sched_in(notifier, raw_smp_processor_id());
1839 } 1839 }
1840 1840
1841 static void 1841 static void
1842 fire_sched_out_preempt_notifiers(struct task_struct *curr, 1842 fire_sched_out_preempt_notifiers(struct task_struct *curr,
1843 struct task_struct *next) 1843 struct task_struct *next)
1844 { 1844 {
1845 struct preempt_notifier *notifier; 1845 struct preempt_notifier *notifier;
1846 struct hlist_node *node; 1846 struct hlist_node *node;
1847 1847
1848 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 1848 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1849 notifier->ops->sched_out(notifier, next); 1849 notifier->ops->sched_out(notifier, next);
1850 } 1850 }
1851 1851
1852 #else /* !CONFIG_PREEMPT_NOTIFIERS */ 1852 #else /* !CONFIG_PREEMPT_NOTIFIERS */
1853 1853
1854 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 1854 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1855 { 1855 {
1856 } 1856 }
1857 1857
1858 static void 1858 static void
1859 fire_sched_out_preempt_notifiers(struct task_struct *curr, 1859 fire_sched_out_preempt_notifiers(struct task_struct *curr,
1860 struct task_struct *next) 1860 struct task_struct *next)
1861 { 1861 {
1862 } 1862 }
1863 1863
1864 #endif /* CONFIG_PREEMPT_NOTIFIERS */ 1864 #endif /* CONFIG_PREEMPT_NOTIFIERS */
1865 1865
1866 /** 1866 /**
1867 * prepare_task_switch - prepare to switch tasks 1867 * prepare_task_switch - prepare to switch tasks
1868 * @rq: the runqueue preparing to switch 1868 * @rq: the runqueue preparing to switch
1869 * @prev: the current task that is being switched out 1869 * @prev: the current task that is being switched out
1870 * @next: the task we are going to switch to. 1870 * @next: the task we are going to switch to.
1871 * 1871 *
1872 * This is called with the rq lock held and interrupts off. It must 1872 * This is called with the rq lock held and interrupts off. It must
1873 * be paired with a subsequent finish_task_switch after the context 1873 * be paired with a subsequent finish_task_switch after the context
1874 * switch. 1874 * switch.
1875 * 1875 *
1876 * prepare_task_switch sets up locking and calls architecture specific 1876 * prepare_task_switch sets up locking and calls architecture specific
1877 * hooks. 1877 * hooks.
1878 */ 1878 */
1879 static inline void 1879 static inline void
1880 prepare_task_switch(struct rq *rq, struct task_struct *prev, 1880 prepare_task_switch(struct rq *rq, struct task_struct *prev,
1881 struct task_struct *next) 1881 struct task_struct *next)
1882 { 1882 {
1883 sched_info_switch(prev, next); 1883 sched_info_switch(prev, next);
1884 perf_event_task_sched_out(prev, next); 1884 perf_event_task_sched_out(prev, next);
1885 fire_sched_out_preempt_notifiers(prev, next); 1885 fire_sched_out_preempt_notifiers(prev, next);
1886 prepare_lock_switch(rq, next); 1886 prepare_lock_switch(rq, next);
1887 prepare_arch_switch(next); 1887 prepare_arch_switch(next);
1888 trace_sched_switch(prev, next); 1888 trace_sched_switch(prev, next);
1889 } 1889 }
1890 1890
1891 /** 1891 /**
1892 * finish_task_switch - clean up after a task-switch 1892 * finish_task_switch - clean up after a task-switch
1893 * @rq: runqueue associated with task-switch 1893 * @rq: runqueue associated with task-switch
1894 * @prev: the thread we just switched away from. 1894 * @prev: the thread we just switched away from.
1895 * 1895 *
1896 * finish_task_switch must be called after the context switch, paired 1896 * finish_task_switch must be called after the context switch, paired
1897 * with a prepare_task_switch call before the context switch. 1897 * with a prepare_task_switch call before the context switch.
1898 * finish_task_switch will reconcile locking set up by prepare_task_switch, 1898 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1899 * and do any other architecture-specific cleanup actions. 1899 * and do any other architecture-specific cleanup actions.
1900 * 1900 *
1901 * Note that we may have delayed dropping an mm in context_switch(). If 1901 * Note that we may have delayed dropping an mm in context_switch(). If
1902 * so, we finish that here outside of the runqueue lock. (Doing it 1902 * so, we finish that here outside of the runqueue lock. (Doing it
1903 * with the lock held can cause deadlocks; see schedule() for 1903 * with the lock held can cause deadlocks; see schedule() for
1904 * details.) 1904 * details.)
1905 */ 1905 */
1906 static void finish_task_switch(struct rq *rq, struct task_struct *prev) 1906 static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1907 __releases(rq->lock) 1907 __releases(rq->lock)
1908 { 1908 {
1909 struct mm_struct *mm = rq->prev_mm; 1909 struct mm_struct *mm = rq->prev_mm;
1910 long prev_state; 1910 long prev_state;
1911 1911
1912 rq->prev_mm = NULL; 1912 rq->prev_mm = NULL;
1913 1913
1914 /* 1914 /*
1915 * A task struct has one reference for the use as "current". 1915 * A task struct has one reference for the use as "current".
1916 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 1916 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1917 * schedule one last time. The schedule call will never return, and 1917 * schedule one last time. The schedule call will never return, and
1918 * the scheduled task must drop that reference. 1918 * the scheduled task must drop that reference.
1919 * The test for TASK_DEAD must occur while the runqueue locks are 1919 * The test for TASK_DEAD must occur while the runqueue locks are
1920 * still held, otherwise prev could be scheduled on another cpu, die 1920 * still held, otherwise prev could be scheduled on another cpu, die
1921 * there before we look at prev->state, and then the reference would 1921 * there before we look at prev->state, and then the reference would
1922 * be dropped twice. 1922 * be dropped twice.
1923 * Manfred Spraul <manfred@colorfullife.com> 1923 * Manfred Spraul <manfred@colorfullife.com>
1924 */ 1924 */
1925 prev_state = prev->state; 1925 prev_state = prev->state;
1926 finish_arch_switch(prev); 1926 finish_arch_switch(prev);
1927 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 1927 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1928 local_irq_disable(); 1928 local_irq_disable();
1929 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1929 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1930 perf_event_task_sched_in(prev, current); 1930 perf_event_task_sched_in(prev, current);
1931 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 1931 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1932 local_irq_enable(); 1932 local_irq_enable();
1933 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1933 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1934 finish_lock_switch(rq, prev); 1934 finish_lock_switch(rq, prev);
1935 1935
1936 fire_sched_in_preempt_notifiers(current); 1936 fire_sched_in_preempt_notifiers(current);
1937 if (mm) 1937 if (mm)
1938 mmdrop(mm); 1938 mmdrop(mm);
1939 if (unlikely(prev_state == TASK_DEAD)) { 1939 if (unlikely(prev_state == TASK_DEAD)) {
1940 /* 1940 /*
1941 * Remove function-return probe instances associated with this 1941 * Remove function-return probe instances associated with this
1942 * task and put them back on the free list. 1942 * task and put them back on the free list.
1943 */ 1943 */
1944 kprobe_flush_task(prev); 1944 kprobe_flush_task(prev);
1945 put_task_struct(prev); 1945 put_task_struct(prev);
1946 } 1946 }
1947 } 1947 }
1948 1948
1949 #ifdef CONFIG_SMP 1949 #ifdef CONFIG_SMP
1950 1950
1951 /* assumes rq->lock is held */ 1951 /* assumes rq->lock is held */
1952 static inline void pre_schedule(struct rq *rq, struct task_struct *prev) 1952 static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
1953 { 1953 {
1954 if (prev->sched_class->pre_schedule) 1954 if (prev->sched_class->pre_schedule)
1955 prev->sched_class->pre_schedule(rq, prev); 1955 prev->sched_class->pre_schedule(rq, prev);
1956 } 1956 }
1957 1957
1958 /* rq->lock is NOT held, but preemption is disabled */ 1958 /* rq->lock is NOT held, but preemption is disabled */
1959 static inline void post_schedule(struct rq *rq) 1959 static inline void post_schedule(struct rq *rq)
1960 { 1960 {
1961 if (rq->post_schedule) { 1961 if (rq->post_schedule) {
1962 unsigned long flags; 1962 unsigned long flags;
1963 1963
1964 raw_spin_lock_irqsave(&rq->lock, flags); 1964 raw_spin_lock_irqsave(&rq->lock, flags);
1965 if (rq->curr->sched_class->post_schedule) 1965 if (rq->curr->sched_class->post_schedule)
1966 rq->curr->sched_class->post_schedule(rq); 1966 rq->curr->sched_class->post_schedule(rq);
1967 raw_spin_unlock_irqrestore(&rq->lock, flags); 1967 raw_spin_unlock_irqrestore(&rq->lock, flags);
1968 1968
1969 rq->post_schedule = 0; 1969 rq->post_schedule = 0;
1970 } 1970 }
1971 } 1971 }
1972 1972
1973 #else 1973 #else
1974 1974
1975 static inline void pre_schedule(struct rq *rq, struct task_struct *p) 1975 static inline void pre_schedule(struct rq *rq, struct task_struct *p)
1976 { 1976 {
1977 } 1977 }
1978 1978
1979 static inline void post_schedule(struct rq *rq) 1979 static inline void post_schedule(struct rq *rq)
1980 { 1980 {
1981 } 1981 }
1982 1982
1983 #endif 1983 #endif
1984 1984
1985 /** 1985 /**
1986 * schedule_tail - first thing a freshly forked thread must call. 1986 * schedule_tail - first thing a freshly forked thread must call.
1987 * @prev: the thread we just switched away from. 1987 * @prev: the thread we just switched away from.
1988 */ 1988 */
1989 asmlinkage void schedule_tail(struct task_struct *prev) 1989 asmlinkage void schedule_tail(struct task_struct *prev)
1990 __releases(rq->lock) 1990 __releases(rq->lock)
1991 { 1991 {
1992 struct rq *rq = this_rq(); 1992 struct rq *rq = this_rq();
1993 1993
1994 finish_task_switch(rq, prev); 1994 finish_task_switch(rq, prev);
1995 1995
1996 /* 1996 /*
1997 * FIXME: do we need to worry about rq being invalidated by the 1997 * FIXME: do we need to worry about rq being invalidated by the
1998 * task_switch? 1998 * task_switch?
1999 */ 1999 */
2000 post_schedule(rq); 2000 post_schedule(rq);
2001 2001
2002 #ifdef __ARCH_WANT_UNLOCKED_CTXSW 2002 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
2003 /* In this case, finish_task_switch does not reenable preemption */ 2003 /* In this case, finish_task_switch does not reenable preemption */
2004 preempt_enable(); 2004 preempt_enable();
2005 #endif 2005 #endif
2006 if (current->set_child_tid) 2006 if (current->set_child_tid)
2007 put_user(task_pid_vnr(current), current->set_child_tid); 2007 put_user(task_pid_vnr(current), current->set_child_tid);
2008 } 2008 }
2009 2009
2010 /* 2010 /*
2011 * context_switch - switch to the new MM and the new 2011 * context_switch - switch to the new MM and the new
2012 * thread's register state. 2012 * thread's register state.
2013 */ 2013 */
2014 static inline void 2014 static inline void
2015 context_switch(struct rq *rq, struct task_struct *prev, 2015 context_switch(struct rq *rq, struct task_struct *prev,
2016 struct task_struct *next) 2016 struct task_struct *next)
2017 { 2017 {
2018 struct mm_struct *mm, *oldmm; 2018 struct mm_struct *mm, *oldmm;
2019 2019
2020 prepare_task_switch(rq, prev, next); 2020 prepare_task_switch(rq, prev, next);
2021 2021
2022 mm = next->mm; 2022 mm = next->mm;
2023 oldmm = prev->active_mm; 2023 oldmm = prev->active_mm;
2024 /* 2024 /*
2025 * For paravirt, this is coupled with an exit in switch_to to 2025 * For paravirt, this is coupled with an exit in switch_to to
2026 * combine the page table reload and the switch backend into 2026 * combine the page table reload and the switch backend into
2027 * one hypercall. 2027 * one hypercall.
2028 */ 2028 */
2029 arch_start_context_switch(prev); 2029 arch_start_context_switch(prev);
2030 2030
2031 if (!mm) { 2031 if (!mm) {
2032 next->active_mm = oldmm; 2032 next->active_mm = oldmm;
2033 atomic_inc(&oldmm->mm_count); 2033 atomic_inc(&oldmm->mm_count);
2034 enter_lazy_tlb(oldmm, next); 2034 enter_lazy_tlb(oldmm, next);
2035 } else 2035 } else
2036 switch_mm(oldmm, mm, next); 2036 switch_mm(oldmm, mm, next);
2037 2037
2038 if (!prev->mm) { 2038 if (!prev->mm) {
2039 prev->active_mm = NULL; 2039 prev->active_mm = NULL;
2040 rq->prev_mm = oldmm; 2040 rq->prev_mm = oldmm;
2041 } 2041 }
2042 /* 2042 /*
2043 * Since the runqueue lock will be released by the next 2043 * Since the runqueue lock will be released by the next
2044 * task (which is an invalid locking op but in the case 2044 * task (which is an invalid locking op but in the case
 2045 * of the scheduler it's an obvious special-case), we 2045 * of the scheduler it's an obvious special-case), we
2046 * do an early lockdep release here: 2046 * do an early lockdep release here:
2047 */ 2047 */
2048 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 2048 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
2049 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2049 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2050 #endif 2050 #endif
2051 2051
2052 /* Here we just switch the register state and the stack. */ 2052 /* Here we just switch the register state and the stack. */
2053 switch_to(prev, next, prev); 2053 switch_to(prev, next, prev);
2054 2054
2055 barrier(); 2055 barrier();
2056 /* 2056 /*
2057 * this_rq must be evaluated again because prev may have moved 2057 * this_rq must be evaluated again because prev may have moved
2058 * CPUs since it called schedule(), thus the 'rq' on its stack 2058 * CPUs since it called schedule(), thus the 'rq' on its stack
2059 * frame will be invalid. 2059 * frame will be invalid.
2060 */ 2060 */
2061 finish_task_switch(this_rq(), prev); 2061 finish_task_switch(this_rq(), prev);
2062 } 2062 }
2063 2063
2064 /* 2064 /*
2065 * nr_running, nr_uninterruptible and nr_context_switches: 2065 * nr_running, nr_uninterruptible and nr_context_switches:
2066 * 2066 *
2067 * externally visible scheduler statistics: current number of runnable 2067 * externally visible scheduler statistics: current number of runnable
2068 * threads, current number of uninterruptible-sleeping threads, total 2068 * threads, current number of uninterruptible-sleeping threads, total
2069 * number of context switches performed since bootup. 2069 * number of context switches performed since bootup.
2070 */ 2070 */
2071 unsigned long nr_running(void) 2071 unsigned long nr_running(void)
2072 { 2072 {
2073 unsigned long i, sum = 0; 2073 unsigned long i, sum = 0;
2074 2074
2075 for_each_online_cpu(i) 2075 for_each_online_cpu(i)
2076 sum += cpu_rq(i)->nr_running; 2076 sum += cpu_rq(i)->nr_running;
2077 2077
2078 return sum; 2078 return sum;
2079 } 2079 }
2080 2080
2081 unsigned long nr_uninterruptible(void) 2081 unsigned long nr_uninterruptible(void)
2082 { 2082 {
2083 unsigned long i, sum = 0; 2083 unsigned long i, sum = 0;
2084 2084
2085 for_each_possible_cpu(i) 2085 for_each_possible_cpu(i)
2086 sum += cpu_rq(i)->nr_uninterruptible; 2086 sum += cpu_rq(i)->nr_uninterruptible;
2087 2087
2088 /* 2088 /*
2089 * Since we read the counters lockless, it might be slightly 2089 * Since we read the counters lockless, it might be slightly
2090 * inaccurate. Do not allow it to go below zero though: 2090 * inaccurate. Do not allow it to go below zero though:
2091 */ 2091 */
2092 if (unlikely((long)sum < 0)) 2092 if (unlikely((long)sum < 0))
2093 sum = 0; 2093 sum = 0;
2094 2094
2095 return sum; 2095 return sum;
2096 } 2096 }
2097 2097
2098 unsigned long long nr_context_switches(void) 2098 unsigned long long nr_context_switches(void)
2099 { 2099 {
2100 int i; 2100 int i;
2101 unsigned long long sum = 0; 2101 unsigned long long sum = 0;
2102 2102
2103 for_each_possible_cpu(i) 2103 for_each_possible_cpu(i)
2104 sum += cpu_rq(i)->nr_switches; 2104 sum += cpu_rq(i)->nr_switches;
2105 2105
2106 return sum; 2106 return sum;
2107 } 2107 }
2108 2108
2109 unsigned long nr_iowait(void) 2109 unsigned long nr_iowait(void)
2110 { 2110 {
2111 unsigned long i, sum = 0; 2111 unsigned long i, sum = 0;
2112 2112
2113 for_each_possible_cpu(i) 2113 for_each_possible_cpu(i)
2114 sum += atomic_read(&cpu_rq(i)->nr_iowait); 2114 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2115 2115
2116 return sum; 2116 return sum;
2117 } 2117 }
2118 2118
2119 unsigned long nr_iowait_cpu(int cpu) 2119 unsigned long nr_iowait_cpu(int cpu)
2120 { 2120 {
2121 struct rq *this = cpu_rq(cpu); 2121 struct rq *this = cpu_rq(cpu);
2122 return atomic_read(&this->nr_iowait); 2122 return atomic_read(&this->nr_iowait);
2123 } 2123 }
2124 2124
2125 unsigned long this_cpu_load(void) 2125 unsigned long this_cpu_load(void)
2126 { 2126 {
2127 struct rq *this = this_rq(); 2127 struct rq *this = this_rq();
2128 return this->cpu_load[0]; 2128 return this->cpu_load[0];
2129 } 2129 }
2130 2130
2131 2131
2132 /* Variables and functions for calc_load */ 2132 /* Variables and functions for calc_load */
2133 static atomic_long_t calc_load_tasks; 2133 static atomic_long_t calc_load_tasks;
2134 static unsigned long calc_load_update; 2134 static unsigned long calc_load_update;
2135 unsigned long avenrun[3]; 2135 unsigned long avenrun[3];
2136 EXPORT_SYMBOL(avenrun); 2136 EXPORT_SYMBOL(avenrun);
2137 2137
2138 static long calc_load_fold_active(struct rq *this_rq) 2138 static long calc_load_fold_active(struct rq *this_rq)
2139 { 2139 {
2140 long nr_active, delta = 0; 2140 long nr_active, delta = 0;
2141 2141
2142 nr_active = this_rq->nr_running; 2142 nr_active = this_rq->nr_running;
2143 nr_active += (long) this_rq->nr_uninterruptible; 2143 nr_active += (long) this_rq->nr_uninterruptible;
2144 2144
2145 if (nr_active != this_rq->calc_load_active) { 2145 if (nr_active != this_rq->calc_load_active) {
2146 delta = nr_active - this_rq->calc_load_active; 2146 delta = nr_active - this_rq->calc_load_active;
2147 this_rq->calc_load_active = nr_active; 2147 this_rq->calc_load_active = nr_active;
2148 } 2148 }
2149 2149
2150 return delta; 2150 return delta;
2151 } 2151 }
2152 2152
2153 static unsigned long 2153 static unsigned long
2154 calc_load(unsigned long load, unsigned long exp, unsigned long active) 2154 calc_load(unsigned long load, unsigned long exp, unsigned long active)
2155 { 2155 {
2156 load *= exp; 2156 load *= exp;
2157 load += active * (FIXED_1 - exp); 2157 load += active * (FIXED_1 - exp);
2158 load += 1UL << (FSHIFT - 1); 2158 load += 1UL << (FSHIFT - 1);
2159 return load >> FSHIFT; 2159 return load >> FSHIFT;
2160 } 2160 }
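
The weighting above is plain fixed-point arithmetic; the constants it relies on (FSHIFT = 11, FIXED_1 = 1 << 11, and EXP_1/EXP_5/EXP_15 = 1884/2014/2037) live in include/linux/sched.h in this era of the tree. A minimal userspace sketch, with those constants copied by hand (illustrative only, not part of this patch), shows how quickly the three averages chase a constant number of active tasks:

/* Userspace sketch of calc_load(); not kernel code. */
#include <stdio.h>

#define FSHIFT   11                     /* nr of bits of precision */
#define FIXED_1  (1UL << FSHIFT)        /* 1.0 as fixed-point */
#define EXP_1    1884UL                 /* 1/exp(5sec/1min) as fixed-point */
#define EXP_5    2014UL                 /* 1/exp(5sec/5min) */
#define EXP_15   2037UL                 /* 1/exp(5sec/15min) */

static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);	/* round to nearest */
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun[3] = { 0, 0, 0 };
	unsigned long active = 2 * FIXED_1;	/* pretend 2 tasks stay runnable */
	int i;

	/* Each iteration corresponds to one LOAD_FREQ (~5 s) interval. */
	for (i = 1; i <= 24; i++) {
		avenrun[0] = calc_load(avenrun[0], EXP_1, active);
		avenrun[1] = calc_load(avenrun[1], EXP_5, active);
		avenrun[2] = calc_load(avenrun[2], EXP_15, active);
		if (i % 6 == 0)	/* print roughly every 30 s */
			printf("%3d: %lu.%02lu %lu.%02lu %lu.%02lu\n", i,
			       avenrun[0] >> FSHIFT,
			       (avenrun[0] & (FIXED_1 - 1)) * 100 / FIXED_1,
			       avenrun[1] >> FSHIFT,
			       (avenrun[1] & (FIXED_1 - 1)) * 100 / FIXED_1,
			       avenrun[2] >> FSHIFT,
			       (avenrun[2] & (FIXED_1 - 1)) * 100 / FIXED_1);
	}
	return 0;
}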
2161 2161
2162 #ifdef CONFIG_NO_HZ 2162 #ifdef CONFIG_NO_HZ
2163 /* 2163 /*
2164 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 2164 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
2165 * 2165 *
2166 * When making the ILB scale, we should try to pull this in as well. 2166 * When making the ILB scale, we should try to pull this in as well.
2167 */ 2167 */
2168 static atomic_long_t calc_load_tasks_idle; 2168 static atomic_long_t calc_load_tasks_idle;
2169 2169
2170 void calc_load_account_idle(struct rq *this_rq) 2170 void calc_load_account_idle(struct rq *this_rq)
2171 { 2171 {
2172 long delta; 2172 long delta;
2173 2173
2174 delta = calc_load_fold_active(this_rq); 2174 delta = calc_load_fold_active(this_rq);
2175 if (delta) 2175 if (delta)
2176 atomic_long_add(delta, &calc_load_tasks_idle); 2176 atomic_long_add(delta, &calc_load_tasks_idle);
2177 } 2177 }
2178 2178
2179 static long calc_load_fold_idle(void) 2179 static long calc_load_fold_idle(void)
2180 { 2180 {
2181 long delta = 0; 2181 long delta = 0;
2182 2182
2183 /* 2183 /*
 2184 * It's got a race; we don't care... 2184 * It's got a race; we don't care...
2185 */ 2185 */
2186 if (atomic_long_read(&calc_load_tasks_idle)) 2186 if (atomic_long_read(&calc_load_tasks_idle))
2187 delta = atomic_long_xchg(&calc_load_tasks_idle, 0); 2187 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
2188 2188
2189 return delta; 2189 return delta;
2190 } 2190 }
2191 2191
2192 /** 2192 /**
2193 * fixed_power_int - compute: x^n, in O(log n) time 2193 * fixed_power_int - compute: x^n, in O(log n) time
2194 * 2194 *
2195 * @x: base of the power 2195 * @x: base of the power
2196 * @frac_bits: fractional bits of @x 2196 * @frac_bits: fractional bits of @x
2197 * @n: power to raise @x to. 2197 * @n: power to raise @x to.
2198 * 2198 *
2199 * By exploiting the relation between the definition of the natural power 2199 * By exploiting the relation between the definition of the natural power
2200 * function: x^n := x*x*...*x (x multiplied by itself for n times), and 2200 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
2201 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, 2201 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
2202 * (where: n_i \elem {0, 1}, the binary vector representing n), 2202 * (where: n_i \elem {0, 1}, the binary vector representing n),
2203 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is 2203 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
2204 * of course trivially computable in O(log_2 n), the length of our binary 2204 * of course trivially computable in O(log_2 n), the length of our binary
2205 * vector. 2205 * vector.
2206 */ 2206 */
2207 static unsigned long 2207 static unsigned long
2208 fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) 2208 fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
2209 { 2209 {
2210 unsigned long result = 1UL << frac_bits; 2210 unsigned long result = 1UL << frac_bits;
2211 2211
2212 if (n) for (;;) { 2212 if (n) for (;;) {
2213 if (n & 1) { 2213 if (n & 1) {
2214 result *= x; 2214 result *= x;
2215 result += 1UL << (frac_bits - 1); 2215 result += 1UL << (frac_bits - 1);
2216 result >>= frac_bits; 2216 result >>= frac_bits;
2217 } 2217 }
2218 n >>= 1; 2218 n >>= 1;
2219 if (!n) 2219 if (!n)
2220 break; 2220 break;
2221 x *= x; 2221 x *= x;
2222 x += 1UL << (frac_bits - 1); 2222 x += 1UL << (frac_bits - 1);
2223 x >>= frac_bits; 2223 x >>= frac_bits;
2224 } 2224 }
2225 2225
2226 return result; 2226 return result;
2227 } 2227 }
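
To see that the loop above really is exponentiation by squaring on fixed-point values, it can be lifted into a standalone program and compared against floating point; the columns should agree to within a few least-significant counts, since rounding happens at every step. A sketch (FSHIFT/FIXED_1/EXP_1 copied by hand from include/linux/sched.h; build with -lm):

/* Userspace check of the O(log n) fixed-point power; not kernel code. */
#include <stdio.h>
#include <math.h>

#define FSHIFT	11
#define FIXED_1	(1UL << FSHIFT)
#define EXP_1	1884UL			/* ~0.92 in 11-bit fixed point */

static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	if (n) for (;;) {
		if (n & 1) {
			result *= x;
			result += 1UL << (frac_bits - 1);
			result >>= frac_bits;
		}
		n >>= 1;
		if (!n)
			break;
		x *= x;
		x += 1UL << (frac_bits - 1);
		x >>= frac_bits;
	}

	return result;
}

int main(void)
{
	unsigned int n;

	for (n = 0; n <= 32; n += 4) {
		unsigned long fp = fixed_power_int(EXP_1, FSHIFT, n);
		double exact = pow((double)EXP_1 / FIXED_1, n) * FIXED_1;

		printf("n=%2u  fixed=%4lu  exact=%7.2f\n", n, fp, exact);
	}
	return 0;
}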
2228 2228
2229 /* 2229 /*
2230 * a1 = a0 * e + a * (1 - e) 2230 * a1 = a0 * e + a * (1 - e)
2231 * 2231 *
2232 * a2 = a1 * e + a * (1 - e) 2232 * a2 = a1 * e + a * (1 - e)
2233 * = (a0 * e + a * (1 - e)) * e + a * (1 - e) 2233 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
2234 * = a0 * e^2 + a * (1 - e) * (1 + e) 2234 * = a0 * e^2 + a * (1 - e) * (1 + e)
2235 * 2235 *
2236 * a3 = a2 * e + a * (1 - e) 2236 * a3 = a2 * e + a * (1 - e)
2237 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) 2237 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
2238 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) 2238 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
2239 * 2239 *
2240 * ... 2240 * ...
2241 * 2241 *
2242 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] 2242 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
2243 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) 2243 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
2244 * = a0 * e^n + a * (1 - e^n) 2244 * = a0 * e^n + a * (1 - e^n)
2245 * 2245 *
2246 * [1] application of the geometric series: 2246 * [1] application of the geometric series:
2247 * 2247 *
2248 * n 1 - x^(n+1) 2248 * n 1 - x^(n+1)
2249 * S_n := \Sum x^i = ------------- 2249 * S_n := \Sum x^i = -------------
2250 * i=0 1 - x 2250 * i=0 1 - x
2251 */ 2251 */
2252 static unsigned long 2252 static unsigned long
2253 calc_load_n(unsigned long load, unsigned long exp, 2253 calc_load_n(unsigned long load, unsigned long exp,
2254 unsigned long active, unsigned int n) 2254 unsigned long active, unsigned int n)
2255 { 2255 {
2256 2256
2257 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); 2257 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
2258 } 2258 }
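
The closed form an = a0 * e^n + a * (1 - e^n) derived above is exactly what calc_load_n() evaluates. Numerically, n successive calc_load() steps with a constant 'active' land within a couple of fixed-point counts of one calc_load_n() call; exact equality is not expected because the rounding happens in a different order. A self-contained sketch (constants again copied by hand from include/linux/sched.h):

/* Userspace check of the geometric-series shortcut; not kernel code. */
#include <stdio.h>

#define FSHIFT	11
#define FIXED_1	(1UL << FSHIFT)
#define EXP_1	1884UL

static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);
	return load >> FSHIFT;
}

static unsigned long fixed_power_int(unsigned long x, unsigned int frac_bits,
				     unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	while (n) {
		if (n & 1) {
			result *= x;
			result += 1UL << (frac_bits - 1);
			result >>= frac_bits;
		}
		n >>= 1;
		if (!n)
			break;
		x *= x;
		x += 1UL << (frac_bits - 1);
		x >>= frac_bits;
	}
	return result;
}

int main(void)
{
	unsigned long a0 = 3 * FIXED_1, active = FIXED_1;	/* 3.00 decaying toward 1.00 */
	unsigned long iter = a0, n;

	for (n = 1; n <= 10; n++) {
		iter = calc_load(iter, EXP_1, active);		/* n single steps */
		/* closed form: one calc_load() with exp raised to the n-th power */
		printf("n=%2lu  iterated=%4lu  closed-form=%4lu\n", n, iter,
		       calc_load(a0, fixed_power_int(EXP_1, FSHIFT, n), active));
	}
	return 0;
}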
2259 2259
2260 /* 2260 /*
2261 * NO_HZ can leave us missing all per-cpu ticks calling 2261 * NO_HZ can leave us missing all per-cpu ticks calling
2262 * calc_load_account_active(), but since an idle CPU folds its delta into 2262 * calc_load_account_active(), but since an idle CPU folds its delta into
2263 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold 2263 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
2264 * in the pending idle delta if our idle period crossed a load cycle boundary. 2264 * in the pending idle delta if our idle period crossed a load cycle boundary.
2265 * 2265 *
2266 * Once we've updated the global active value, we need to apply the exponential 2266 * Once we've updated the global active value, we need to apply the exponential
2267 * weights adjusted to the number of cycles missed. 2267 * weights adjusted to the number of cycles missed.
2268 */ 2268 */
2269 static void calc_global_nohz(unsigned long ticks) 2269 static void calc_global_nohz(unsigned long ticks)
2270 { 2270 {
2271 long delta, active, n; 2271 long delta, active, n;
2272 2272
2273 if (time_before(jiffies, calc_load_update)) 2273 if (time_before(jiffies, calc_load_update))
2274 return; 2274 return;
2275 2275
2276 /* 2276 /*
2277 * If we crossed a calc_load_update boundary, make sure to fold 2277 * If we crossed a calc_load_update boundary, make sure to fold
 2278 * any pending idle changes; the respective CPUs might have 2278 * any pending idle changes; the respective CPUs might have
2279 * missed the tick driven calc_load_account_active() update 2279 * missed the tick driven calc_load_account_active() update
2280 * due to NO_HZ. 2280 * due to NO_HZ.
2281 */ 2281 */
2282 delta = calc_load_fold_idle(); 2282 delta = calc_load_fold_idle();
2283 if (delta) 2283 if (delta)
2284 atomic_long_add(delta, &calc_load_tasks); 2284 atomic_long_add(delta, &calc_load_tasks);
2285 2285
2286 /* 2286 /*
2287 * If we were idle for multiple load cycles, apply them. 2287 * If we were idle for multiple load cycles, apply them.
2288 */ 2288 */
2289 if (ticks >= LOAD_FREQ) { 2289 if (ticks >= LOAD_FREQ) {
2290 n = ticks / LOAD_FREQ; 2290 n = ticks / LOAD_FREQ;
2291 2291
2292 active = atomic_long_read(&calc_load_tasks); 2292 active = atomic_long_read(&calc_load_tasks);
2293 active = active > 0 ? active * FIXED_1 : 0; 2293 active = active > 0 ? active * FIXED_1 : 0;
2294 2294
2295 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); 2295 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2296 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); 2296 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2297 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); 2297 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2298 2298
2299 calc_load_update += n * LOAD_FREQ; 2299 calc_load_update += n * LOAD_FREQ;
2300 } 2300 }
2301 2301
2302 /* 2302 /*
 2303 * It's possible that the remainder of the above division also crosses 2303 * It's possible that the remainder of the above division also crosses
 2304 * a LOAD_FREQ period; the regular check in calc_global_load(), 2304 * a LOAD_FREQ period; the regular check in calc_global_load(),
 2305 * which comes after this, will take care of that. 2305 * which comes after this, will take care of that.
 2306 * 2306 *
 2307 * For example, consider us being 11 ticks before a cycle completion and 2307 * For example, consider us being 11 ticks before a cycle completion and
 2308 * sleeping for 4*LOAD_FREQ + 22 ticks; the above code will then 2308 * sleeping for 4*LOAD_FREQ + 22 ticks; the above code will then
 2309 * age us 4 cycles, and the test in calc_global_load() will 2309 * age us 4 cycles, and the test in calc_global_load() will
 2310 * pick up the final one. 2310 * pick up the final one.
2311 */ 2311 */
2312 } 2312 }
2313 #else 2313 #else
2314 void calc_load_account_idle(struct rq *this_rq) 2314 void calc_load_account_idle(struct rq *this_rq)
2315 { 2315 {
2316 } 2316 }
2317 2317
2318 static inline long calc_load_fold_idle(void) 2318 static inline long calc_load_fold_idle(void)
2319 { 2319 {
2320 return 0; 2320 return 0;
2321 } 2321 }
2322 2322
2323 static void calc_global_nohz(unsigned long ticks) 2323 static void calc_global_nohz(unsigned long ticks)
2324 { 2324 {
2325 } 2325 }
2326 #endif 2326 #endif
2327 2327
2328 /** 2328 /**
2329 * get_avenrun - get the load average array 2329 * get_avenrun - get the load average array
2330 * @loads: pointer to dest load array 2330 * @loads: pointer to dest load array
2331 * @offset: offset to add 2331 * @offset: offset to add
2332 * @shift: shift count to shift the result left 2332 * @shift: shift count to shift the result left
2333 * 2333 *
2334 * These values are estimates at best, so no need for locking. 2334 * These values are estimates at best, so no need for locking.
2335 */ 2335 */
2336 void get_avenrun(unsigned long *loads, unsigned long offset, int shift) 2336 void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2337 { 2337 {
2338 loads[0] = (avenrun[0] + offset) << shift; 2338 loads[0] = (avenrun[0] + offset) << shift;
2339 loads[1] = (avenrun[1] + offset) << shift; 2339 loads[1] = (avenrun[1] + offset) << shift;
2340 loads[2] = (avenrun[2] + offset) << shift; 2340 loads[2] = (avenrun[2] + offset) << shift;
2341 } 2341 }
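
For context on how these raw fixed-point values end up as the familiar "0.84 0.44 0.15" strings: fs/proc/loadavg.c in this era passes an offset of FIXED_1/200 (half of one hundredth, for rounding) and formats the result with the LOAD_INT()/LOAD_FRAC() macros from include/linux/sched.h. A userspace sketch of that formatting, with the macros copied by hand and the avenrun values invented for illustration:

/* How a raw avenrun[] value becomes "0.84"-style output; not kernel code.
 * LOAD_INT/LOAD_FRAC and the FIXED_1/200 offset are copied by hand from
 * include/linux/sched.h and fs/proc/loadavg.c of this era.
 */
#include <stdio.h>

#define FSHIFT		11
#define FIXED_1		(1UL << FSHIFT)
#define LOAD_INT(x)	((x) >> FSHIFT)
#define LOAD_FRAC(x)	LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
	/* Pretend get_avenrun() handed us these fixed-point values. */
	unsigned long avnrun[3] = { 1720, 901, 311 };
	int i;

	for (i = 0; i < 3; i++) {
		/* same rounding offset /proc/loadavg passes via get_avenrun() */
		unsigned long v = avnrun[i] + FIXED_1 / 200;

		printf("%lu.%02lu ", LOAD_INT(v), LOAD_FRAC(v));
	}
	printf("\n");	/* prints roughly "0.84 0.44 0.15" */
	return 0;
}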
2342 2342
2343 /* 2343 /*
 2344 * calc_global_load - update the avenrun load estimates 10 ticks after the 2344 * calc_global_load - update the avenrun load estimates 10 ticks after the
2345 * CPUs have updated calc_load_tasks. 2345 * CPUs have updated calc_load_tasks.
2346 */ 2346 */
2347 void calc_global_load(unsigned long ticks) 2347 void calc_global_load(unsigned long ticks)
2348 { 2348 {
2349 long active; 2349 long active;
2350 2350
2351 calc_global_nohz(ticks); 2351 calc_global_nohz(ticks);
2352 2352
2353 if (time_before(jiffies, calc_load_update + 10)) 2353 if (time_before(jiffies, calc_load_update + 10))
2354 return; 2354 return;
2355 2355
2356 active = atomic_long_read(&calc_load_tasks); 2356 active = atomic_long_read(&calc_load_tasks);
2357 active = active > 0 ? active * FIXED_1 : 0; 2357 active = active > 0 ? active * FIXED_1 : 0;
2358 2358
2359 avenrun[0] = calc_load(avenrun[0], EXP_1, active); 2359 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2360 avenrun[1] = calc_load(avenrun[1], EXP_5, active); 2360 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2361 avenrun[2] = calc_load(avenrun[2], EXP_15, active); 2361 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2362 2362
2363 calc_load_update += LOAD_FREQ; 2363 calc_load_update += LOAD_FREQ;
2364 } 2364 }
2365 2365
2366 /* 2366 /*
 2367 * Called from update_cpu_load_active() to periodically update this CPU's 2367 * Called from update_cpu_load_active() to periodically update this CPU's
2368 * active count. 2368 * active count.
2369 */ 2369 */
2370 static void calc_load_account_active(struct rq *this_rq) 2370 static void calc_load_account_active(struct rq *this_rq)
2371 { 2371 {
2372 long delta; 2372 long delta;
2373 2373
2374 if (time_before(jiffies, this_rq->calc_load_update)) 2374 if (time_before(jiffies, this_rq->calc_load_update))
2375 return; 2375 return;
2376 2376
2377 delta = calc_load_fold_active(this_rq); 2377 delta = calc_load_fold_active(this_rq);
2378 delta += calc_load_fold_idle(); 2378 delta += calc_load_fold_idle();
2379 if (delta) 2379 if (delta)
2380 atomic_long_add(delta, &calc_load_tasks); 2380 atomic_long_add(delta, &calc_load_tasks);
2381 2381
2382 this_rq->calc_load_update += LOAD_FREQ; 2382 this_rq->calc_load_update += LOAD_FREQ;
2383 } 2383 }
2384 2384
2385 /* 2385 /*
2386 * The exact cpuload at various idx values, calculated at every tick would be 2386 * The exact cpuload at various idx values, calculated at every tick would be
2387 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load 2387 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
2388 * 2388 *
 2389 * If a cpu misses updates for n-1 ticks (because it was idle) and the update gets called 2389 * If a cpu misses updates for n-1 ticks (because it was idle) and the update gets called
 2390 * on the nth tick, when the cpu may be busy, then we have: 2390 * on the nth tick, when the cpu may be busy, then we have:
 2391 * load = ((2^idx - 1) / 2^idx)^(n-1) * load 2391 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
 2392 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load 2392 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
2393 * 2393 *
2394 * decay_load_missed() below does efficient calculation of 2394 * decay_load_missed() below does efficient calculation of
2395 * load = ((2^idx - 1) / 2^idx)^(n-1) * load 2395 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
2396 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load 2396 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
2397 * 2397 *
2398 * The calculation is approximated on a 128 point scale. 2398 * The calculation is approximated on a 128 point scale.
2399 * degrade_zero_ticks is the number of ticks after which load at any 2399 * degrade_zero_ticks is the number of ticks after which load at any
2400 * particular idx is approximated to be zero. 2400 * particular idx is approximated to be zero.
2401 * degrade_factor is a precomputed table, a row for each load idx. 2401 * degrade_factor is a precomputed table, a row for each load idx.
2402 * Each column corresponds to degradation factor for a power of two ticks, 2402 * Each column corresponds to degradation factor for a power of two ticks,
2403 * based on 128 point scale. 2403 * based on 128 point scale.
2404 * Example: 2404 * Example:
2405 * row 2, col 3 (=12) says that the degradation at load idx 2 after 2405 * row 2, col 3 (=12) says that the degradation at load idx 2 after
2406 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). 2406 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
2407 * 2407 *
2408 * With this power of 2 load factors, we can degrade the load n times 2408 * With this power of 2 load factors, we can degrade the load n times
2409 * by looking at 1 bits in n and doing as many mult/shift instead of 2409 * by looking at 1 bits in n and doing as many mult/shift instead of
2410 * n mult/shifts needed by the exact degradation. 2410 * n mult/shifts needed by the exact degradation.
2411 */ 2411 */
2412 #define DEGRADE_SHIFT 7 2412 #define DEGRADE_SHIFT 7
2413 static const unsigned char 2413 static const unsigned char
2414 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; 2414 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
2415 static const unsigned char 2415 static const unsigned char
2416 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { 2416 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
2417 {0, 0, 0, 0, 0, 0, 0, 0}, 2417 {0, 0, 0, 0, 0, 0, 0, 0},
2418 {64, 32, 8, 0, 0, 0, 0, 0}, 2418 {64, 32, 8, 0, 0, 0, 0, 0},
2419 {96, 72, 40, 12, 1, 0, 0}, 2419 {96, 72, 40, 12, 1, 0, 0},
2420 {112, 98, 75, 43, 15, 1, 0}, 2420 {112, 98, 75, 43, 15, 1, 0},
2421 {120, 112, 98, 76, 45, 16, 2} }; 2421 {120, 112, 98, 76, 45, 16, 2} };
2422 2422
2423 /* 2423 /*
 2424 * Update cpu_load for any missed ticks due to tickless idle. The backlog 2424 * Update cpu_load for any missed ticks due to tickless idle. The backlog
 2425 * case arises when the CPU was idle, so we just decay the old load without 2425 * case arises when the CPU was idle, so we just decay the old load without
 2426 * adding any new load. 2426 * adding any new load.
2427 */ 2427 */
2428 static unsigned long 2428 static unsigned long
2429 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) 2429 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2430 { 2430 {
2431 int j = 0; 2431 int j = 0;
2432 2432
2433 if (!missed_updates) 2433 if (!missed_updates)
2434 return load; 2434 return load;
2435 2435
2436 if (missed_updates >= degrade_zero_ticks[idx]) 2436 if (missed_updates >= degrade_zero_ticks[idx])
2437 return 0; 2437 return 0;
2438 2438
2439 if (idx == 1) 2439 if (idx == 1)
2440 return load >> missed_updates; 2440 return load >> missed_updates;
2441 2441
2442 while (missed_updates) { 2442 while (missed_updates) {
2443 if (missed_updates % 2) 2443 if (missed_updates % 2)
2444 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; 2444 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
2445 2445
2446 missed_updates >>= 1; 2446 missed_updates >>= 1;
2447 j++; 2447 j++;
2448 } 2448 }
2449 return load; 2449 return load;
2450 } 2450 }
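
The precomputed entries can be re-derived from the formula in the big comment above: column j of row idx approximates ((2^idx - 1) / 2^idx)^(2^j) on the 128-point scale, so row 2, column 3 is (3/4)^8 * 128 ~= 12.8, stored as 12. A userspace sketch that prints the exact factors next to the table (table copied verbatim from above; build with -lm):

/* Compare the precomputed degrade_factor table with the exact factors
 * ((2^idx - 1) / 2^idx)^(2^j), scaled to 128; not kernel code.
 */
#include <stdio.h>
#include <math.h>

#define DEGRADE_SHIFT		7
#define CPU_LOAD_IDX_MAX	5

static const unsigned char
degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
	{0, 0, 0, 0, 0, 0, 0, 0},
	{64, 32, 8, 0, 0, 0, 0, 0},
	{96, 72, 40, 12, 1, 0, 0},
	{112, 98, 75, 43, 15, 1, 0},
	{120, 112, 98, 76, 45, 16, 2} };

int main(void)
{
	int idx, j;

	for (idx = 1; idx < CPU_LOAD_IDX_MAX; idx++) {
		double base = (double)((1 << idx) - 1) / (1 << idx);

		for (j = 0; j <= DEGRADE_SHIFT; j++) {
			double exact = pow(base, 1 << j) * 128.0;

			printf("idx=%d ticks=%3d table=%3u exact=%6.2f\n",
			       idx, 1 << j, degrade_factor[idx][j], exact);
		}
	}
	return 0;
}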
2451 2451
2452 /* 2452 /*
2453 * Update rq->cpu_load[] statistics. This function is usually called every 2453 * Update rq->cpu_load[] statistics. This function is usually called every
2454 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2454 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
2455 * every tick. We fix it up based on jiffies. 2455 * every tick. We fix it up based on jiffies.
2456 */ 2456 */
2457 void update_cpu_load(struct rq *this_rq) 2457 void update_cpu_load(struct rq *this_rq)
2458 { 2458 {
2459 unsigned long this_load = this_rq->load.weight; 2459 unsigned long this_load = this_rq->load.weight;
2460 unsigned long curr_jiffies = jiffies; 2460 unsigned long curr_jiffies = jiffies;
2461 unsigned long pending_updates; 2461 unsigned long pending_updates;
2462 int i, scale; 2462 int i, scale;
2463 2463
2464 this_rq->nr_load_updates++; 2464 this_rq->nr_load_updates++;
2465 2465
2466 /* Avoid repeated calls on same jiffy, when moving in and out of idle */ 2466 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
2467 if (curr_jiffies == this_rq->last_load_update_tick) 2467 if (curr_jiffies == this_rq->last_load_update_tick)
2468 return; 2468 return;
2469 2469
2470 pending_updates = curr_jiffies - this_rq->last_load_update_tick; 2470 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2471 this_rq->last_load_update_tick = curr_jiffies; 2471 this_rq->last_load_update_tick = curr_jiffies;
2472 2472
2473 /* Update our load: */ 2473 /* Update our load: */
2474 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ 2474 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
2475 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 2475 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
2476 unsigned long old_load, new_load; 2476 unsigned long old_load, new_load;
2477 2477
2478 /* scale is effectively 1 << i now, and >> i divides by scale */ 2478 /* scale is effectively 1 << i now, and >> i divides by scale */
2479 2479
2480 old_load = this_rq->cpu_load[i]; 2480 old_load = this_rq->cpu_load[i];
2481 old_load = decay_load_missed(old_load, pending_updates - 1, i); 2481 old_load = decay_load_missed(old_load, pending_updates - 1, i);
2482 new_load = this_load; 2482 new_load = this_load;
2483 /* 2483 /*
2484 * Round up the averaging division if load is increasing. This 2484 * Round up the averaging division if load is increasing. This
2485 * prevents us from getting stuck on 9 if the load is 10, for 2485 * prevents us from getting stuck on 9 if the load is 10, for
2486 * example. 2486 * example.
2487 */ 2487 */
2488 if (new_load > old_load) 2488 if (new_load > old_load)
2489 new_load += scale - 1; 2489 new_load += scale - 1;
2490 2490
2491 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; 2491 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
2492 } 2492 }
2493 2493
2494 sched_avg_update(this_rq); 2494 sched_avg_update(this_rq);
2495 } 2495 }
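
The rounding comment inside the loop above is easiest to see with concrete numbers: with a constant instantaneous load of 10 and scale = 2, a plain truncating average climbs 0, 5, 7, 8, 9 and then stays at 9 forever, while adding scale - 1 before the shift lets it reach 10. A minimal sketch of just that arithmetic (not kernel code):

/* Why the averaging division is rounded up when the load is increasing:
 * the "stuck on 9 if the load is 10" case from update_cpu_load().
 */
#include <stdio.h>

static unsigned long step(unsigned long old, unsigned long cur,
			  int idx, int round_up)
{
	unsigned long scale = 1UL << idx;
	unsigned long new_load = cur;

	if (round_up && new_load > old)
		new_load += scale - 1;	/* round the division up */

	return (old * (scale - 1) + new_load) >> idx;
}

int main(void)
{
	unsigned long truncated = 0, rounded = 0;
	int tick;

	/* Constant instantaneous load of 10, tracked at idx = 1 (scale = 2). */
	for (tick = 0; tick < 20; tick++) {
		truncated = step(truncated, 10, 1, 0);
		rounded = step(rounded, 10, 1, 1);
	}

	/* The truncating average stalls at 9; the rounded one reaches 10. */
	printf("truncated=%lu rounded=%lu\n", truncated, rounded);
	return 0;
}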
2496 2496
2497 static void update_cpu_load_active(struct rq *this_rq) 2497 static void update_cpu_load_active(struct rq *this_rq)
2498 { 2498 {
2499 update_cpu_load(this_rq); 2499 update_cpu_load(this_rq);
2500 2500
2501 calc_load_account_active(this_rq); 2501 calc_load_account_active(this_rq);
2502 } 2502 }
2503 2503
2504 #ifdef CONFIG_SMP 2504 #ifdef CONFIG_SMP
2505 2505
2506 /* 2506 /*
2507 * sched_exec - execve() is a valuable balancing opportunity, because at 2507 * sched_exec - execve() is a valuable balancing opportunity, because at
2508 * this point the task has the smallest effective memory and cache footprint. 2508 * this point the task has the smallest effective memory and cache footprint.
2509 */ 2509 */
2510 void sched_exec(void) 2510 void sched_exec(void)
2511 { 2511 {
2512 struct task_struct *p = current; 2512 struct task_struct *p = current;
2513 unsigned long flags; 2513 unsigned long flags;
2514 int dest_cpu; 2514 int dest_cpu;
2515 2515
2516 raw_spin_lock_irqsave(&p->pi_lock, flags); 2516 raw_spin_lock_irqsave(&p->pi_lock, flags);
2517 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); 2517 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
2518 if (dest_cpu == smp_processor_id()) 2518 if (dest_cpu == smp_processor_id())
2519 goto unlock; 2519 goto unlock;
2520 2520
2521 if (likely(cpu_active(dest_cpu))) { 2521 if (likely(cpu_active(dest_cpu))) {
2522 struct migration_arg arg = { p, dest_cpu }; 2522 struct migration_arg arg = { p, dest_cpu };
2523 2523
2524 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2524 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2525 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); 2525 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
2526 return; 2526 return;
2527 } 2527 }
2528 unlock: 2528 unlock:
2529 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2529 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2530 } 2530 }
2531 2531
2532 #endif 2532 #endif
2533 2533
2534 DEFINE_PER_CPU(struct kernel_stat, kstat); 2534 DEFINE_PER_CPU(struct kernel_stat, kstat);
2535 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); 2535 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
2536 2536
2537 EXPORT_PER_CPU_SYMBOL(kstat); 2537 EXPORT_PER_CPU_SYMBOL(kstat);
2538 EXPORT_PER_CPU_SYMBOL(kernel_cpustat); 2538 EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2539 2539
2540 /* 2540 /*
2541 * Return any ns on the sched_clock that have not yet been accounted in 2541 * Return any ns on the sched_clock that have not yet been accounted in
2542 * @p in case that task is currently running. 2542 * @p in case that task is currently running.
2543 * 2543 *
2544 * Called with task_rq_lock() held on @rq. 2544 * Called with task_rq_lock() held on @rq.
2545 */ 2545 */
2546 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) 2546 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2547 { 2547 {
2548 u64 ns = 0; 2548 u64 ns = 0;
2549 2549
2550 if (task_current(rq, p)) { 2550 if (task_current(rq, p)) {
2551 update_rq_clock(rq); 2551 update_rq_clock(rq);
2552 ns = rq->clock_task - p->se.exec_start; 2552 ns = rq->clock_task - p->se.exec_start;
2553 if ((s64)ns < 0) 2553 if ((s64)ns < 0)
2554 ns = 0; 2554 ns = 0;
2555 } 2555 }
2556 2556
2557 return ns; 2557 return ns;
2558 } 2558 }
2559 2559
2560 unsigned long long task_delta_exec(struct task_struct *p) 2560 unsigned long long task_delta_exec(struct task_struct *p)
2561 { 2561 {
2562 unsigned long flags; 2562 unsigned long flags;
2563 struct rq *rq; 2563 struct rq *rq;
2564 u64 ns = 0; 2564 u64 ns = 0;
2565 2565
2566 rq = task_rq_lock(p, &flags); 2566 rq = task_rq_lock(p, &flags);
2567 ns = do_task_delta_exec(p, rq); 2567 ns = do_task_delta_exec(p, rq);
2568 task_rq_unlock(rq, p, &flags); 2568 task_rq_unlock(rq, p, &flags);
2569 2569
2570 return ns; 2570 return ns;
2571 } 2571 }
2572 2572
2573 /* 2573 /*
2574 * Return accounted runtime for the task. 2574 * Return accounted runtime for the task.
2575 * In case the task is currently running, return the runtime plus current's 2575 * In case the task is currently running, return the runtime plus current's
2576 * pending runtime that have not been accounted yet. 2576 * pending runtime that have not been accounted yet.
2577 */ 2577 */
2578 unsigned long long task_sched_runtime(struct task_struct *p) 2578 unsigned long long task_sched_runtime(struct task_struct *p)
2579 { 2579 {
2580 unsigned long flags; 2580 unsigned long flags;
2581 struct rq *rq; 2581 struct rq *rq;
2582 u64 ns = 0; 2582 u64 ns = 0;
2583 2583
2584 rq = task_rq_lock(p, &flags); 2584 rq = task_rq_lock(p, &flags);
2585 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 2585 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
2586 task_rq_unlock(rq, p, &flags); 2586 task_rq_unlock(rq, p, &flags);
2587 2587
2588 return ns; 2588 return ns;
2589 } 2589 }
2590 2590
2591 #ifdef CONFIG_CGROUP_CPUACCT 2591 #ifdef CONFIG_CGROUP_CPUACCT
2592 struct cgroup_subsys cpuacct_subsys; 2592 struct cgroup_subsys cpuacct_subsys;
2593 struct cpuacct root_cpuacct; 2593 struct cpuacct root_cpuacct;
2594 #endif 2594 #endif
2595 2595
2596 static inline void task_group_account_field(struct task_struct *p, int index, 2596 static inline void task_group_account_field(struct task_struct *p, int index,
2597 u64 tmp) 2597 u64 tmp)
2598 { 2598 {
2599 #ifdef CONFIG_CGROUP_CPUACCT 2599 #ifdef CONFIG_CGROUP_CPUACCT
2600 struct kernel_cpustat *kcpustat; 2600 struct kernel_cpustat *kcpustat;
2601 struct cpuacct *ca; 2601 struct cpuacct *ca;
2602 #endif 2602 #endif
2603 /* 2603 /*
2604 * Since all updates are sure to touch the root cgroup, we 2604 * Since all updates are sure to touch the root cgroup, we
2605 * get ourselves ahead and touch it first. If the root cgroup 2605 * get ourselves ahead and touch it first. If the root cgroup
2606 * is the only cgroup, then nothing else should be necessary. 2606 * is the only cgroup, then nothing else should be necessary.
2607 * 2607 *
2608 */ 2608 */
2609 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; 2609 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
2610 2610
2611 #ifdef CONFIG_CGROUP_CPUACCT 2611 #ifdef CONFIG_CGROUP_CPUACCT
2612 if (unlikely(!cpuacct_subsys.active)) 2612 if (unlikely(!cpuacct_subsys.active))
2613 return; 2613 return;
2614 2614
2615 rcu_read_lock(); 2615 rcu_read_lock();
2616 ca = task_ca(p); 2616 ca = task_ca(p);
2617 while (ca && (ca != &root_cpuacct)) { 2617 while (ca && (ca != &root_cpuacct)) {
2618 kcpustat = this_cpu_ptr(ca->cpustat); 2618 kcpustat = this_cpu_ptr(ca->cpustat);
2619 kcpustat->cpustat[index] += tmp; 2619 kcpustat->cpustat[index] += tmp;
2620 ca = parent_ca(ca); 2620 ca = parent_ca(ca);
2621 } 2621 }
2622 rcu_read_unlock(); 2622 rcu_read_unlock();
2623 #endif 2623 #endif
2624 } 2624 }
2625 2625
2626 2626
2627 /* 2627 /*
2628 * Account user cpu time to a process. 2628 * Account user cpu time to a process.
2629 * @p: the process that the cpu time gets accounted to 2629 * @p: the process that the cpu time gets accounted to
2630 * @cputime: the cpu time spent in user space since the last update 2630 * @cputime: the cpu time spent in user space since the last update
2631 * @cputime_scaled: cputime scaled by cpu frequency 2631 * @cputime_scaled: cputime scaled by cpu frequency
2632 */ 2632 */
2633 void account_user_time(struct task_struct *p, cputime_t cputime, 2633 void account_user_time(struct task_struct *p, cputime_t cputime,
2634 cputime_t cputime_scaled) 2634 cputime_t cputime_scaled)
2635 { 2635 {
2636 int index; 2636 int index;
2637 2637
2638 /* Add user time to process. */ 2638 /* Add user time to process. */
2639 p->utime += cputime; 2639 p->utime += cputime;
2640 p->utimescaled += cputime_scaled; 2640 p->utimescaled += cputime_scaled;
2641 account_group_user_time(p, cputime); 2641 account_group_user_time(p, cputime);
2642 2642
2643 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; 2643 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
2644 2644
2645 /* Add user time to cpustat. */ 2645 /* Add user time to cpustat. */
2646 task_group_account_field(p, index, (__force u64) cputime); 2646 task_group_account_field(p, index, (__force u64) cputime);
2647 2647
2648 /* Account for user time used */ 2648 /* Account for user time used */
2649 acct_update_integrals(p); 2649 acct_update_integrals(p);
2650 } 2650 }
2651 2651
2652 /* 2652 /*
2653 * Account guest cpu time to a process. 2653 * Account guest cpu time to a process.
2654 * @p: the process that the cpu time gets accounted to 2654 * @p: the process that the cpu time gets accounted to
2655 * @cputime: the cpu time spent in virtual machine since the last update 2655 * @cputime: the cpu time spent in virtual machine since the last update
2656 * @cputime_scaled: cputime scaled by cpu frequency 2656 * @cputime_scaled: cputime scaled by cpu frequency
2657 */ 2657 */
2658 static void account_guest_time(struct task_struct *p, cputime_t cputime, 2658 static void account_guest_time(struct task_struct *p, cputime_t cputime,
2659 cputime_t cputime_scaled) 2659 cputime_t cputime_scaled)
2660 { 2660 {
2661 u64 *cpustat = kcpustat_this_cpu->cpustat; 2661 u64 *cpustat = kcpustat_this_cpu->cpustat;
2662 2662
2663 /* Add guest time to process. */ 2663 /* Add guest time to process. */
2664 p->utime += cputime; 2664 p->utime += cputime;
2665 p->utimescaled += cputime_scaled; 2665 p->utimescaled += cputime_scaled;
2666 account_group_user_time(p, cputime); 2666 account_group_user_time(p, cputime);
2667 p->gtime += cputime; 2667 p->gtime += cputime;
2668 2668
2669 /* Add guest time to cpustat. */ 2669 /* Add guest time to cpustat. */
2670 if (TASK_NICE(p) > 0) { 2670 if (TASK_NICE(p) > 0) {
2671 cpustat[CPUTIME_NICE] += (__force u64) cputime; 2671 cpustat[CPUTIME_NICE] += (__force u64) cputime;
2672 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; 2672 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
2673 } else { 2673 } else {
2674 cpustat[CPUTIME_USER] += (__force u64) cputime; 2674 cpustat[CPUTIME_USER] += (__force u64) cputime;
2675 cpustat[CPUTIME_GUEST] += (__force u64) cputime; 2675 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
2676 } 2676 }
2677 } 2677 }
2678 2678
2679 /* 2679 /*
2680 * Account system cpu time to a process and desired cpustat field 2680 * Account system cpu time to a process and desired cpustat field
2681 * @p: the process that the cpu time gets accounted to 2681 * @p: the process that the cpu time gets accounted to
2682 * @cputime: the cpu time spent in kernel space since the last update 2682 * @cputime: the cpu time spent in kernel space since the last update
2683 * @cputime_scaled: cputime scaled by cpu frequency 2683 * @cputime_scaled: cputime scaled by cpu frequency
 2684 * @index: index of the cpustat field that has to be updated 2684 * @index: index of the cpustat field that has to be updated
2685 */ 2685 */
2686 static inline 2686 static inline
2687 void __account_system_time(struct task_struct *p, cputime_t cputime, 2687 void __account_system_time(struct task_struct *p, cputime_t cputime,
2688 cputime_t cputime_scaled, int index) 2688 cputime_t cputime_scaled, int index)
2689 { 2689 {
2690 /* Add system time to process. */ 2690 /* Add system time to process. */
2691 p->stime += cputime; 2691 p->stime += cputime;
2692 p->stimescaled += cputime_scaled; 2692 p->stimescaled += cputime_scaled;
2693 account_group_system_time(p, cputime); 2693 account_group_system_time(p, cputime);
2694 2694
2695 /* Add system time to cpustat. */ 2695 /* Add system time to cpustat. */
2696 task_group_account_field(p, index, (__force u64) cputime); 2696 task_group_account_field(p, index, (__force u64) cputime);
2697 2697
2698 /* Account for system time used */ 2698 /* Account for system time used */
2699 acct_update_integrals(p); 2699 acct_update_integrals(p);
2700 } 2700 }
2701 2701
2702 /* 2702 /*
2703 * Account system cpu time to a process. 2703 * Account system cpu time to a process.
2704 * @p: the process that the cpu time gets accounted to 2704 * @p: the process that the cpu time gets accounted to
2705 * @hardirq_offset: the offset to subtract from hardirq_count() 2705 * @hardirq_offset: the offset to subtract from hardirq_count()
2706 * @cputime: the cpu time spent in kernel space since the last update 2706 * @cputime: the cpu time spent in kernel space since the last update
2707 * @cputime_scaled: cputime scaled by cpu frequency 2707 * @cputime_scaled: cputime scaled by cpu frequency
2708 */ 2708 */
2709 void account_system_time(struct task_struct *p, int hardirq_offset, 2709 void account_system_time(struct task_struct *p, int hardirq_offset,
2710 cputime_t cputime, cputime_t cputime_scaled) 2710 cputime_t cputime, cputime_t cputime_scaled)
2711 { 2711 {
2712 int index; 2712 int index;
2713 2713
2714 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 2714 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
2715 account_guest_time(p, cputime, cputime_scaled); 2715 account_guest_time(p, cputime, cputime_scaled);
2716 return; 2716 return;
2717 } 2717 }
2718 2718
2719 if (hardirq_count() - hardirq_offset) 2719 if (hardirq_count() - hardirq_offset)
2720 index = CPUTIME_IRQ; 2720 index = CPUTIME_IRQ;
2721 else if (in_serving_softirq()) 2721 else if (in_serving_softirq())
2722 index = CPUTIME_SOFTIRQ; 2722 index = CPUTIME_SOFTIRQ;
2723 else 2723 else
2724 index = CPUTIME_SYSTEM; 2724 index = CPUTIME_SYSTEM;
2725 2725
2726 __account_system_time(p, cputime, cputime_scaled, index); 2726 __account_system_time(p, cputime, cputime_scaled, index);
2727 } 2727 }
2728 2728
2729 /* 2729 /*
2730 * Account for involuntary wait time. 2730 * Account for involuntary wait time.
2731 * @cputime: the cpu time spent in involuntary wait 2731 * @cputime: the cpu time spent in involuntary wait
2732 */ 2732 */
2733 void account_steal_time(cputime_t cputime) 2733 void account_steal_time(cputime_t cputime)
2734 { 2734 {
2735 u64 *cpustat = kcpustat_this_cpu->cpustat; 2735 u64 *cpustat = kcpustat_this_cpu->cpustat;
2736 2736
2737 cpustat[CPUTIME_STEAL] += (__force u64) cputime; 2737 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
2738 } 2738 }
2739 2739
2740 /* 2740 /*
2741 * Account for idle time. 2741 * Account for idle time.
2742 * @cputime: the cpu time spent in idle wait 2742 * @cputime: the cpu time spent in idle wait
2743 */ 2743 */
2744 void account_idle_time(cputime_t cputime) 2744 void account_idle_time(cputime_t cputime)
2745 { 2745 {
2746 u64 *cpustat = kcpustat_this_cpu->cpustat; 2746 u64 *cpustat = kcpustat_this_cpu->cpustat;
2747 struct rq *rq = this_rq(); 2747 struct rq *rq = this_rq();
2748 2748
2749 if (atomic_read(&rq->nr_iowait) > 0) 2749 if (atomic_read(&rq->nr_iowait) > 0)
2750 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; 2750 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
2751 else 2751 else
2752 cpustat[CPUTIME_IDLE] += (__force u64) cputime; 2752 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
2753 } 2753 }
2754 2754
2755 static __always_inline bool steal_account_process_tick(void) 2755 static __always_inline bool steal_account_process_tick(void)
2756 { 2756 {
2757 #ifdef CONFIG_PARAVIRT 2757 #ifdef CONFIG_PARAVIRT
2758 if (static_branch(&paravirt_steal_enabled)) { 2758 if (static_branch(&paravirt_steal_enabled)) {
2759 u64 steal, st = 0; 2759 u64 steal, st = 0;
2760 2760
2761 steal = paravirt_steal_clock(smp_processor_id()); 2761 steal = paravirt_steal_clock(smp_processor_id());
2762 steal -= this_rq()->prev_steal_time; 2762 steal -= this_rq()->prev_steal_time;
2763 2763
2764 st = steal_ticks(steal); 2764 st = steal_ticks(steal);
2765 this_rq()->prev_steal_time += st * TICK_NSEC; 2765 this_rq()->prev_steal_time += st * TICK_NSEC;
2766 2766
2767 account_steal_time(st); 2767 account_steal_time(st);
2768 return st; 2768 return st;
2769 } 2769 }
2770 #endif 2770 #endif
2771 return false; 2771 return false;
2772 } 2772 }
2773 2773
2774 #ifndef CONFIG_VIRT_CPU_ACCOUNTING 2774 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
2775 2775
2776 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 2776 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
2777 /* 2777 /*
2778 * Account a tick to a process and cpustat 2778 * Account a tick to a process and cpustat
2779 * @p: the process that the cpu time gets accounted to 2779 * @p: the process that the cpu time gets accounted to
2780 * @user_tick: is the tick from userspace 2780 * @user_tick: is the tick from userspace
2781 * @rq: the pointer to rq 2781 * @rq: the pointer to rq
2782 * 2782 *
2783 * Tick demultiplexing follows the order 2783 * Tick demultiplexing follows the order
2784 * - pending hardirq update 2784 * - pending hardirq update
2785 * - pending softirq update 2785 * - pending softirq update
2786 * - user_time 2786 * - user_time
2787 * - idle_time 2787 * - idle_time
2788 * - system time 2788 * - system time
2789 * - check for guest_time 2789 * - check for guest_time
2790 * - else account as system_time 2790 * - else account as system_time
2791 * 2791 *
 2792 * The check for hardirq is done for both system and user time, as there is 2792 * The check for hardirq is done for both system and user time, as there is
 2793 * no timer going off while we are on hardirq and hence we may never get an 2793 * no timer going off while we are on hardirq and hence we may never get an
 2794 * opportunity to update it solely in system time. 2794 * opportunity to update it solely in system time.
 2795 * p->stime and friends are only updated on system time and not on irq or 2795 * p->stime and friends are only updated on system time and not on irq or
 2796 * softirq time, as those no longer count in the task's exec_runtime. 2796 * softirq time, as those no longer count in the task's exec_runtime.
2797 */ 2797 */
2798 static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 2798 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
2799 struct rq *rq) 2799 struct rq *rq)
2800 { 2800 {
2801 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 2801 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
2802 u64 *cpustat = kcpustat_this_cpu->cpustat; 2802 u64 *cpustat = kcpustat_this_cpu->cpustat;
2803 2803
2804 if (steal_account_process_tick()) 2804 if (steal_account_process_tick())
2805 return; 2805 return;
2806 2806
2807 if (irqtime_account_hi_update()) { 2807 if (irqtime_account_hi_update()) {
2808 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; 2808 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
2809 } else if (irqtime_account_si_update()) { 2809 } else if (irqtime_account_si_update()) {
2810 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; 2810 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
2811 } else if (this_cpu_ksoftirqd() == p) { 2811 } else if (this_cpu_ksoftirqd() == p) {
2812 /* 2812 /*
 2813 * ksoftirqd time does not get accounted in cpu_softirq_time. 2813 * ksoftirqd time does not get accounted in cpu_softirq_time.
2814 * So, we have to handle it separately here. 2814 * So, we have to handle it separately here.
2815 * Also, p->stime needs to be updated for ksoftirqd. 2815 * Also, p->stime needs to be updated for ksoftirqd.
2816 */ 2816 */
2817 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 2817 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
2818 CPUTIME_SOFTIRQ); 2818 CPUTIME_SOFTIRQ);
2819 } else if (user_tick) { 2819 } else if (user_tick) {
2820 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 2820 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
2821 } else if (p == rq->idle) { 2821 } else if (p == rq->idle) {
2822 account_idle_time(cputime_one_jiffy); 2822 account_idle_time(cputime_one_jiffy);
2823 } else if (p->flags & PF_VCPU) { /* System time or guest time */ 2823 } else if (p->flags & PF_VCPU) { /* System time or guest time */
2824 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); 2824 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
2825 } else { 2825 } else {
2826 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 2826 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
2827 CPUTIME_SYSTEM); 2827 CPUTIME_SYSTEM);
2828 } 2828 }
2829 } 2829 }
2830 2830
2831 static void irqtime_account_idle_ticks(int ticks) 2831 static void irqtime_account_idle_ticks(int ticks)
2832 { 2832 {
2833 int i; 2833 int i;
2834 struct rq *rq = this_rq(); 2834 struct rq *rq = this_rq();
2835 2835
2836 for (i = 0; i < ticks; i++) 2836 for (i = 0; i < ticks; i++)
2837 irqtime_account_process_tick(current, 0, rq); 2837 irqtime_account_process_tick(current, 0, rq);
2838 } 2838 }
2839 #else /* CONFIG_IRQ_TIME_ACCOUNTING */ 2839 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
2840 static void irqtime_account_idle_ticks(int ticks) {} 2840 static void irqtime_account_idle_ticks(int ticks) {}
2841 static void irqtime_account_process_tick(struct task_struct *p, int user_tick, 2841 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
2842 struct rq *rq) {} 2842 struct rq *rq) {}
2843 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 2843 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2844 2844
2845 /* 2845 /*
2846 * Account a single tick of cpu time. 2846 * Account a single tick of cpu time.
2847 * @p: the process that the cpu time gets accounted to 2847 * @p: the process that the cpu time gets accounted to
2848 * @user_tick: indicates if the tick is a user or a system tick 2848 * @user_tick: indicates if the tick is a user or a system tick
2849 */ 2849 */
2850 void account_process_tick(struct task_struct *p, int user_tick) 2850 void account_process_tick(struct task_struct *p, int user_tick)
2851 { 2851 {
2852 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 2852 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
2853 struct rq *rq = this_rq(); 2853 struct rq *rq = this_rq();
2854 2854
2855 if (sched_clock_irqtime) { 2855 if (sched_clock_irqtime) {
2856 irqtime_account_process_tick(p, user_tick, rq); 2856 irqtime_account_process_tick(p, user_tick, rq);
2857 return; 2857 return;
2858 } 2858 }
2859 2859
2860 if (steal_account_process_tick()) 2860 if (steal_account_process_tick())
2861 return; 2861 return;
2862 2862
2863 if (user_tick) 2863 if (user_tick)
2864 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 2864 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
2865 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 2865 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
2866 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, 2866 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
2867 one_jiffy_scaled); 2867 one_jiffy_scaled);
2868 else 2868 else
2869 account_idle_time(cputime_one_jiffy); 2869 account_idle_time(cputime_one_jiffy);
2870 } 2870 }
2871 2871
2872 /* 2872 /*
2873 * Account multiple ticks of steal time. 2873 * Account multiple ticks of steal time.
2875 * @ticks: number of stolen ticks 2875 * @ticks: number of stolen ticks
2876 */ 2876 */
2877 void account_steal_ticks(unsigned long ticks) 2877 void account_steal_ticks(unsigned long ticks)
2878 { 2878 {
2879 account_steal_time(jiffies_to_cputime(ticks)); 2879 account_steal_time(jiffies_to_cputime(ticks));
2880 } 2880 }
2881 2881
2882 /* 2882 /*
2883 * Account multiple ticks of idle time. 2883 * Account multiple ticks of idle time.
 2884 * @ticks: number of idle ticks 2884 * @ticks: number of idle ticks
2885 */ 2885 */
2886 void account_idle_ticks(unsigned long ticks) 2886 void account_idle_ticks(unsigned long ticks)
2887 { 2887 {
2888 2888
2889 if (sched_clock_irqtime) { 2889 if (sched_clock_irqtime) {
2890 irqtime_account_idle_ticks(ticks); 2890 irqtime_account_idle_ticks(ticks);
2891 return; 2891 return;
2892 } 2892 }
2893 2893
2894 account_idle_time(jiffies_to_cputime(ticks)); 2894 account_idle_time(jiffies_to_cputime(ticks));
2895 } 2895 }
2896 2896
2897 #endif 2897 #endif
2898 2898
2899 /* 2899 /*
2900 * Use precise platform statistics if available: 2900 * Use precise platform statistics if available:
2901 */ 2901 */
2902 #ifdef CONFIG_VIRT_CPU_ACCOUNTING 2902 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
2903 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 2903 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
2904 { 2904 {
2905 *ut = p->utime; 2905 *ut = p->utime;
2906 *st = p->stime; 2906 *st = p->stime;
2907 } 2907 }
2908 2908
2909 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 2909 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
2910 { 2910 {
2911 struct task_cputime cputime; 2911 struct task_cputime cputime;
2912 2912
2913 thread_group_cputime(p, &cputime); 2913 thread_group_cputime(p, &cputime);
2914 2914
2915 *ut = cputime.utime; 2915 *ut = cputime.utime;
2916 *st = cputime.stime; 2916 *st = cputime.stime;
2917 } 2917 }
2918 #else 2918 #else
2919 2919
2920 #ifndef nsecs_to_cputime 2920 #ifndef nsecs_to_cputime
2921 # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) 2921 # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
2922 #endif 2922 #endif
2923 2923
2924 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 2924 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
2925 { 2925 {
2926 cputime_t rtime, utime = p->utime, total = utime + p->stime; 2926 cputime_t rtime, utime = p->utime, total = utime + p->stime;
2927 2927
2928 /* 2928 /*
2929 * Use CFS's precise accounting: 2929 * Use CFS's precise accounting:
2930 */ 2930 */
2931 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 2931 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
2932 2932
2933 if (total) { 2933 if (total) {
2934 u64 temp = (__force u64) rtime; 2934 u64 temp = (__force u64) rtime;
2935 2935
2936 temp *= (__force u64) utime; 2936 temp *= (__force u64) utime;
2937 do_div(temp, (__force u32) total); 2937 do_div(temp, (__force u32) total);
2938 utime = (__force cputime_t) temp; 2938 utime = (__force cputime_t) temp;
2939 } else 2939 } else
2940 utime = rtime; 2940 utime = rtime;
2941 2941
2942 /* 2942 /*
2943 * Compare with previous values, to keep monotonicity: 2943 * Compare with previous values, to keep monotonicity:
2944 */ 2944 */
2945 p->prev_utime = max(p->prev_utime, utime); 2945 p->prev_utime = max(p->prev_utime, utime);
2946 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); 2946 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
2947 2947
2948 *ut = p->prev_utime; 2948 *ut = p->prev_utime;
2949 *st = p->prev_stime; 2949 *st = p->prev_stime;
2950 } 2950 }
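The arithmetic above splits the precise CFS runtime (rtime) between user and system in the same proportion as the sampled utime/stime, and prev_utime/prev_stime only ever grow, so the reported values stay monotonic. A small user-space style sketch of the split, for illustration only:

#include <stdint.h>

/* Hedged sketch: scale rtime by the sampled utime:total ratio. */
static uint64_t example_scale_utime(uint64_t rtime, uint64_t utime,
				    uint64_t total)
{
	return total ? (rtime * utime) / total : rtime;
}

/* e.g. sampled utime=30, stime=10 (total=40), precise rtime=48:
 * example_scale_utime(48, 30, 40) == 36, and stime becomes 48 - 36 = 12. */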
2951 2951
2952 /* 2952 /*
2953 * Must be called with siglock held. 2953 * Must be called with siglock held.
2954 */ 2954 */
2955 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 2955 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
2956 { 2956 {
2957 struct signal_struct *sig = p->signal; 2957 struct signal_struct *sig = p->signal;
2958 struct task_cputime cputime; 2958 struct task_cputime cputime;
2959 cputime_t rtime, utime, total; 2959 cputime_t rtime, utime, total;
2960 2960
2961 thread_group_cputime(p, &cputime); 2961 thread_group_cputime(p, &cputime);
2962 2962
2963 total = cputime.utime + cputime.stime; 2963 total = cputime.utime + cputime.stime;
2964 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 2964 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
2965 2965
2966 if (total) { 2966 if (total) {
2967 u64 temp = (__force u64) rtime; 2967 u64 temp = (__force u64) rtime;
2968 2968
2969 temp *= (__force u64) cputime.utime; 2969 temp *= (__force u64) cputime.utime;
2970 do_div(temp, (__force u32) total); 2970 do_div(temp, (__force u32) total);
2971 utime = (__force cputime_t) temp; 2971 utime = (__force cputime_t) temp;
2972 } else 2972 } else
2973 utime = rtime; 2973 utime = rtime;
2974 2974
2975 sig->prev_utime = max(sig->prev_utime, utime); 2975 sig->prev_utime = max(sig->prev_utime, utime);
2976 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); 2976 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
2977 2977
2978 *ut = sig->prev_utime; 2978 *ut = sig->prev_utime;
2979 *st = sig->prev_stime; 2979 *st = sig->prev_stime;
2980 } 2980 }
2981 #endif 2981 #endif
2982 2982
2983 /* 2983 /*
2984 * This function gets called by the timer code, with HZ frequency. 2984 * This function gets called by the timer code, with HZ frequency.
2985 * We call it with interrupts disabled. 2985 * We call it with interrupts disabled.
2986 */ 2986 */
2987 void scheduler_tick(void) 2987 void scheduler_tick(void)
2988 { 2988 {
2989 int cpu = smp_processor_id(); 2989 int cpu = smp_processor_id();
2990 struct rq *rq = cpu_rq(cpu); 2990 struct rq *rq = cpu_rq(cpu);
2991 struct task_struct *curr = rq->curr; 2991 struct task_struct *curr = rq->curr;
2992 2992
2993 sched_clock_tick(); 2993 sched_clock_tick();
2994 2994
2995 raw_spin_lock(&rq->lock); 2995 raw_spin_lock(&rq->lock);
2996 update_rq_clock(rq); 2996 update_rq_clock(rq);
2997 update_cpu_load_active(rq); 2997 update_cpu_load_active(rq);
2998 curr->sched_class->task_tick(rq, curr, 0); 2998 curr->sched_class->task_tick(rq, curr, 0);
2999 raw_spin_unlock(&rq->lock); 2999 raw_spin_unlock(&rq->lock);
3000 3000
3001 perf_event_task_tick(); 3001 perf_event_task_tick();
3002 3002
3003 #ifdef CONFIG_SMP 3003 #ifdef CONFIG_SMP
3004 rq->idle_balance = idle_cpu(cpu); 3004 rq->idle_balance = idle_cpu(cpu);
3005 trigger_load_balance(rq, cpu); 3005 trigger_load_balance(rq, cpu);
3006 #endif 3006 #endif
3007 } 3007 }
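For context, scheduler_tick() runs once per jiffy per CPU from the timer interrupt, with interrupts already disabled, and the per-class work is delegated through the task_tick() hook. A condensed, hedged view of the call chain in this era's kernels:

/*
 * Hedged call-chain sketch:
 *
 *   timer interrupt
 *     -> update_process_times(user_mode(regs))
 *          -> account_process_tick()
 *          -> scheduler_tick()
 *               -> curr->sched_class->task_tick(rq, curr, 0)
 */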
3008 3008
3009 notrace unsigned long get_parent_ip(unsigned long addr) 3009 notrace unsigned long get_parent_ip(unsigned long addr)
3010 { 3010 {
3011 if (in_lock_functions(addr)) { 3011 if (in_lock_functions(addr)) {
3012 addr = CALLER_ADDR2; 3012 addr = CALLER_ADDR2;
3013 if (in_lock_functions(addr)) 3013 if (in_lock_functions(addr))
3014 addr = CALLER_ADDR3; 3014 addr = CALLER_ADDR3;
3015 } 3015 }
3016 return addr; 3016 return addr;
3017 } 3017 }
3018 3018
3019 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 3019 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3020 defined(CONFIG_PREEMPT_TRACER)) 3020 defined(CONFIG_PREEMPT_TRACER))
3021 3021
3022 void __kprobes add_preempt_count(int val) 3022 void __kprobes add_preempt_count(int val)
3023 { 3023 {
3024 #ifdef CONFIG_DEBUG_PREEMPT 3024 #ifdef CONFIG_DEBUG_PREEMPT
3025 /* 3025 /*
3026 * Underflow? 3026 * Underflow?
3027 */ 3027 */
3028 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 3028 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3029 return; 3029 return;
3030 #endif 3030 #endif
3031 preempt_count() += val; 3031 preempt_count() += val;
3032 #ifdef CONFIG_DEBUG_PREEMPT 3032 #ifdef CONFIG_DEBUG_PREEMPT
3033 /* 3033 /*
3034 * Spinlock count overflowing soon? 3034 * Spinlock count overflowing soon?
3035 */ 3035 */
3036 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 3036 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3037 PREEMPT_MASK - 10); 3037 PREEMPT_MASK - 10);
3038 #endif 3038 #endif
3039 if (preempt_count() == val) 3039 if (preempt_count() == val)
3040 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 3040 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3041 } 3041 }
3042 EXPORT_SYMBOL(add_preempt_count); 3042 EXPORT_SYMBOL(add_preempt_count);
3043 3043
3044 void __kprobes sub_preempt_count(int val) 3044 void __kprobes sub_preempt_count(int val)
3045 { 3045 {
3046 #ifdef CONFIG_DEBUG_PREEMPT 3046 #ifdef CONFIG_DEBUG_PREEMPT
3047 /* 3047 /*
3048 * Underflow? 3048 * Underflow?
3049 */ 3049 */
3050 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 3050 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3051 return; 3051 return;
3052 /* 3052 /*
3053 * Is the spinlock portion underflowing? 3053 * Is the spinlock portion underflowing?
3054 */ 3054 */
3055 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 3055 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3056 !(preempt_count() & PREEMPT_MASK))) 3056 !(preempt_count() & PREEMPT_MASK)))
3057 return; 3057 return;
3058 #endif 3058 #endif
3059 3059
3060 if (preempt_count() == val) 3060 if (preempt_count() == val)
3061 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 3061 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3062 preempt_count() -= val; 3062 preempt_count() -= val;
3063 } 3063 }
3064 EXPORT_SYMBOL(sub_preempt_count); 3064 EXPORT_SYMBOL(sub_preempt_count);
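These two helpers are what the generic preempt_disable()/preempt_enable() macros end up calling when CONFIG_DEBUG_PREEMPT or the preempt-off tracer is built in; that is where the underflow checks and the trace_preempt_off()/trace_preempt_on() hooks above come from. A hedged sketch of the mapping (see include/linux/preempt.h for the exact macros):

static void example_no_migrate_section(void)
{
	preempt_disable();	/* -> add_preempt_count(1), then barrier() */

	/* per-CPU work that must not be preempted or migrated */

	preempt_enable();	/* barrier(), sub_preempt_count(1),
				 * then a need_resched() check            */
}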
3065 3065
3066 #endif 3066 #endif
3067 3067
3068 /* 3068 /*
3069 * Print scheduling while atomic bug: 3069 * Print scheduling while atomic bug:
3070 */ 3070 */
3071 static noinline void __schedule_bug(struct task_struct *prev) 3071 static noinline void __schedule_bug(struct task_struct *prev)
3072 { 3072 {
3073 struct pt_regs *regs = get_irq_regs(); 3073 struct pt_regs *regs = get_irq_regs();
3074 3074
3075 if (oops_in_progress) 3075 if (oops_in_progress)
3076 return; 3076 return;
3077 3077
3078 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 3078 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3079 prev->comm, prev->pid, preempt_count()); 3079 prev->comm, prev->pid, preempt_count());
3080 3080
3081 debug_show_held_locks(prev); 3081 debug_show_held_locks(prev);
3082 print_modules(); 3082 print_modules();
3083 if (irqs_disabled()) 3083 if (irqs_disabled())
3084 print_irqtrace_events(prev); 3084 print_irqtrace_events(prev);
3085 3085
3086 if (regs) 3086 if (regs)
3087 show_regs(regs); 3087 show_regs(regs);
3088 else 3088 else
3089 dump_stack(); 3089 dump_stack();
3090 } 3090 }
3091 3091
3092 /* 3092 /*
3093 * Various schedule()-time debugging checks and statistics: 3093 * Various schedule()-time debugging checks and statistics:
3094 */ 3094 */
3095 static inline void schedule_debug(struct task_struct *prev) 3095 static inline void schedule_debug(struct task_struct *prev)
3096 { 3096 {
3097 /* 3097 /*
3098 * Test if we are atomic. Since do_exit() needs to call into 3098 * Test if we are atomic. Since do_exit() needs to call into
3099 * schedule() atomically, we ignore that path for now. 3099 * schedule() atomically, we ignore that path for now.
3100 * Otherwise, whine if we are scheduling when we should not be. 3100 * Otherwise, whine if we are scheduling when we should not be.
3101 */ 3101 */
3102 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 3102 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
3103 __schedule_bug(prev); 3103 __schedule_bug(prev);
3104 rcu_sleep_check(); 3104 rcu_sleep_check();
3105 3105
3106 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3106 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3107 3107
3108 schedstat_inc(this_rq(), sched_count); 3108 schedstat_inc(this_rq(), sched_count);
3109 } 3109 }
3110 3110
3111 static void put_prev_task(struct rq *rq, struct task_struct *prev) 3111 static void put_prev_task(struct rq *rq, struct task_struct *prev)
3112 { 3112 {
3113 if (prev->on_rq || rq->skip_clock_update < 0) 3113 if (prev->on_rq || rq->skip_clock_update < 0)
3114 update_rq_clock(rq); 3114 update_rq_clock(rq);
3115 prev->sched_class->put_prev_task(rq, prev); 3115 prev->sched_class->put_prev_task(rq, prev);
3116 } 3116 }
3117 3117
3118 /* 3118 /*
3119 * Pick up the highest-prio task: 3119 * Pick up the highest-prio task:
3120 */ 3120 */
3121 static inline struct task_struct * 3121 static inline struct task_struct *
3122 pick_next_task(struct rq *rq) 3122 pick_next_task(struct rq *rq)
3123 { 3123 {
3124 const struct sched_class *class; 3124 const struct sched_class *class;
3125 struct task_struct *p; 3125 struct task_struct *p;
3126 3126
3127 /* 3127 /*
3128 * Optimization: we know that if all tasks are in 3128 * Optimization: we know that if all tasks are in
3129 * the fair class we can call that function directly: 3129 * the fair class we can call that function directly:
3130 */ 3130 */
3131 if (likely(rq->nr_running == rq->cfs.h_nr_running)) { 3131 if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
3132 p = fair_sched_class.pick_next_task(rq); 3132 p = fair_sched_class.pick_next_task(rq);
3133 if (likely(p)) 3133 if (likely(p))
3134 return p; 3134 return p;
3135 } 3135 }
3136 3136
3137 for_each_class(class) { 3137 for_each_class(class) {
3138 p = class->pick_next_task(rq); 3138 p = class->pick_next_task(rq);
3139 if (p) 3139 if (p)
3140 return p; 3140 return p;
3141 } 3141 }
3142 3142
3143 BUG(); /* the idle class will always have a runnable task */ 3143 BUG(); /* the idle class will always have a runnable task */
3144 } 3144 }
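The loop above walks the scheduling classes in fixed priority order, which is why the idle class makes the BUG() unreachable; the CFS-only fast path simply skips the walk when every runnable task belongs to the fair class. A hedged reminder of the class order and the iterator as defined in this era's kernel/sched.c:

/*
 * Hedged sketch -- highest-priority class first, idle class last:
 *
 *   stop_sched_class -> rt_sched_class -> fair_sched_class -> idle_sched_class
 *
 * #define for_each_class(class) \
 *	for (class = sched_class_highest; class; class = class->next)
 */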
3145 3145
3146 /* 3146 /*
3147 * __schedule() is the main scheduler function. 3147 * __schedule() is the main scheduler function.
3148 */ 3148 */
3149 static void __sched __schedule(void) 3149 static void __sched __schedule(void)
3150 { 3150 {
3151 struct task_struct *prev, *next; 3151 struct task_struct *prev, *next;
3152 unsigned long *switch_count; 3152 unsigned long *switch_count;
3153 struct rq *rq; 3153 struct rq *rq;
3154 int cpu; 3154 int cpu;
3155 3155
3156 need_resched: 3156 need_resched:
3157 preempt_disable(); 3157 preempt_disable();
3158 cpu = smp_processor_id(); 3158 cpu = smp_processor_id();
3159 rq = cpu_rq(cpu); 3159 rq = cpu_rq(cpu);
3160 rcu_note_context_switch(cpu); 3160 rcu_note_context_switch(cpu);
3161 prev = rq->curr; 3161 prev = rq->curr;
3162 3162
3163 schedule_debug(prev); 3163 schedule_debug(prev);
3164 3164
3165 if (sched_feat(HRTICK)) 3165 if (sched_feat(HRTICK))
3166 hrtick_clear(rq); 3166 hrtick_clear(rq);
3167 3167
3168 raw_spin_lock_irq(&rq->lock); 3168 raw_spin_lock_irq(&rq->lock);
3169 3169
3170 switch_count = &prev->nivcsw; 3170 switch_count = &prev->nivcsw;
3171 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3171 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3172 if (unlikely(signal_pending_state(prev->state, prev))) { 3172 if (unlikely(signal_pending_state(prev->state, prev))) {
3173 prev->state = TASK_RUNNING; 3173 prev->state = TASK_RUNNING;
3174 } else { 3174 } else {
3175 deactivate_task(rq, prev, DEQUEUE_SLEEP); 3175 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3176 prev->on_rq = 0; 3176 prev->on_rq = 0;
3177 3177
3178 /* 3178 /*
3179 * If a worker went to sleep, notify and ask workqueue 3179 * If a worker went to sleep, notify and ask workqueue
3180 * whether it wants to wake up a task to maintain 3180 * whether it wants to wake up a task to maintain
3181 * concurrency. 3181 * concurrency.
3182 */ 3182 */
3183 if (prev->flags & PF_WQ_WORKER) { 3183 if (prev->flags & PF_WQ_WORKER) {
3184 struct task_struct *to_wakeup; 3184 struct task_struct *to_wakeup;
3185 3185
3186 to_wakeup = wq_worker_sleeping(prev, cpu); 3186 to_wakeup = wq_worker_sleeping(prev, cpu);
3187 if (to_wakeup) 3187 if (to_wakeup)
3188 try_to_wake_up_local(to_wakeup); 3188 try_to_wake_up_local(to_wakeup);
3189 } 3189 }
3190 } 3190 }
3191 switch_count = &prev->nvcsw; 3191 switch_count = &prev->nvcsw;
3192 } 3192 }
3193 3193
3194 pre_schedule(rq, prev); 3194 pre_schedule(rq, prev);
3195 3195
3196 if (unlikely(!rq->nr_running)) 3196 if (unlikely(!rq->nr_running))
3197 idle_balance(cpu, rq); 3197 idle_balance(cpu, rq);
3198 3198
3199 put_prev_task(rq, prev); 3199 put_prev_task(rq, prev);
3200 next = pick_next_task(rq); 3200 next = pick_next_task(rq);
3201 clear_tsk_need_resched(prev); 3201 clear_tsk_need_resched(prev);
3202 rq->skip_clock_update = 0; 3202 rq->skip_clock_update = 0;
3203 3203
3204 if (likely(prev != next)) { 3204 if (likely(prev != next)) {
3205 rq->nr_switches++; 3205 rq->nr_switches++;
3206 rq->curr = next; 3206 rq->curr = next;
3207 ++*switch_count; 3207 ++*switch_count;
3208 3208
3209 context_switch(rq, prev, next); /* unlocks the rq */ 3209 context_switch(rq, prev, next); /* unlocks the rq */
3210 /* 3210 /*
3211 * The context switch has flipped the stack from under us 3211 * The context switch has flipped the stack from under us
3212 * and restored the local variables which were saved when 3212 * and restored the local variables which were saved when
3213 * this task called schedule() in the past. prev == current 3213 * this task called schedule() in the past. prev == current
3214 * is still correct, but it can be moved to another cpu/rq. 3214 * is still correct, but it can be moved to another cpu/rq.
3215 */ 3215 */
3216 cpu = smp_processor_id(); 3216 cpu = smp_processor_id();
3217 rq = cpu_rq(cpu); 3217 rq = cpu_rq(cpu);
3218 } else 3218 } else
3219 raw_spin_unlock_irq(&rq->lock); 3219 raw_spin_unlock_irq(&rq->lock);
3220 3220
3221 post_schedule(rq); 3221 post_schedule(rq);
3222 3222
3223 preempt_enable_no_resched(); 3223 sched_preempt_enable_no_resched();
3224 if (need_resched()) 3224 if (need_resched())
3225 goto need_resched; 3225 goto need_resched;
3226 } 3226 }
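The preempt_enable_no_resched() to sched_preempt_enable_no_resched() switch above is the point of this patch inside __schedule(): the scheduler deliberately re-enables preemption without an immediate resched check, because the need_resched() loop right below handles it, so the site is renamed to the scheduler-specific variant. On mainline the two stay equivalent; a hedged sketch of the assumed arrangement in include/linux/preempt.h after this series:

/*
 * Hedged sketch (assumed form): the old open-coded body moves under the
 * scheduler-specific name and the generic name becomes an alias, so
 * mainline behaviour is unchanged while -rt is free to diverge.
 */
#define sched_preempt_enable_no_resched() \
do { \
	barrier(); \
	dec_preempt_count(); \
} while (0)

#define preempt_enable_no_resched()	sched_preempt_enable_no_resched()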
3227 3227
3228 static inline void sched_submit_work(struct task_struct *tsk) 3228 static inline void sched_submit_work(struct task_struct *tsk)
3229 { 3229 {
3230 if (!tsk->state) 3230 if (!tsk->state)
3231 return; 3231 return;
3232 /* 3232 /*
3233 * If we are going to sleep and we have plugged IO queued, 3233 * If we are going to sleep and we have plugged IO queued,
3234 * make sure to submit it to avoid deadlocks. 3234 * make sure to submit it to avoid deadlocks.
3235 */ 3235 */
3236 if (blk_needs_flush_plug(tsk)) 3236 if (blk_needs_flush_plug(tsk))
3237 blk_schedule_flush_plug(tsk); 3237 blk_schedule_flush_plug(tsk);
3238 } 3238 }
3239 3239
3240 asmlinkage void __sched schedule(void) 3240 asmlinkage void __sched schedule(void)
3241 { 3241 {
3242 struct task_struct *tsk = current; 3242 struct task_struct *tsk = current;
3243 3243
3244 sched_submit_work(tsk); 3244 sched_submit_work(tsk);
3245 __schedule(); 3245 __schedule();
3246 } 3246 }
3247 EXPORT_SYMBOL(schedule); 3247 EXPORT_SYMBOL(schedule);
3248 3248
3249 /** 3249 /**
3250 * schedule_preempt_disabled - called with preemption disabled 3250 * schedule_preempt_disabled - called with preemption disabled
3251 * 3251 *
3252 * Returns with preemption disabled. Note: preempt_count must be 1 3252 * Returns with preemption disabled. Note: preempt_count must be 1
3253 */ 3253 */
3254 void __sched schedule_preempt_disabled(void) 3254 void __sched schedule_preempt_disabled(void)
3255 { 3255 {
3256 preempt_enable_no_resched(); 3256 sched_preempt_enable_no_resched();
3257 schedule(); 3257 schedule();
3258 preempt_disable(); 3258 preempt_disable();
3259 } 3259 }
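Callers that run with preemption permanently disabled, such as the architecture idle loops converted elsewhere in this series, use this helper instead of open-coding the enable/schedule/disable sequence. A hedged usage sketch:

static void example_idle_loop(void)
{
	while (1) {
		while (!need_resched())
			cpu_relax();
		schedule_preempt_disabled();	/* enable, schedule(), disable */
	}
}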
3260 3260
3261 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER 3261 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
3262 3262
3263 static inline bool owner_running(struct mutex *lock, struct task_struct *owner) 3263 static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
3264 { 3264 {
3265 if (lock->owner != owner) 3265 if (lock->owner != owner)
3266 return false; 3266 return false;
3267 3267
3268 /* 3268 /*
3269 * Ensure we emit the owner->on_cpu dereference _after_ checking 3269 * Ensure we emit the owner->on_cpu dereference _after_ checking
3270 * that lock->owner still matches owner. If that fails, owner might 3270 * that lock->owner still matches owner. If that fails, owner might
3271 * point to free()d memory; if it still matches, the rcu_read_lock() 3271 * point to free()d memory; if it still matches, the rcu_read_lock()
3272 * ensures the memory stays valid. 3272 * ensures the memory stays valid.
3273 */ 3273 */
3274 barrier(); 3274 barrier();
3275 3275
3276 return owner->on_cpu; 3276 return owner->on_cpu;
3277 } 3277 }
3278 3278
3279 /* 3279 /*
3280 * Look out! "owner" is an entirely speculative pointer 3280 * Look out! "owner" is an entirely speculative pointer
3281 * access and not reliable. 3281 * access and not reliable.
3282 */ 3282 */
3283 int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) 3283 int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
3284 { 3284 {
3285 if (!sched_feat(OWNER_SPIN)) 3285 if (!sched_feat(OWNER_SPIN))
3286 return 0; 3286 return 0;
3287 3287
3288 rcu_read_lock(); 3288 rcu_read_lock();
3289 while (owner_running(lock, owner)) { 3289 while (owner_running(lock, owner)) {
3290 if (need_resched()) 3290 if (need_resched())
3291 break; 3291 break;
3292 3292
3293 arch_mutex_cpu_relax(); 3293 arch_mutex_cpu_relax();
3294 } 3294 }
3295 rcu_read_unlock(); 3295 rcu_read_unlock();
3296 3296
3297 /* 3297 /*
3298 * We break out the loop above on need_resched() and when the 3298 * We break out the loop above on need_resched() and when the
3299 * owner changed, which is a sign of heavy contention. Return 3299 * owner changed, which is a sign of heavy contention. Return
3300 * success only when lock->owner is NULL. 3300 * success only when lock->owner is NULL.
3301 */ 3301 */
3302 return lock->owner == NULL; 3302 return lock->owner == NULL;
3303 } 3303 }
3304 #endif 3304 #endif
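mutex_spin_on_owner() is the building block of adaptive mutex spinning: __mutex_lock_common() in kernel/mutex.c keeps spinning only while the current owner is on a CPU and falls back to sleeping otherwise. A condensed, hedged sketch of that caller loop:

static int example_optimistic_spin(struct mutex *lock)
{
	for (;;) {
		struct task_struct *owner;

		owner = ACCESS_ONCE(lock->owner);
		if (owner && !mutex_spin_on_owner(lock, owner))
			break;			/* owner scheduled out: go to sleep */

		if (atomic_cmpxchg(&lock->count, 1, 0) == 1)
			return 1;		/* acquired while spinning */

		if (!owner && need_resched())
			break;			/* don't spin with a pending resched */

		arch_mutex_cpu_relax();
	}
	return 0;				/* fall back to the blocking slow path */
}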
3305 3305
3306 #ifdef CONFIG_PREEMPT 3306 #ifdef CONFIG_PREEMPT
3307 /* 3307 /*
3308 * this is the entry point to schedule() from in-kernel preemption 3308 * this is the entry point to schedule() from in-kernel preemption
3309 * off of preempt_enable. Kernel preemptions off return from interrupt 3309 * off of preempt_enable. Kernel preemptions off return from interrupt
3310 * occur there and call schedule directly. 3310 * occur there and call schedule directly.
3311 */ 3311 */
3312 asmlinkage void __sched notrace preempt_schedule(void) 3312 asmlinkage void __sched notrace preempt_schedule(void)
3313 { 3313 {
3314 struct thread_info *ti = current_thread_info(); 3314 struct thread_info *ti = current_thread_info();
3315 3315
3316 /* 3316 /*
3317 * If there is a non-zero preempt_count or interrupts are disabled, 3317 * If there is a non-zero preempt_count or interrupts are disabled,
3318 * we do not want to preempt the current task. Just return.. 3318 * we do not want to preempt the current task. Just return..
3319 */ 3319 */
3320 if (likely(ti->preempt_count || irqs_disabled())) 3320 if (likely(ti->preempt_count || irqs_disabled()))
3321 return; 3321 return;
3322 3322
3323 do { 3323 do {
3324 add_preempt_count_notrace(PREEMPT_ACTIVE); 3324 add_preempt_count_notrace(PREEMPT_ACTIVE);
3325 __schedule(); 3325 __schedule();
3326 sub_preempt_count_notrace(PREEMPT_ACTIVE); 3326 sub_preempt_count_notrace(PREEMPT_ACTIVE);
3327 3327
3328 /* 3328 /*
3329 * Check again in case we missed a preemption opportunity 3329 * Check again in case we missed a preemption opportunity
3330 * between schedule and now. 3330 * between schedule and now.
3331 */ 3331 */
3332 barrier(); 3332 barrier();
3333 } while (need_resched()); 3333 } while (need_resched());
3334 } 3334 }
3335 EXPORT_SYMBOL(preempt_schedule); 3335 EXPORT_SYMBOL(preempt_schedule);
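This is the path taken when preempt_enable() drops the count to zero with TIF_NEED_RESCHED set: preempt_check_resched() calls into preempt_schedule(), which is why the function bails out above on a non-zero preempt_count or disabled interrupts. A hedged, condensed reminder from include/linux/preempt.h of this era:

#define preempt_check_resched() \
do { \
	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
		preempt_schedule(); \
} while (0)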
3336 3336
3337 /* 3337 /*
3338 * this is the entry point to schedule() from kernel preemption 3338 * this is the entry point to schedule() from kernel preemption
3339 * off of irq context. 3339 * off of irq context.
3340 * Note that this is called and returns with irqs disabled. This will 3340 * Note that this is called and returns with irqs disabled. This will
3341 * protect us against recursive calling from irq. 3341 * protect us against recursive calling from irq.
3342 */ 3342 */
3343 asmlinkage void __sched preempt_schedule_irq(void) 3343 asmlinkage void __sched preempt_schedule_irq(void)
3344 { 3344 {
3345 struct thread_info *ti = current_thread_info(); 3345 struct thread_info *ti = current_thread_info();
3346 3346
3347 /* Catch callers which need to be fixed */ 3347 /* Catch callers which need to be fixed */
3348 BUG_ON(ti->preempt_count || !irqs_disabled()); 3348 BUG_ON(ti->preempt_count || !irqs_disabled());
3349 3349
3350 do { 3350 do {
3351 add_preempt_count(PREEMPT_ACTIVE); 3351 add_preempt_count(PREEMPT_ACTIVE);
3352 local_irq_enable(); 3352 local_irq_enable();
3353 __schedule(); 3353 __schedule();
3354 local_irq_disable(); 3354 local_irq_disable();
3355 sub_preempt_count(PREEMPT_ACTIVE); 3355 sub_preempt_count(PREEMPT_ACTIVE);
3356 3356
3357 /* 3357 /*
3358 * Check again in case we missed a preemption opportunity 3358 * Check again in case we missed a preemption opportunity
3359 * between schedule and now. 3359 * between schedule and now.
3360 */ 3360 */
3361 barrier(); 3361 barrier();
3362 } while (need_resched()); 3362 } while (need_resched());
3363 } 3363 }
3364 3364
3365 #endif /* CONFIG_PREEMPT */ 3365 #endif /* CONFIG_PREEMPT */
3366 3366
3367 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 3367 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
3368 void *key) 3368 void *key)
3369 { 3369 {
3370 return try_to_wake_up(curr->private, mode, wake_flags); 3370 return try_to_wake_up(curr->private, mode, wake_flags);
3371 } 3371 }
3372 EXPORT_SYMBOL(default_wake_function); 3372 EXPORT_SYMBOL(default_wake_function);
3373 3373
3374 /* 3374 /*
3375 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 3375 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3376 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 3376 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3377 * number) then we wake all the non-exclusive tasks and one exclusive task. 3377 * number) then we wake all the non-exclusive tasks and one exclusive task.
3378 * 3378 *
3379 * There are circumstances in which we can try to wake a task which has already 3379 * There are circumstances in which we can try to wake a task which has already
3380 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 3380 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3381 * zero in this (rare) case, and we handle it by continuing to scan the queue. 3381 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3382 */ 3382 */
3383 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 3383 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3384 int nr_exclusive, int wake_flags, void *key) 3384 int nr_exclusive, int wake_flags, void *key)
3385 { 3385 {
3386 wait_queue_t *curr, *next; 3386 wait_queue_t *curr, *next;
3387 3387
3388 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 3388 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
3389 unsigned flags = curr->flags; 3389 unsigned flags = curr->flags;
3390 3390
3391 if (curr->func(curr, mode, wake_flags, key) && 3391 if (curr->func(curr, mode, wake_flags, key) &&
3392 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 3392 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3393 break; 3393 break;
3394 } 3394 }
3395 } 3395 }
3396 3396
3397 /** 3397 /**
3398 * __wake_up - wake up threads blocked on a waitqueue. 3398 * __wake_up - wake up threads blocked on a waitqueue.
3399 * @q: the waitqueue 3399 * @q: the waitqueue
3400 * @mode: which threads 3400 * @mode: which threads
3401 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3401 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3402 * @key: is directly passed to the wakeup function 3402 * @key: is directly passed to the wakeup function
3403 * 3403 *
3404 * It may be assumed that this function implies a write memory barrier before 3404 * It may be assumed that this function implies a write memory barrier before
3405 * changing the task state if and only if any tasks are woken up. 3405 * changing the task state if and only if any tasks are woken up.
3406 */ 3406 */
3407 void __wake_up(wait_queue_head_t *q, unsigned int mode, 3407 void __wake_up(wait_queue_head_t *q, unsigned int mode,
3408 int nr_exclusive, void *key) 3408 int nr_exclusive, void *key)
3409 { 3409 {
3410 unsigned long flags; 3410 unsigned long flags;
3411 3411
3412 spin_lock_irqsave(&q->lock, flags); 3412 spin_lock_irqsave(&q->lock, flags);
3413 __wake_up_common(q, mode, nr_exclusive, 0, key); 3413 __wake_up_common(q, mode, nr_exclusive, 0, key);
3414 spin_unlock_irqrestore(&q->lock, flags); 3414 spin_unlock_irqrestore(&q->lock, flags);
3415 } 3415 }
3416 EXPORT_SYMBOL(__wake_up); 3416 EXPORT_SYMBOL(__wake_up);
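A typical user pairs one of the wake_up*() wrappers with wait_event*() on the same queue; wake_up(q) expands to __wake_up(q, TASK_NORMAL, 1, NULL). A hedged usage sketch (the example_* names are hypothetical):

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wq);
static int example_ready;

static int example_consumer(void)
{
	/* sleeps in TASK_INTERRUPTIBLE until example_ready becomes true */
	return wait_event_interruptible(example_wq, example_ready);
}

static void example_producer(void)
{
	example_ready = 1;
	wake_up(&example_wq);	/* __wake_up(&example_wq, TASK_NORMAL, 1, NULL) */
}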
3417 3417
3418 /* 3418 /*
3419 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 3419 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3420 */ 3420 */
3421 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 3421 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3422 { 3422 {
3423 __wake_up_common(q, mode, 1, 0, NULL); 3423 __wake_up_common(q, mode, 1, 0, NULL);
3424 } 3424 }
3425 EXPORT_SYMBOL_GPL(__wake_up_locked); 3425 EXPORT_SYMBOL_GPL(__wake_up_locked);
3426 3426
3427 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 3427 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
3428 { 3428 {
3429 __wake_up_common(q, mode, 1, 0, key); 3429 __wake_up_common(q, mode, 1, 0, key);
3430 } 3430 }
3431 EXPORT_SYMBOL_GPL(__wake_up_locked_key); 3431 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
3432 3432
3433 /** 3433 /**
3434 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 3434 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
3435 * @q: the waitqueue 3435 * @q: the waitqueue
3436 * @mode: which threads 3436 * @mode: which threads
3437 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3437 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3438 * @key: opaque value to be passed to wakeup targets 3438 * @key: opaque value to be passed to wakeup targets
3439 * 3439 *
3440 * The sync wakeup differs in that the waker knows that it will schedule 3440 * The sync wakeup differs in that the waker knows that it will schedule
3441 * away soon, so while the target thread will be woken up, it will not 3441 * away soon, so while the target thread will be woken up, it will not
3442 * be migrated to another CPU - ie. the two threads are 'synchronized' 3442 * be migrated to another CPU - ie. the two threads are 'synchronized'
3443 * with each other. This can prevent needless bouncing between CPUs. 3443 * with each other. This can prevent needless bouncing between CPUs.
3444 * 3444 *
3445 * On UP it can prevent extra preemption. 3445 * On UP it can prevent extra preemption.
3446 * 3446 *
3447 * It may be assumed that this function implies a write memory barrier before 3447 * It may be assumed that this function implies a write memory barrier before
3448 * changing the task state if and only if any tasks are woken up. 3448 * changing the task state if and only if any tasks are woken up.
3449 */ 3449 */
3450 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, 3450 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
3451 int nr_exclusive, void *key) 3451 int nr_exclusive, void *key)
3452 { 3452 {
3453 unsigned long flags; 3453 unsigned long flags;
3454 int wake_flags = WF_SYNC; 3454 int wake_flags = WF_SYNC;
3455 3455
3456 if (unlikely(!q)) 3456 if (unlikely(!q))
3457 return; 3457 return;
3458 3458
3459 if (unlikely(!nr_exclusive)) 3459 if (unlikely(!nr_exclusive))
3460 wake_flags = 0; 3460 wake_flags = 0;
3461 3461
3462 spin_lock_irqsave(&q->lock, flags); 3462 spin_lock_irqsave(&q->lock, flags);
3463 __wake_up_common(q, mode, nr_exclusive, wake_flags, key); 3463 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
3464 spin_unlock_irqrestore(&q->lock, flags); 3464 spin_unlock_irqrestore(&q->lock, flags);
3465 } 3465 }
3466 EXPORT_SYMBOL_GPL(__wake_up_sync_key); 3466 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
3467 3467
3468 /* 3468 /*
3469 * __wake_up_sync - see __wake_up_sync_key() 3469 * __wake_up_sync - see __wake_up_sync_key()
3470 */ 3470 */
3471 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 3471 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3472 { 3472 {
3473 __wake_up_sync_key(q, mode, nr_exclusive, NULL); 3473 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
3474 } 3474 }
3475 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 3475 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3476 3476
3477 /** 3477 /**
3478 * complete: - signals a single thread waiting on this completion 3478 * complete: - signals a single thread waiting on this completion
3479 * @x: holds the state of this particular completion 3479 * @x: holds the state of this particular completion
3480 * 3480 *
3481 * This will wake up a single thread waiting on this completion. Threads will be 3481 * This will wake up a single thread waiting on this completion. Threads will be
3482 * awakened in the same order in which they were queued. 3482 * awakened in the same order in which they were queued.
3483 * 3483 *
3484 * See also complete_all(), wait_for_completion() and related routines. 3484 * See also complete_all(), wait_for_completion() and related routines.
3485 * 3485 *
3486 * It may be assumed that this function implies a write memory barrier before 3486 * It may be assumed that this function implies a write memory barrier before
3487 * changing the task state if and only if any tasks are woken up. 3487 * changing the task state if and only if any tasks are woken up.
3488 */ 3488 */
3489 void complete(struct completion *x) 3489 void complete(struct completion *x)
3490 { 3490 {
3491 unsigned long flags; 3491 unsigned long flags;
3492 3492
3493 spin_lock_irqsave(&x->wait.lock, flags); 3493 spin_lock_irqsave(&x->wait.lock, flags);
3494 x->done++; 3494 x->done++;
3495 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); 3495 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
3496 spin_unlock_irqrestore(&x->wait.lock, flags); 3496 spin_unlock_irqrestore(&x->wait.lock, flags);
3497 } 3497 }
3498 EXPORT_SYMBOL(complete); 3498 EXPORT_SYMBOL(complete);
3499 3499
3500 /** 3500 /**
3501 * complete_all: - signals all threads waiting on this completion 3501 * complete_all: - signals all threads waiting on this completion
3502 * @x: holds the state of this particular completion 3502 * @x: holds the state of this particular completion
3503 * 3503 *
3504 * This will wake up all threads waiting on this particular completion event. 3504 * This will wake up all threads waiting on this particular completion event.
3505 * 3505 *
3506 * It may be assumed that this function implies a write memory barrier before 3506 * It may be assumed that this function implies a write memory barrier before
3507 * changing the task state if and only if any tasks are woken up. 3507 * changing the task state if and only if any tasks are woken up.
3508 */ 3508 */
3509 void complete_all(struct completion *x) 3509 void complete_all(struct completion *x)
3510 { 3510 {
3511 unsigned long flags; 3511 unsigned long flags;
3512 3512
3513 spin_lock_irqsave(&x->wait.lock, flags); 3513 spin_lock_irqsave(&x->wait.lock, flags);
3514 x->done += UINT_MAX/2; 3514 x->done += UINT_MAX/2;
3515 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); 3515 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
3516 spin_unlock_irqrestore(&x->wait.lock, flags); 3516 spin_unlock_irqrestore(&x->wait.lock, flags);
3517 } 3517 }
3518 EXPORT_SYMBOL(complete_all); 3518 EXPORT_SYMBOL(complete_all);
3519 3519
3520 static inline long __sched 3520 static inline long __sched
3521 do_wait_for_common(struct completion *x, long timeout, int state) 3521 do_wait_for_common(struct completion *x, long timeout, int state)
3522 { 3522 {
3523 if (!x->done) { 3523 if (!x->done) {
3524 DECLARE_WAITQUEUE(wait, current); 3524 DECLARE_WAITQUEUE(wait, current);
3525 3525
3526 __add_wait_queue_tail_exclusive(&x->wait, &wait); 3526 __add_wait_queue_tail_exclusive(&x->wait, &wait);
3527 do { 3527 do {
3528 if (signal_pending_state(state, current)) { 3528 if (signal_pending_state(state, current)) {
3529 timeout = -ERESTARTSYS; 3529 timeout = -ERESTARTSYS;
3530 break; 3530 break;
3531 } 3531 }
3532 __set_current_state(state); 3532 __set_current_state(state);
3533 spin_unlock_irq(&x->wait.lock); 3533 spin_unlock_irq(&x->wait.lock);
3534 timeout = schedule_timeout(timeout); 3534 timeout = schedule_timeout(timeout);
3535 spin_lock_irq(&x->wait.lock); 3535 spin_lock_irq(&x->wait.lock);
3536 } while (!x->done && timeout); 3536 } while (!x->done && timeout);
3537 __remove_wait_queue(&x->wait, &wait); 3537 __remove_wait_queue(&x->wait, &wait);
3538 if (!x->done) 3538 if (!x->done)
3539 return timeout; 3539 return timeout;
3540 } 3540 }
3541 x->done--; 3541 x->done--;
3542 return timeout ?: 1; 3542 return timeout ?: 1;
3543 } 3543 }
3544 3544
3545 static long __sched 3545 static long __sched
3546 wait_for_common(struct completion *x, long timeout, int state) 3546 wait_for_common(struct completion *x, long timeout, int state)
3547 { 3547 {
3548 might_sleep(); 3548 might_sleep();
3549 3549
3550 spin_lock_irq(&x->wait.lock); 3550 spin_lock_irq(&x->wait.lock);
3551 timeout = do_wait_for_common(x, timeout, state); 3551 timeout = do_wait_for_common(x, timeout, state);
3552 spin_unlock_irq(&x->wait.lock); 3552 spin_unlock_irq(&x->wait.lock);
3553 return timeout; 3553 return timeout;
3554 } 3554 }
3555 3555
3556 /** 3556 /**
3557 * wait_for_completion: - waits for completion of a task 3557 * wait_for_completion: - waits for completion of a task
3558 * @x: holds the state of this particular completion 3558 * @x: holds the state of this particular completion
3559 * 3559 *
3560 * This waits to be signaled for completion of a specific task. It is NOT 3560 * This waits to be signaled for completion of a specific task. It is NOT
3561 * interruptible and there is no timeout. 3561 * interruptible and there is no timeout.
3562 * 3562 *
3563 * See also similar routines (e.g. wait_for_completion_timeout()) with timeout 3563 * See also similar routines (e.g. wait_for_completion_timeout()) with timeout
3564 * and interrupt capability. Also see complete(). 3564 * and interrupt capability. Also see complete().
3565 */ 3565 */
3566 void __sched wait_for_completion(struct completion *x) 3566 void __sched wait_for_completion(struct completion *x)
3567 { 3567 {
3568 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 3568 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3569 } 3569 }
3570 EXPORT_SYMBOL(wait_for_completion); 3570 EXPORT_SYMBOL(wait_for_completion);
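The usual pairing is one context calling complete() (or complete_all()) while another blocks in one of the wait_for_completion*() variants. A hedged usage sketch with hypothetical names:

#include <linux/completion.h>
#include <linux/kthread.h>

static DECLARE_COMPLETION(example_done);

static int example_worker(void *unused)
{
	/* ... perform the work ... */
	complete(&example_done);		/* wakes exactly one waiter */
	return 0;
}

static void example_start_and_wait(void)
{
	kthread_run(example_worker, NULL, "example_worker");
	wait_for_completion(&example_done);	/* uninterruptible, no timeout */
}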
3571 3571
3572 /** 3572 /**
3573 * wait_for_completion_timeout: - waits for completion of a task (w/timeout) 3573 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
3574 * @x: holds the state of this particular completion 3574 * @x: holds the state of this particular completion
3575 * @timeout: timeout value in jiffies 3575 * @timeout: timeout value in jiffies
3576 * 3576 *
3577 * This waits for either a completion of a specific task to be signaled or for a 3577 * This waits for either a completion of a specific task to be signaled or for a
3578 * specified timeout to expire. The timeout is in jiffies. It is not 3578 * specified timeout to expire. The timeout is in jiffies. It is not
3579 * interruptible. 3579 * interruptible.
3580 * 3580 *
3581 * The return value is 0 if timed out, and positive (at least 1, or number of 3581 * The return value is 0 if timed out, and positive (at least 1, or number of
3582 * jiffies left till timeout) if completed. 3582 * jiffies left till timeout) if completed.
3583 */ 3583 */
3584 unsigned long __sched 3584 unsigned long __sched
3585 wait_for_completion_timeout(struct completion *x, unsigned long timeout) 3585 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3586 { 3586 {
3587 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); 3587 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
3588 } 3588 }
3589 EXPORT_SYMBOL(wait_for_completion_timeout); 3589 EXPORT_SYMBOL(wait_for_completion_timeout);
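Because the return value is 0 on timeout and at least 1 on completion, callers test for zero rather than for a negative error code. A hedged usage sketch (needs linux/jiffies.h and linux/errno.h):

static int example_wait_half_second(struct completion *x)
{
	unsigned long left;

	left = wait_for_completion_timeout(x, msecs_to_jiffies(500));
	if (!left)
		return -ETIMEDOUT;	/* timed out, completion not signaled */
	return 0;			/* completed with 'left' jiffies to spare */
}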
3590 3590
3591 /** 3591 /**
3592 * wait_for_completion_interruptible: - waits for completion of a task (w/intr) 3592 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
3593 * @x: holds the state of this particular completion 3593 * @x: holds the state of this particular completion
3594 * 3594 *
3595 * This waits for completion of a specific task to be signaled. It is 3595 * This waits for completion of a specific task to be signaled. It is
3596 * interruptible. 3596 * interruptible.
3597 * 3597 *
3598 * The return value is -ERESTARTSYS if interrupted, 0 if completed. 3598 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
3599 */ 3599 */
3600 int __sched wait_for_completion_interruptible(struct completion *x) 3600 int __sched wait_for_completion_interruptible(struct completion *x)
3601 { 3601 {
3602 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 3602 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
3603 if (t == -ERESTARTSYS) 3603 if (t == -ERESTARTSYS)
3604 return t; 3604 return t;
3605 return 0; 3605 return 0;
3606 } 3606 }
3607 EXPORT_SYMBOL(wait_for_completion_interruptible); 3607 EXPORT_SYMBOL(wait_for_completion_interruptible);
3608 3608
3609 /** 3609 /**
3610 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) 3610 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
3611 * @x: holds the state of this particular completion 3611 * @x: holds the state of this particular completion
3612 * @timeout: timeout value in jiffies 3612 * @timeout: timeout value in jiffies
3613 * 3613 *
3614 * This waits for either a completion of a specific task to be signaled or for a 3614 * This waits for either a completion of a specific task to be signaled or for a
3615 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 3615 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
3616 * 3616 *
3617 * The return value is -ERESTARTSYS if interrupted, 0 if timed out, 3617 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
3618 * positive (at least 1, or number of jiffies left till timeout) if completed. 3618 * positive (at least 1, or number of jiffies left till timeout) if completed.
3619 */ 3619 */
3620 long __sched 3620 long __sched
3621 wait_for_completion_interruptible_timeout(struct completion *x, 3621 wait_for_completion_interruptible_timeout(struct completion *x,
3622 unsigned long timeout) 3622 unsigned long timeout)
3623 { 3623 {
3624 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); 3624 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
3625 } 3625 }
3626 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 3626 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3627 3627
3628 /** 3628 /**
3629 * wait_for_completion_killable: - waits for completion of a task (killable) 3629 * wait_for_completion_killable: - waits for completion of a task (killable)
3630 * @x: holds the state of this particular completion 3630 * @x: holds the state of this particular completion
3631 * 3631 *
3632 * This waits to be signaled for completion of a specific task. It can be 3632 * This waits to be signaled for completion of a specific task. It can be
3633 * interrupted by a kill signal. 3633 * interrupted by a kill signal.
3634 * 3634 *
3635 * The return value is -ERESTARTSYS if interrupted, 0 if completed. 3635 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
3636 */ 3636 */
3637 int __sched wait_for_completion_killable(struct completion *x) 3637 int __sched wait_for_completion_killable(struct completion *x)
3638 { 3638 {
3639 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 3639 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
3640 if (t == -ERESTARTSYS) 3640 if (t == -ERESTARTSYS)
3641 return t; 3641 return t;
3642 return 0; 3642 return 0;
3643 } 3643 }
3644 EXPORT_SYMBOL(wait_for_completion_killable); 3644 EXPORT_SYMBOL(wait_for_completion_killable);
3645 3645
3646 /** 3646 /**
3647 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) 3647 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
3648 * @x: holds the state of this particular completion 3648 * @x: holds the state of this particular completion
3649 * @timeout: timeout value in jiffies 3649 * @timeout: timeout value in jiffies
3650 * 3650 *
3651 * This waits for either a completion of a specific task to be 3651 * This waits for either a completion of a specific task to be
3652 * signaled or for a specified timeout to expire. It can be 3652 * signaled or for a specified timeout to expire. It can be
3653 * interrupted by a kill signal. The timeout is in jiffies. 3653 * interrupted by a kill signal. The timeout is in jiffies.
3654 * 3654 *
3655 * The return value is -ERESTARTSYS if interrupted, 0 if timed out, 3655 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
3656 * positive (at least 1, or number of jiffies left till timeout) if completed. 3656 * positive (at least 1, or number of jiffies left till timeout) if completed.
3657 */ 3657 */
3658 long __sched 3658 long __sched
3659 wait_for_completion_killable_timeout(struct completion *x, 3659 wait_for_completion_killable_timeout(struct completion *x,
3660 unsigned long timeout) 3660 unsigned long timeout)
3661 { 3661 {
3662 return wait_for_common(x, timeout, TASK_KILLABLE); 3662 return wait_for_common(x, timeout, TASK_KILLABLE);
3663 } 3663 }
3664 EXPORT_SYMBOL(wait_for_completion_killable_timeout); 3664 EXPORT_SYMBOL(wait_for_completion_killable_timeout);
3665 3665
3666 /** 3666 /**
3667 * try_wait_for_completion - try to decrement a completion without blocking 3667 * try_wait_for_completion - try to decrement a completion without blocking
3668 * @x: completion structure 3668 * @x: completion structure
3669 * 3669 *
3670 * Returns: 0 if a decrement cannot be done without blocking 3670 * Returns: 0 if a decrement cannot be done without blocking
3671 * 1 if a decrement succeeded. 3671 * 1 if a decrement succeeded.
3672 * 3672 *
3673 * If a completion is being used as a counting completion, 3673 * If a completion is being used as a counting completion,
3674 * attempt to decrement the counter without blocking. This 3674 * attempt to decrement the counter without blocking. This
3675 * enables us to avoid waiting if the resource the completion 3675 * enables us to avoid waiting if the resource the completion
3676 * is protecting is not available. 3676 * is protecting is not available.
3677 */ 3677 */
3678 bool try_wait_for_completion(struct completion *x) 3678 bool try_wait_for_completion(struct completion *x)
3679 { 3679 {
3680 unsigned long flags; 3680 unsigned long flags;
3681 int ret = 1; 3681 int ret = 1;
3682 3682
3683 spin_lock_irqsave(&x->wait.lock, flags); 3683 spin_lock_irqsave(&x->wait.lock, flags);
3684 if (!x->done) 3684 if (!x->done)
3685 ret = 0; 3685 ret = 0;
3686 else 3686 else
3687 x->done--; 3687 x->done--;
3688 spin_unlock_irqrestore(&x->wait.lock, flags); 3688 spin_unlock_irqrestore(&x->wait.lock, flags);
3689 return ret; 3689 return ret;
3690 } 3690 }
3691 EXPORT_SYMBOL(try_wait_for_completion); 3691 EXPORT_SYMBOL(try_wait_for_completion);
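When a completion is used as a counting resource, the non-blocking variant lets a caller consume an already-available unit and only fall back to sleeping when nothing is pending. A hedged usage sketch:

static void example_take_unit(struct completion *x)
{
	if (!try_wait_for_completion(x))	/* nothing available yet */
		wait_for_completion(x);		/* block until complete() */
	/* one unit of the resource is now consumed */
}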
3692 3692
3693 /** 3693 /**
3694 * completion_done - Test to see if a completion has any waiters 3694 * completion_done - Test to see if a completion has any waiters
3695 * @x: completion structure 3695 * @x: completion structure
3696 * 3696 *
3697 * Returns: 0 if there are waiters (wait_for_completion() in progress) 3697 * Returns: 0 if there are waiters (wait_for_completion() in progress)
3698 * 1 if there are no waiters. 3698 * 1 if there are no waiters.
3699 * 3699 *
3700 */ 3700 */
3701 bool completion_done(struct completion *x) 3701 bool completion_done(struct completion *x)
3702 { 3702 {
3703 unsigned long flags; 3703 unsigned long flags;
3704 int ret = 1; 3704 int ret = 1;
3705 3705
3706 spin_lock_irqsave(&x->wait.lock, flags); 3706 spin_lock_irqsave(&x->wait.lock, flags);
3707 if (!x->done) 3707 if (!x->done)
3708 ret = 0; 3708 ret = 0;
3709 spin_unlock_irqrestore(&x->wait.lock, flags); 3709 spin_unlock_irqrestore(&x->wait.lock, flags);
3710 return ret; 3710 return ret;
3711 } 3711 }
3712 EXPORT_SYMBOL(completion_done); 3712 EXPORT_SYMBOL(completion_done);
3713 3713
3714 static long __sched 3714 static long __sched
3715 sleep_on_common(wait_queue_head_t *q, int state, long timeout) 3715 sleep_on_common(wait_queue_head_t *q, int state, long timeout)
3716 { 3716 {
3717 unsigned long flags; 3717 unsigned long flags;
3718 wait_queue_t wait; 3718 wait_queue_t wait;
3719 3719
3720 init_waitqueue_entry(&wait, current); 3720 init_waitqueue_entry(&wait, current);
3721 3721
3722 __set_current_state(state); 3722 __set_current_state(state);
3723 3723
3724 spin_lock_irqsave(&q->lock, flags); 3724 spin_lock_irqsave(&q->lock, flags);
3725 __add_wait_queue(q, &wait); 3725 __add_wait_queue(q, &wait);
3726 spin_unlock(&q->lock); 3726 spin_unlock(&q->lock);
3727 timeout = schedule_timeout(timeout); 3727 timeout = schedule_timeout(timeout);
3728 spin_lock_irq(&q->lock); 3728 spin_lock_irq(&q->lock);
3729 __remove_wait_queue(q, &wait); 3729 __remove_wait_queue(q, &wait);
3730 spin_unlock_irqrestore(&q->lock, flags); 3730 spin_unlock_irqrestore(&q->lock, flags);
3731 3731
3732 return timeout; 3732 return timeout;
3733 } 3733 }
3734 3734
3735 void __sched interruptible_sleep_on(wait_queue_head_t *q) 3735 void __sched interruptible_sleep_on(wait_queue_head_t *q)
3736 { 3736 {
3737 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 3737 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3738 } 3738 }
3739 EXPORT_SYMBOL(interruptible_sleep_on); 3739 EXPORT_SYMBOL(interruptible_sleep_on);
3740 3740
3741 long __sched 3741 long __sched
3742 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3742 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3743 { 3743 {
3744 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); 3744 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
3745 } 3745 }
3746 EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3746 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3747 3747
3748 void __sched sleep_on(wait_queue_head_t *q) 3748 void __sched sleep_on(wait_queue_head_t *q)
3749 { 3749 {
3750 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 3750 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
3751 } 3751 }
3752 EXPORT_SYMBOL(sleep_on); 3752 EXPORT_SYMBOL(sleep_on);
3753 3753
3754 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3754 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3755 { 3755 {
3756 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); 3756 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
3757 } 3757 }
3758 EXPORT_SYMBOL(sleep_on_timeout); 3758 EXPORT_SYMBOL(sleep_on_timeout);
3759 3759
3760 #ifdef CONFIG_RT_MUTEXES 3760 #ifdef CONFIG_RT_MUTEXES
3761 3761
3762 /* 3762 /*
3763 * rt_mutex_setprio - set the current priority of a task 3763 * rt_mutex_setprio - set the current priority of a task
3764 * @p: task 3764 * @p: task
3765 * @prio: prio value (kernel-internal form) 3765 * @prio: prio value (kernel-internal form)
3766 * 3766 *
3767 * This function changes the 'effective' priority of a task. It does 3767 * This function changes the 'effective' priority of a task. It does
3768 * not touch ->normal_prio like __setscheduler(). 3768 * not touch ->normal_prio like __setscheduler().
3769 * 3769 *
3770 * Used by the rt_mutex code to implement priority inheritance logic. 3770 * Used by the rt_mutex code to implement priority inheritance logic.
3771 */ 3771 */
3772 void rt_mutex_setprio(struct task_struct *p, int prio) 3772 void rt_mutex_setprio(struct task_struct *p, int prio)
3773 { 3773 {
3774 int oldprio, on_rq, running; 3774 int oldprio, on_rq, running;
3775 struct rq *rq; 3775 struct rq *rq;
3776 const struct sched_class *prev_class; 3776 const struct sched_class *prev_class;
3777 3777
3778 BUG_ON(prio < 0 || prio > MAX_PRIO); 3778 BUG_ON(prio < 0 || prio > MAX_PRIO);
3779 3779
3780 rq = __task_rq_lock(p); 3780 rq = __task_rq_lock(p);
3781 3781
3782 trace_sched_pi_setprio(p, prio); 3782 trace_sched_pi_setprio(p, prio);
3783 oldprio = p->prio; 3783 oldprio = p->prio;
3784 prev_class = p->sched_class; 3784 prev_class = p->sched_class;
3785 on_rq = p->on_rq; 3785 on_rq = p->on_rq;
3786 running = task_current(rq, p); 3786 running = task_current(rq, p);
3787 if (on_rq) 3787 if (on_rq)
3788 dequeue_task(rq, p, 0); 3788 dequeue_task(rq, p, 0);
3789 if (running) 3789 if (running)
3790 p->sched_class->put_prev_task(rq, p); 3790 p->sched_class->put_prev_task(rq, p);
3791 3791
3792 if (rt_prio(prio)) 3792 if (rt_prio(prio))
3793 p->sched_class = &rt_sched_class; 3793 p->sched_class = &rt_sched_class;
3794 else 3794 else
3795 p->sched_class = &fair_sched_class; 3795 p->sched_class = &fair_sched_class;
3796 3796
3797 p->prio = prio; 3797 p->prio = prio;
3798 3798
3799 if (running) 3799 if (running)
3800 p->sched_class->set_curr_task(rq); 3800 p->sched_class->set_curr_task(rq);
3801 if (on_rq) 3801 if (on_rq)
3802 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 3802 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
3803 3803
3804 check_class_changed(rq, p, prev_class, oldprio); 3804 check_class_changed(rq, p, prev_class, oldprio);
3805 __task_rq_unlock(rq); 3805 __task_rq_unlock(rq);
3806 } 3806 }
3807 3807
3808 #endif 3808 #endif
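rt_mutex_setprio() is the mechanism behind priority inheritance: kernel/rtmutex.c boosts a lock owner to the priority of its highest-priority waiter and drops it back towards normal_prio on release, and a prio below MAX_RT_PRIO selects the RT class while anything else falls back to CFS. A purely conceptual, hedged sketch (the helper names are hypothetical):

static void example_pi_boost(struct task_struct *owner, int waiter_prio)
{
	if (waiter_prio < owner->prio)		/* lower value == higher prio */
		rt_mutex_setprio(owner, waiter_prio);
}

static void example_pi_restore(struct task_struct *owner)
{
	rt_mutex_setprio(owner, owner->normal_prio);
}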
3809 3809
3810 void set_user_nice(struct task_struct *p, long nice) 3810 void set_user_nice(struct task_struct *p, long nice)
3811 { 3811 {
3812 int old_prio, delta, on_rq; 3812 int old_prio, delta, on_rq;
3813 unsigned long flags; 3813 unsigned long flags;
3814 struct rq *rq; 3814 struct rq *rq;
3815 3815
3816 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3816 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3817 return; 3817 return;
3818 /* 3818 /*
3819 * We have to be careful, if called from sys_setpriority(), 3819 * We have to be careful, if called from sys_setpriority(),
3820 * the task might be in the middle of scheduling on another CPU. 3820 * the task might be in the middle of scheduling on another CPU.
3821 */ 3821 */
3822 rq = task_rq_lock(p, &flags); 3822 rq = task_rq_lock(p, &flags);
3823 /* 3823 /*
3824 * The RT priorities are set via sched_setscheduler(), but we still 3824 * The RT priorities are set via sched_setscheduler(), but we still
3825 * allow the 'normal' nice value to be set - but as expected 3825 * allow the 'normal' nice value to be set - but as expected
3826 * it won't have any effect on scheduling until the task is 3826 * it won't have any effect on scheduling until the task is
3827 * SCHED_FIFO/SCHED_RR: 3827 * SCHED_FIFO/SCHED_RR:
3828 */ 3828 */
3829 if (task_has_rt_policy(p)) { 3829 if (task_has_rt_policy(p)) {
3830 p->static_prio = NICE_TO_PRIO(nice); 3830 p->static_prio = NICE_TO_PRIO(nice);
3831 goto out_unlock; 3831 goto out_unlock;
3832 } 3832 }
3833 on_rq = p->on_rq; 3833 on_rq = p->on_rq;
3834 if (on_rq) 3834 if (on_rq)
3835 dequeue_task(rq, p, 0); 3835 dequeue_task(rq, p, 0);
3836 3836
3837 p->static_prio = NICE_TO_PRIO(nice); 3837 p->static_prio = NICE_TO_PRIO(nice);
3838 set_load_weight(p); 3838 set_load_weight(p);
3839 old_prio = p->prio; 3839 old_prio = p->prio;
3840 p->prio = effective_prio(p); 3840 p->prio = effective_prio(p);
3841 delta = p->prio - old_prio; 3841 delta = p->prio - old_prio;
3842 3842
3843 if (on_rq) { 3843 if (on_rq) {
3844 enqueue_task(rq, p, 0); 3844 enqueue_task(rq, p, 0);
3845 /* 3845 /*
3846 * If the task increased its priority or is running and 3846 * If the task increased its priority or is running and
3847 * lowered its priority, then reschedule its CPU: 3847 * lowered its priority, then reschedule its CPU:
3848 */ 3848 */
3849 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3849 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3850 resched_task(rq->curr); 3850 resched_task(rq->curr);
3851 } 3851 }
3852 out_unlock: 3852 out_unlock:
3853 task_rq_unlock(rq, p, &flags); 3853 task_rq_unlock(rq, p, &flags);
3854 } 3854 }
3855 EXPORT_SYMBOL(set_user_nice); 3855 EXPORT_SYMBOL(set_user_nice);
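The nice range [-20, 19] maps onto static_prio [100, 139] via NICE_TO_PRIO(nice) == MAX_RT_PRIO + nice + 20, with MAX_RT_PRIO == 100. A hedged usage sketch:

static void example_renice_self(void)
{
	/* NICE_TO_PRIO(-5) == 100 + (-5) + 20 == 115 */
	set_user_nice(current, -5);
}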
3856 3856
3857 /* 3857 /*
3858 * can_nice - check if a task can reduce its nice value 3858 * can_nice - check if a task can reduce its nice value
3859 * @p: task 3859 * @p: task
3860 * @nice: nice value 3860 * @nice: nice value
3861 */ 3861 */
3862 int can_nice(const struct task_struct *p, const int nice) 3862 int can_nice(const struct task_struct *p, const int nice)
3863 { 3863 {
3864 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3864 /* convert nice value [19,-20] to rlimit style value [1,40] */
3865 int nice_rlim = 20 - nice; 3865 int nice_rlim = 20 - nice;
3866 3866
3867 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 3867 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3868 capable(CAP_SYS_NICE)); 3868 capable(CAP_SYS_NICE));
3869 } 3869 }
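The rlimit comparison flips the sign: requesting nice 19 needs RLIMIT_NICE >= 1 and requesting nice -20 needs RLIMIT_NICE >= 40, unless the caller has CAP_SYS_NICE. A hedged worked example:

/* RLIMIT_NICE == 25 allows nice values down to 20 - 25 == -5;
 * requesting nice -10 (nice_rlim == 30) then requires CAP_SYS_NICE. */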
3870 3870
3871 #ifdef __ARCH_WANT_SYS_NICE 3871 #ifdef __ARCH_WANT_SYS_NICE
3872 3872
3873 /* 3873 /*
3874 * sys_nice - change the priority of the current process. 3874 * sys_nice - change the priority of the current process.
3875 * @increment: priority increment 3875 * @increment: priority increment
3876 * 3876 *
3877 * sys_setpriority is a more generic, but much slower function that 3877 * sys_setpriority is a more generic, but much slower function that
3878 * does similar things. 3878 * does similar things.
3879 */ 3879 */
3880 SYSCALL_DEFINE1(nice, int, increment) 3880 SYSCALL_DEFINE1(nice, int, increment)
3881 { 3881 {
3882 long nice, retval; 3882 long nice, retval;
3883 3883
3884 /* 3884 /*
3885 * Setpriority might change our priority at the same moment. 3885 * Setpriority might change our priority at the same moment.
3886 * We don't have to worry. Conceptually one call occurs first 3886 * We don't have to worry. Conceptually one call occurs first
3887 * and we have a single winner. 3887 * and we have a single winner.
3888 */ 3888 */
3889 if (increment < -40) 3889 if (increment < -40)
3890 increment = -40; 3890 increment = -40;
3891 if (increment > 40) 3891 if (increment > 40)
3892 increment = 40; 3892 increment = 40;
3893 3893
3894 nice = TASK_NICE(current) + increment; 3894 nice = TASK_NICE(current) + increment;
3895 if (nice < -20) 3895 if (nice < -20)
3896 nice = -20; 3896 nice = -20;
3897 if (nice > 19) 3897 if (nice > 19)
3898 nice = 19; 3898 nice = 19;
3899 3899
3900 if (increment < 0 && !can_nice(current, nice)) 3900 if (increment < 0 && !can_nice(current, nice))
3901 return -EPERM; 3901 return -EPERM;
3902 3902
3903 retval = security_task_setnice(current, nice); 3903 retval = security_task_setnice(current, nice);
3904 if (retval) 3904 if (retval)
3905 return retval; 3905 return retval;
3906 3906
3907 set_user_nice(current, nice); 3907 set_user_nice(current, nice);
3908 return 0; 3908 return 0;
3909 } 3909 }
3910 3910
3911 #endif 3911 #endif
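On the user-space side, the libc nice() wrapper is what typically ends up in sys_nice() above on architectures that provide it. A minimal, illustrative program; errno is cleared first because -1 is a legitimate return value of nice(2):

    #include <stdio.h>
    #include <unistd.h>
    #include <errno.h>

    int main(void)
    {
            int n;

            errno = 0;
            n = nice(5);            /* a positive increment never needs CAP_SYS_NICE */
            if (n == -1 && errno)
                    perror("nice");
            else
                    printf("new nice value: %d\n", n);
            return 0;
    }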
3912 3912
3913 /** 3913 /**
3914 * task_prio - return the priority value of a given task. 3914 * task_prio - return the priority value of a given task.
3915 * @p: the task in question. 3915 * @p: the task in question.
3916 * 3916 *
3917 * This is the priority value as seen by users in /proc. 3917 * This is the priority value as seen by users in /proc.
3918 * RT tasks are offset by -200. Normal tasks are centered 3918 * RT tasks are offset by -200. Normal tasks are centered
3919 * around 0, value goes from -16 to +15. 3919 * around 0, value goes from -16 to +15.
3920 */ 3920 */
3921 int task_prio(const struct task_struct *p) 3921 int task_prio(const struct task_struct *p)
3922 { 3922 {
3923 return p->prio - MAX_RT_PRIO; 3923 return p->prio - MAX_RT_PRIO;
3924 } 3924 }
3925 3925
3926 /** 3926 /**
3927 * task_nice - return the nice value of a given task. 3927 * task_nice - return the nice value of a given task.
3928 * @p: the task in question. 3928 * @p: the task in question.
3929 */ 3929 */
3930 int task_nice(const struct task_struct *p) 3930 int task_nice(const struct task_struct *p)
3931 { 3931 {
3932 return TASK_NICE(p); 3932 return TASK_NICE(p);
3933 } 3933 }
3934 EXPORT_SYMBOL(task_nice); 3934 EXPORT_SYMBOL(task_nice);
3935 3935
3936 /** 3936 /**
3937 * idle_cpu - is a given cpu idle currently? 3937 * idle_cpu - is a given cpu idle currently?
3938 * @cpu: the processor in question. 3938 * @cpu: the processor in question.
3939 */ 3939 */
3940 int idle_cpu(int cpu) 3940 int idle_cpu(int cpu)
3941 { 3941 {
3942 struct rq *rq = cpu_rq(cpu); 3942 struct rq *rq = cpu_rq(cpu);
3943 3943
3944 if (rq->curr != rq->idle) 3944 if (rq->curr != rq->idle)
3945 return 0; 3945 return 0;
3946 3946
3947 if (rq->nr_running) 3947 if (rq->nr_running)
3948 return 0; 3948 return 0;
3949 3949
3950 #ifdef CONFIG_SMP 3950 #ifdef CONFIG_SMP
3951 if (!llist_empty(&rq->wake_list)) 3951 if (!llist_empty(&rq->wake_list))
3952 return 0; 3952 return 0;
3953 #endif 3953 #endif
3954 3954
3955 return 1; 3955 return 1;
3956 } 3956 }
3957 3957
3958 /** 3958 /**
3959 * idle_task - return the idle task for a given cpu. 3959 * idle_task - return the idle task for a given cpu.
3960 * @cpu: the processor in question. 3960 * @cpu: the processor in question.
3961 */ 3961 */
3962 struct task_struct *idle_task(int cpu) 3962 struct task_struct *idle_task(int cpu)
3963 { 3963 {
3964 return cpu_rq(cpu)->idle; 3964 return cpu_rq(cpu)->idle;
3965 } 3965 }
3966 3966
3967 /** 3967 /**
3968 * find_process_by_pid - find a process with a matching PID value. 3968 * find_process_by_pid - find a process with a matching PID value.
3969 * @pid: the pid in question. 3969 * @pid: the pid in question.
3970 */ 3970 */
3971 static struct task_struct *find_process_by_pid(pid_t pid) 3971 static struct task_struct *find_process_by_pid(pid_t pid)
3972 { 3972 {
3973 return pid ? find_task_by_vpid(pid) : current; 3973 return pid ? find_task_by_vpid(pid) : current;
3974 } 3974 }
3975 3975
3976 /* Actually do priority change: must hold rq lock. */ 3976 /* Actually do priority change: must hold rq lock. */
3977 static void 3977 static void
3978 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 3978 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
3979 { 3979 {
3980 p->policy = policy; 3980 p->policy = policy;
3981 p->rt_priority = prio; 3981 p->rt_priority = prio;
3982 p->normal_prio = normal_prio(p); 3982 p->normal_prio = normal_prio(p);
3983 /* we are holding p->pi_lock already */ 3983 /* we are holding p->pi_lock already */
3984 p->prio = rt_mutex_getprio(p); 3984 p->prio = rt_mutex_getprio(p);
3985 if (rt_prio(p->prio)) 3985 if (rt_prio(p->prio))
3986 p->sched_class = &rt_sched_class; 3986 p->sched_class = &rt_sched_class;
3987 else 3987 else
3988 p->sched_class = &fair_sched_class; 3988 p->sched_class = &fair_sched_class;
3989 set_load_weight(p); 3989 set_load_weight(p);
3990 } 3990 }
3991 3991
3992 /* 3992 /*
3993 * check the target process has a UID that matches the current process's 3993 * check the target process has a UID that matches the current process's
3994 */ 3994 */
3995 static bool check_same_owner(struct task_struct *p) 3995 static bool check_same_owner(struct task_struct *p)
3996 { 3996 {
3997 const struct cred *cred = current_cred(), *pcred; 3997 const struct cred *cred = current_cred(), *pcred;
3998 bool match; 3998 bool match;
3999 3999
4000 rcu_read_lock(); 4000 rcu_read_lock();
4001 pcred = __task_cred(p); 4001 pcred = __task_cred(p);
4002 if (cred->user->user_ns == pcred->user->user_ns) 4002 if (cred->user->user_ns == pcred->user->user_ns)
4003 match = (cred->euid == pcred->euid || 4003 match = (cred->euid == pcred->euid ||
4004 cred->euid == pcred->uid); 4004 cred->euid == pcred->uid);
4005 else 4005 else
4006 match = false; 4006 match = false;
4007 rcu_read_unlock(); 4007 rcu_read_unlock();
4008 return match; 4008 return match;
4009 } 4009 }
4010 4010
4011 static int __sched_setscheduler(struct task_struct *p, int policy, 4011 static int __sched_setscheduler(struct task_struct *p, int policy,
4012 const struct sched_param *param, bool user) 4012 const struct sched_param *param, bool user)
4013 { 4013 {
4014 int retval, oldprio, oldpolicy = -1, on_rq, running; 4014 int retval, oldprio, oldpolicy = -1, on_rq, running;
4015 unsigned long flags; 4015 unsigned long flags;
4016 const struct sched_class *prev_class; 4016 const struct sched_class *prev_class;
4017 struct rq *rq; 4017 struct rq *rq;
4018 int reset_on_fork; 4018 int reset_on_fork;
4019 4019
4020 /* may grab non-irq protected spin_locks */ 4020 /* may grab non-irq protected spin_locks */
4021 BUG_ON(in_interrupt()); 4021 BUG_ON(in_interrupt());
4022 recheck: 4022 recheck:
4023 /* double check policy once rq lock held */ 4023 /* double check policy once rq lock held */
4024 if (policy < 0) { 4024 if (policy < 0) {
4025 reset_on_fork = p->sched_reset_on_fork; 4025 reset_on_fork = p->sched_reset_on_fork;
4026 policy = oldpolicy = p->policy; 4026 policy = oldpolicy = p->policy;
4027 } else { 4027 } else {
4028 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); 4028 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
4029 policy &= ~SCHED_RESET_ON_FORK; 4029 policy &= ~SCHED_RESET_ON_FORK;
4030 4030
4031 if (policy != SCHED_FIFO && policy != SCHED_RR && 4031 if (policy != SCHED_FIFO && policy != SCHED_RR &&
4032 policy != SCHED_NORMAL && policy != SCHED_BATCH && 4032 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4033 policy != SCHED_IDLE) 4033 policy != SCHED_IDLE)
4034 return -EINVAL; 4034 return -EINVAL;
4035 } 4035 }
4036 4036
4037 /* 4037 /*
4038 * Valid priorities for SCHED_FIFO and SCHED_RR are 4038 * Valid priorities for SCHED_FIFO and SCHED_RR are
4039 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 4039 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4040 * SCHED_BATCH and SCHED_IDLE is 0. 4040 * SCHED_BATCH and SCHED_IDLE is 0.
4041 */ 4041 */
4042 if (param->sched_priority < 0 || 4042 if (param->sched_priority < 0 ||
4043 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 4043 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4044 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 4044 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4045 return -EINVAL; 4045 return -EINVAL;
4046 if (rt_policy(policy) != (param->sched_priority != 0)) 4046 if (rt_policy(policy) != (param->sched_priority != 0))
4047 return -EINVAL; 4047 return -EINVAL;
4048 4048
4049 /* 4049 /*
4050 * Allow unprivileged RT tasks to decrease priority: 4050 * Allow unprivileged RT tasks to decrease priority:
4051 */ 4051 */
4052 if (user && !capable(CAP_SYS_NICE)) { 4052 if (user && !capable(CAP_SYS_NICE)) {
4053 if (rt_policy(policy)) { 4053 if (rt_policy(policy)) {
4054 unsigned long rlim_rtprio = 4054 unsigned long rlim_rtprio =
4055 task_rlimit(p, RLIMIT_RTPRIO); 4055 task_rlimit(p, RLIMIT_RTPRIO);
4056 4056
4057 /* can't set/change the rt policy */ 4057 /* can't set/change the rt policy */
4058 if (policy != p->policy && !rlim_rtprio) 4058 if (policy != p->policy && !rlim_rtprio)
4059 return -EPERM; 4059 return -EPERM;
4060 4060
4061 /* can't increase priority */ 4061 /* can't increase priority */
4062 if (param->sched_priority > p->rt_priority && 4062 if (param->sched_priority > p->rt_priority &&
4063 param->sched_priority > rlim_rtprio) 4063 param->sched_priority > rlim_rtprio)
4064 return -EPERM; 4064 return -EPERM;
4065 } 4065 }
4066 4066
4067 /* 4067 /*
4068 * Treat SCHED_IDLE as nice 20. Only allow a switch to 4068 * Treat SCHED_IDLE as nice 20. Only allow a switch to
4069 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 4069 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
4070 */ 4070 */
4071 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 4071 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
4072 if (!can_nice(p, TASK_NICE(p))) 4072 if (!can_nice(p, TASK_NICE(p)))
4073 return -EPERM; 4073 return -EPERM;
4074 } 4074 }
4075 4075
4076 /* can't change other user's priorities */ 4076 /* can't change other user's priorities */
4077 if (!check_same_owner(p)) 4077 if (!check_same_owner(p))
4078 return -EPERM; 4078 return -EPERM;
4079 4079
4080 /* Normal users shall not reset the sched_reset_on_fork flag */ 4080 /* Normal users shall not reset the sched_reset_on_fork flag */
4081 if (p->sched_reset_on_fork && !reset_on_fork) 4081 if (p->sched_reset_on_fork && !reset_on_fork)
4082 return -EPERM; 4082 return -EPERM;
4083 } 4083 }
4084 4084
4085 if (user) { 4085 if (user) {
4086 retval = security_task_setscheduler(p); 4086 retval = security_task_setscheduler(p);
4087 if (retval) 4087 if (retval)
4088 return retval; 4088 return retval;
4089 } 4089 }
4090 4090
4091 /* 4091 /*
4092 * make sure no PI-waiters arrive (or leave) while we are 4092 * make sure no PI-waiters arrive (or leave) while we are
4093 * changing the priority of the task: 4093 * changing the priority of the task:
4094 * 4094 *
4095 * To be able to change p->policy safely, the appropriate 4095 * To be able to change p->policy safely, the appropriate
4096 * runqueue lock must be held. 4096 * runqueue lock must be held.
4097 */ 4097 */
4098 rq = task_rq_lock(p, &flags); 4098 rq = task_rq_lock(p, &flags);
4099 4099
4100 /* 4100 /*
4101 * Changing the policy of the stop threads is a very bad idea 4101 * Changing the policy of the stop threads is a very bad idea
4102 */ 4102 */
4103 if (p == rq->stop) { 4103 if (p == rq->stop) {
4104 task_rq_unlock(rq, p, &flags); 4104 task_rq_unlock(rq, p, &flags);
4105 return -EINVAL; 4105 return -EINVAL;
4106 } 4106 }
4107 4107
4108 /* 4108 /*
4109 * If not changing anything there's no need to proceed further: 4109 * If not changing anything there's no need to proceed further:
4110 */ 4110 */
4111 if (unlikely(policy == p->policy && (!rt_policy(policy) || 4111 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
4112 param->sched_priority == p->rt_priority))) { 4112 param->sched_priority == p->rt_priority))) {
4113 4113
4114 __task_rq_unlock(rq); 4114 __task_rq_unlock(rq);
4115 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4115 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4116 return 0; 4116 return 0;
4117 } 4117 }
4118 4118
4119 #ifdef CONFIG_RT_GROUP_SCHED 4119 #ifdef CONFIG_RT_GROUP_SCHED
4120 if (user) { 4120 if (user) {
4121 /* 4121 /*
4122 * Do not allow realtime tasks into groups that have no runtime 4122 * Do not allow realtime tasks into groups that have no runtime
4123 * assigned. 4123 * assigned.
4124 */ 4124 */
4125 if (rt_bandwidth_enabled() && rt_policy(policy) && 4125 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4126 task_group(p)->rt_bandwidth.rt_runtime == 0 && 4126 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4127 !task_group_is_autogroup(task_group(p))) { 4127 !task_group_is_autogroup(task_group(p))) {
4128 task_rq_unlock(rq, p, &flags); 4128 task_rq_unlock(rq, p, &flags);
4129 return -EPERM; 4129 return -EPERM;
4130 } 4130 }
4131 } 4131 }
4132 #endif 4132 #endif
4133 4133
4134 /* recheck policy now with rq lock held */ 4134 /* recheck policy now with rq lock held */
4135 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4135 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4136 policy = oldpolicy = -1; 4136 policy = oldpolicy = -1;
4137 task_rq_unlock(rq, p, &flags); 4137 task_rq_unlock(rq, p, &flags);
4138 goto recheck; 4138 goto recheck;
4139 } 4139 }
4140 on_rq = p->on_rq; 4140 on_rq = p->on_rq;
4141 running = task_current(rq, p); 4141 running = task_current(rq, p);
4142 if (on_rq) 4142 if (on_rq)
4143 dequeue_task(rq, p, 0); 4143 dequeue_task(rq, p, 0);
4144 if (running) 4144 if (running)
4145 p->sched_class->put_prev_task(rq, p); 4145 p->sched_class->put_prev_task(rq, p);
4146 4146
4147 p->sched_reset_on_fork = reset_on_fork; 4147 p->sched_reset_on_fork = reset_on_fork;
4148 4148
4149 oldprio = p->prio; 4149 oldprio = p->prio;
4150 prev_class = p->sched_class; 4150 prev_class = p->sched_class;
4151 __setscheduler(rq, p, policy, param->sched_priority); 4151 __setscheduler(rq, p, policy, param->sched_priority);
4152 4152
4153 if (running) 4153 if (running)
4154 p->sched_class->set_curr_task(rq); 4154 p->sched_class->set_curr_task(rq);
4155 if (on_rq) 4155 if (on_rq)
4156 enqueue_task(rq, p, 0); 4156 enqueue_task(rq, p, 0);
4157 4157
4158 check_class_changed(rq, p, prev_class, oldprio); 4158 check_class_changed(rq, p, prev_class, oldprio);
4159 task_rq_unlock(rq, p, &flags); 4159 task_rq_unlock(rq, p, &flags);
4160 4160
4161 rt_mutex_adjust_pi(p); 4161 rt_mutex_adjust_pi(p);
4162 4162
4163 return 0; 4163 return 0;
4164 } 4164 }
4165 4165
4166 /** 4166 /**
4167 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 4167 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4168 * @p: the task in question. 4168 * @p: the task in question.
4169 * @policy: new policy. 4169 * @policy: new policy.
4170 * @param: structure containing the new RT priority. 4170 * @param: structure containing the new RT priority.
4171 * 4171 *
4172 * NOTE that the task may be already dead. 4172 * NOTE that the task may be already dead.
4173 */ 4173 */
4174 int sched_setscheduler(struct task_struct *p, int policy, 4174 int sched_setscheduler(struct task_struct *p, int policy,
4175 const struct sched_param *param) 4175 const struct sched_param *param)
4176 { 4176 {
4177 return __sched_setscheduler(p, policy, param, true); 4177 return __sched_setscheduler(p, policy, param, true);
4178 } 4178 }
4179 EXPORT_SYMBOL_GPL(sched_setscheduler); 4179 EXPORT_SYMBOL_GPL(sched_setscheduler);
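A hedged kernel-side usage sketch for the in-kernel API exported above; "my_worker" is a hypothetical task_struct pointer and 50 an arbitrary RT priority:

    static void make_worker_fifo(struct task_struct *my_worker)
    {
            struct sched_param sp = { .sched_priority = 50 };

            if (sched_setscheduler(my_worker, SCHED_FIFO, &sp))
                    pr_warn("could not switch worker to SCHED_FIFO\n");
    }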
4180 4180
4181 /** 4181 /**
4182 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 4182 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
4183 * @p: the task in question. 4183 * @p: the task in question.
4184 * @policy: new policy. 4184 * @policy: new policy.
4185 * @param: structure containing the new RT priority. 4185 * @param: structure containing the new RT priority.
4186 * 4186 *
4187 * Just like sched_setscheduler, only don't bother checking if the 4187 * Just like sched_setscheduler, only don't bother checking if the
4188 * current context has permission. For example, this is needed in 4188 * current context has permission. For example, this is needed in
4189 * stop_machine(): we create temporary high priority worker threads, 4189 * stop_machine(): we create temporary high priority worker threads,
4190 * but our caller might not have that capability. 4190 * but our caller might not have that capability.
4191 */ 4191 */
4192 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 4192 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4193 const struct sched_param *param) 4193 const struct sched_param *param)
4194 { 4194 {
4195 return __sched_setscheduler(p, policy, param, false); 4195 return __sched_setscheduler(p, policy, param, false);
4196 } 4196 }
4197 4197
4198 static int 4198 static int
4199 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 4199 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4200 { 4200 {
4201 struct sched_param lparam; 4201 struct sched_param lparam;
4202 struct task_struct *p; 4202 struct task_struct *p;
4203 int retval; 4203 int retval;
4204 4204
4205 if (!param || pid < 0) 4205 if (!param || pid < 0)
4206 return -EINVAL; 4206 return -EINVAL;
4207 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 4207 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4208 return -EFAULT; 4208 return -EFAULT;
4209 4209
4210 rcu_read_lock(); 4210 rcu_read_lock();
4211 retval = -ESRCH; 4211 retval = -ESRCH;
4212 p = find_process_by_pid(pid); 4212 p = find_process_by_pid(pid);
4213 if (p != NULL) 4213 if (p != NULL)
4214 retval = sched_setscheduler(p, policy, &lparam); 4214 retval = sched_setscheduler(p, policy, &lparam);
4215 rcu_read_unlock(); 4215 rcu_read_unlock();
4216 4216
4217 return retval; 4217 return retval;
4218 } 4218 }
4219 4219
4220 /** 4220 /**
4221 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 4221 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4222 * @pid: the pid in question. 4222 * @pid: the pid in question.
4223 * @policy: new policy. 4223 * @policy: new policy.
4224 * @param: structure containing the new RT priority. 4224 * @param: structure containing the new RT priority.
4225 */ 4225 */
4226 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 4226 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
4227 struct sched_param __user *, param) 4227 struct sched_param __user *, param)
4228 { 4228 {
4229 /* negative values for policy are not valid */ 4229 /* negative values for policy are not valid */
4230 if (policy < 0) 4230 if (policy < 0)
4231 return -EINVAL; 4231 return -EINVAL;
4232 4232
4233 return do_sched_setscheduler(pid, policy, param); 4233 return do_sched_setscheduler(pid, policy, param);
4234 } 4234 }
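The user-space counterpart, for reference (illustrative; without CAP_SYS_NICE this typically fails with EPERM, as enforced in __sched_setscheduler() above):

    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            struct sched_param sp = { .sched_priority = 10 };

            if (sched_setscheduler(0, SCHED_RR, &sp) == -1)
                    perror("sched_setscheduler");
            else
                    puts("now SCHED_RR at priority 10");
            return 0;
    }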
4235 4235
4236 /** 4236 /**
4237 * sys_sched_setparam - set/change the RT priority of a thread 4237 * sys_sched_setparam - set/change the RT priority of a thread
4238 * @pid: the pid in question. 4238 * @pid: the pid in question.
4239 * @param: structure containing the new RT priority. 4239 * @param: structure containing the new RT priority.
4240 */ 4240 */
4241 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 4241 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4242 { 4242 {
4243 return do_sched_setscheduler(pid, -1, param); 4243 return do_sched_setscheduler(pid, -1, param);
4244 } 4244 }
4245 4245
4246 /** 4246 /**
4247 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 4247 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4248 * @pid: the pid in question. 4248 * @pid: the pid in question.
4249 */ 4249 */
4250 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 4250 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4251 { 4251 {
4252 struct task_struct *p; 4252 struct task_struct *p;
4253 int retval; 4253 int retval;
4254 4254
4255 if (pid < 0) 4255 if (pid < 0)
4256 return -EINVAL; 4256 return -EINVAL;
4257 4257
4258 retval = -ESRCH; 4258 retval = -ESRCH;
4259 rcu_read_lock(); 4259 rcu_read_lock();
4260 p = find_process_by_pid(pid); 4260 p = find_process_by_pid(pid);
4261 if (p) { 4261 if (p) {
4262 retval = security_task_getscheduler(p); 4262 retval = security_task_getscheduler(p);
4263 if (!retval) 4263 if (!retval)
4264 retval = p->policy 4264 retval = p->policy
4265 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 4265 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
4266 } 4266 }
4267 rcu_read_unlock(); 4267 rcu_read_unlock();
4268 return retval; 4268 return retval;
4269 } 4269 }
4270 4270
4271 /** 4271 /**
4272 * sys_sched_getparam - get the RT priority of a thread 4272 * sys_sched_getparam - get the RT priority of a thread
4273 * @pid: the pid in question. 4273 * @pid: the pid in question.
4274 * @param: structure containing the RT priority. 4274 * @param: structure containing the RT priority.
4275 */ 4275 */
4276 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 4276 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4277 { 4277 {
4278 struct sched_param lp; 4278 struct sched_param lp;
4279 struct task_struct *p; 4279 struct task_struct *p;
4280 int retval; 4280 int retval;
4281 4281
4282 if (!param || pid < 0) 4282 if (!param || pid < 0)
4283 return -EINVAL; 4283 return -EINVAL;
4284 4284
4285 rcu_read_lock(); 4285 rcu_read_lock();
4286 p = find_process_by_pid(pid); 4286 p = find_process_by_pid(pid);
4287 retval = -ESRCH; 4287 retval = -ESRCH;
4288 if (!p) 4288 if (!p)
4289 goto out_unlock; 4289 goto out_unlock;
4290 4290
4291 retval = security_task_getscheduler(p); 4291 retval = security_task_getscheduler(p);
4292 if (retval) 4292 if (retval)
4293 goto out_unlock; 4293 goto out_unlock;
4294 4294
4295 lp.sched_priority = p->rt_priority; 4295 lp.sched_priority = p->rt_priority;
4296 rcu_read_unlock(); 4296 rcu_read_unlock();
4297 4297
4298 /* 4298 /*
4299 * This one might sleep, we cannot do it with a spinlock held ... 4299 * This one might sleep, we cannot do it with a spinlock held ...
4300 */ 4300 */
4301 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 4301 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4302 4302
4303 return retval; 4303 return retval;
4304 4304
4305 out_unlock: 4305 out_unlock:
4306 rcu_read_unlock(); 4306 rcu_read_unlock();
4307 return retval; 4307 return retval;
4308 } 4308 }
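And the matching read-back from user space (illustrative):

    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            struct sched_param sp;
            int policy = sched_getscheduler(0);

            if (policy == -1 || sched_getparam(0, &sp) == -1) {
                    perror("sched_get*");
                    return 1;
            }
            printf("policy=%d rt_priority=%d\n", policy, sp.sched_priority);
            return 0;
    }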
4309 4309
4310 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 4310 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4311 { 4311 {
4312 cpumask_var_t cpus_allowed, new_mask; 4312 cpumask_var_t cpus_allowed, new_mask;
4313 struct task_struct *p; 4313 struct task_struct *p;
4314 int retval; 4314 int retval;
4315 4315
4316 get_online_cpus(); 4316 get_online_cpus();
4317 rcu_read_lock(); 4317 rcu_read_lock();
4318 4318
4319 p = find_process_by_pid(pid); 4319 p = find_process_by_pid(pid);
4320 if (!p) { 4320 if (!p) {
4321 rcu_read_unlock(); 4321 rcu_read_unlock();
4322 put_online_cpus(); 4322 put_online_cpus();
4323 return -ESRCH; 4323 return -ESRCH;
4324 } 4324 }
4325 4325
4326 /* Prevent p going away */ 4326 /* Prevent p going away */
4327 get_task_struct(p); 4327 get_task_struct(p);
4328 rcu_read_unlock(); 4328 rcu_read_unlock();
4329 4329
4330 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 4330 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4331 retval = -ENOMEM; 4331 retval = -ENOMEM;
4332 goto out_put_task; 4332 goto out_put_task;
4333 } 4333 }
4334 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 4334 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4335 retval = -ENOMEM; 4335 retval = -ENOMEM;
4336 goto out_free_cpus_allowed; 4336 goto out_free_cpus_allowed;
4337 } 4337 }
4338 retval = -EPERM; 4338 retval = -EPERM;
4339 if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE)) 4339 if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
4340 goto out_unlock; 4340 goto out_unlock;
4341 4341
4342 retval = security_task_setscheduler(p); 4342 retval = security_task_setscheduler(p);
4343 if (retval) 4343 if (retval)
4344 goto out_unlock; 4344 goto out_unlock;
4345 4345
4346 cpuset_cpus_allowed(p, cpus_allowed); 4346 cpuset_cpus_allowed(p, cpus_allowed);
4347 cpumask_and(new_mask, in_mask, cpus_allowed); 4347 cpumask_and(new_mask, in_mask, cpus_allowed);
4348 again: 4348 again:
4349 retval = set_cpus_allowed_ptr(p, new_mask); 4349 retval = set_cpus_allowed_ptr(p, new_mask);
4350 4350
4351 if (!retval) { 4351 if (!retval) {
4352 cpuset_cpus_allowed(p, cpus_allowed); 4352 cpuset_cpus_allowed(p, cpus_allowed);
4353 if (!cpumask_subset(new_mask, cpus_allowed)) { 4353 if (!cpumask_subset(new_mask, cpus_allowed)) {
4354 /* 4354 /*
4355 * We must have raced with a concurrent cpuset 4355 * We must have raced with a concurrent cpuset
4356 * update. Just reset the cpus_allowed to the 4356 * update. Just reset the cpus_allowed to the
4357 * cpuset's cpus_allowed 4357 * cpuset's cpus_allowed
4358 */ 4358 */
4359 cpumask_copy(new_mask, cpus_allowed); 4359 cpumask_copy(new_mask, cpus_allowed);
4360 goto again; 4360 goto again;
4361 } 4361 }
4362 } 4362 }
4363 out_unlock: 4363 out_unlock:
4364 free_cpumask_var(new_mask); 4364 free_cpumask_var(new_mask);
4365 out_free_cpus_allowed: 4365 out_free_cpus_allowed:
4366 free_cpumask_var(cpus_allowed); 4366 free_cpumask_var(cpus_allowed);
4367 out_put_task: 4367 out_put_task:
4368 put_task_struct(p); 4368 put_task_struct(p);
4369 put_online_cpus(); 4369 put_online_cpus();
4370 return retval; 4370 return retval;
4371 } 4371 }
4372 4372
4373 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 4373 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4374 struct cpumask *new_mask) 4374 struct cpumask *new_mask)
4375 { 4375 {
4376 if (len < cpumask_size()) 4376 if (len < cpumask_size())
4377 cpumask_clear(new_mask); 4377 cpumask_clear(new_mask);
4378 else if (len > cpumask_size()) 4378 else if (len > cpumask_size())
4379 len = cpumask_size(); 4379 len = cpumask_size();
4380 4380
4381 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 4381 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4382 } 4382 }
4383 4383
4384 /** 4384 /**
4385 * sys_sched_setaffinity - set the cpu affinity of a process 4385 * sys_sched_setaffinity - set the cpu affinity of a process
4386 * @pid: pid of the process 4386 * @pid: pid of the process
4387 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4387 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4388 * @user_mask_ptr: user-space pointer to the new cpu mask 4388 * @user_mask_ptr: user-space pointer to the new cpu mask
4389 */ 4389 */
4390 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 4390 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4391 unsigned long __user *, user_mask_ptr) 4391 unsigned long __user *, user_mask_ptr)
4392 { 4392 {
4393 cpumask_var_t new_mask; 4393 cpumask_var_t new_mask;
4394 int retval; 4394 int retval;
4395 4395
4396 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 4396 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4397 return -ENOMEM; 4397 return -ENOMEM;
4398 4398
4399 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 4399 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4400 if (retval == 0) 4400 if (retval == 0)
4401 retval = sched_setaffinity(pid, new_mask); 4401 retval = sched_setaffinity(pid, new_mask);
4402 free_cpumask_var(new_mask); 4402 free_cpumask_var(new_mask);
4403 return retval; 4403 return retval;
4404 } 4404 }
4405 4405
4406 long sched_getaffinity(pid_t pid, struct cpumask *mask) 4406 long sched_getaffinity(pid_t pid, struct cpumask *mask)
4407 { 4407 {
4408 struct task_struct *p; 4408 struct task_struct *p;
4409 unsigned long flags; 4409 unsigned long flags;
4410 int retval; 4410 int retval;
4411 4411
4412 get_online_cpus(); 4412 get_online_cpus();
4413 rcu_read_lock(); 4413 rcu_read_lock();
4414 4414
4415 retval = -ESRCH; 4415 retval = -ESRCH;
4416 p = find_process_by_pid(pid); 4416 p = find_process_by_pid(pid);
4417 if (!p) 4417 if (!p)
4418 goto out_unlock; 4418 goto out_unlock;
4419 4419
4420 retval = security_task_getscheduler(p); 4420 retval = security_task_getscheduler(p);
4421 if (retval) 4421 if (retval)
4422 goto out_unlock; 4422 goto out_unlock;
4423 4423
4424 raw_spin_lock_irqsave(&p->pi_lock, flags); 4424 raw_spin_lock_irqsave(&p->pi_lock, flags);
4425 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 4425 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
4426 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4426 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4427 4427
4428 out_unlock: 4428 out_unlock:
4429 rcu_read_unlock(); 4429 rcu_read_unlock();
4430 put_online_cpus(); 4430 put_online_cpus();
4431 4431
4432 return retval; 4432 return retval;
4433 } 4433 }
4434 4434
4435 /** 4435 /**
4436 * sys_sched_getaffinity - get the cpu affinity of a process 4436 * sys_sched_getaffinity - get the cpu affinity of a process
4437 * @pid: pid of the process 4437 * @pid: pid of the process
4438 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4438 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4439 * @user_mask_ptr: user-space pointer to hold the current cpu mask 4439 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4440 */ 4440 */
4441 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 4441 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4442 unsigned long __user *, user_mask_ptr) 4442 unsigned long __user *, user_mask_ptr)
4443 { 4443 {
4444 int ret; 4444 int ret;
4445 cpumask_var_t mask; 4445 cpumask_var_t mask;
4446 4446
4447 if ((len * BITS_PER_BYTE) < nr_cpu_ids) 4447 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4448 return -EINVAL; 4448 return -EINVAL;
4449 if (len & (sizeof(unsigned long)-1)) 4449 if (len & (sizeof(unsigned long)-1))
4450 return -EINVAL; 4450 return -EINVAL;
4451 4451
4452 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 4452 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4453 return -ENOMEM; 4453 return -ENOMEM;
4454 4454
4455 ret = sched_getaffinity(pid, mask); 4455 ret = sched_getaffinity(pid, mask);
4456 if (ret == 0) { 4456 if (ret == 0) {
4457 size_t retlen = min_t(size_t, len, cpumask_size()); 4457 size_t retlen = min_t(size_t, len, cpumask_size());
4458 4458
4459 if (copy_to_user(user_mask_ptr, mask, retlen)) 4459 if (copy_to_user(user_mask_ptr, mask, retlen))
4460 ret = -EFAULT; 4460 ret = -EFAULT;
4461 else 4461 else
4462 ret = retlen; 4462 ret = retlen;
4463 } 4463 }
4464 free_cpumask_var(mask); 4464 free_cpumask_var(mask);
4465 4465
4466 return ret; 4466 return ret;
4467 } 4467 }
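For completeness, a minimal user-space affinity round-trip that exercises both syscalls above (illustrative; _GNU_SOURCE is needed for the CPU_* macros):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            cpu_set_t set;

            CPU_ZERO(&set);
            CPU_SET(0, &set);                       /* pin ourselves to CPU 0 */
            if (sched_setaffinity(0, sizeof(set), &set) == -1)
                    perror("sched_setaffinity");

            CPU_ZERO(&set);
            if (sched_getaffinity(0, sizeof(set), &set) == -1)
                    perror("sched_getaffinity");
            else
                    printf("allowed on %d CPU(s)\n", CPU_COUNT(&set));
            return 0;
    }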
4468 4468
4469 /** 4469 /**
4470 * sys_sched_yield - yield the current processor to other threads. 4470 * sys_sched_yield - yield the current processor to other threads.
4471 * 4471 *
4472 * This function yields the current CPU to other tasks. If there are no 4472 * This function yields the current CPU to other tasks. If there are no
4473 * other threads running on this CPU then this function will return. 4473 * other threads running on this CPU then this function will return.
4474 */ 4474 */
4475 SYSCALL_DEFINE0(sched_yield) 4475 SYSCALL_DEFINE0(sched_yield)
4476 { 4476 {
4477 struct rq *rq = this_rq_lock(); 4477 struct rq *rq = this_rq_lock();
4478 4478
4479 schedstat_inc(rq, yld_count); 4479 schedstat_inc(rq, yld_count);
4480 current->sched_class->yield_task(rq); 4480 current->sched_class->yield_task(rq);
4481 4481
4482 /* 4482 /*
4483 * Since we are going to call schedule() anyway, there's 4483 * Since we are going to call schedule() anyway, there's
4484 * no need to preempt or enable interrupts: 4484 * no need to preempt or enable interrupts:
4485 */ 4485 */
4486 __release(rq->lock); 4486 __release(rq->lock);
4487 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 4487 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4488 do_raw_spin_unlock(&rq->lock); 4488 do_raw_spin_unlock(&rq->lock);
4489 preempt_enable_no_resched(); 4489 sched_preempt_enable_no_resched();
4490 4490
4491 schedule(); 4491 schedule();
4492 4492
4493 return 0; 4493 return 0;
4494 } 4494 }
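The call site above is the one this hunk actually changes: the open-coded unlock sequence now ends in sched_preempt_enable_no_resched(). A hedged sketch of what such a helper boils down to on a mainline (non-rt) kernel, assuming the usual preempt-count based implementation; the _sketch suffix marks it as illustrative, not the real definition:

    #define sched_preempt_enable_no_resched_sketch()                    \
    do {                                                                \
            barrier();                                                  \
            dec_preempt_count();    /* no resched check: schedule() follows */ \
    } while (0)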
4495 4495
4496 static inline int should_resched(void) 4496 static inline int should_resched(void)
4497 { 4497 {
4498 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); 4498 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
4499 } 4499 }
4500 4500
4501 static void __cond_resched(void) 4501 static void __cond_resched(void)
4502 { 4502 {
4503 add_preempt_count(PREEMPT_ACTIVE); 4503 add_preempt_count(PREEMPT_ACTIVE);
4504 __schedule(); 4504 __schedule();
4505 sub_preempt_count(PREEMPT_ACTIVE); 4505 sub_preempt_count(PREEMPT_ACTIVE);
4506 } 4506 }
4507 4507
4508 int __sched _cond_resched(void) 4508 int __sched _cond_resched(void)
4509 { 4509 {
4510 if (should_resched()) { 4510 if (should_resched()) {
4511 __cond_resched(); 4511 __cond_resched();
4512 return 1; 4512 return 1;
4513 } 4513 }
4514 return 0; 4514 return 0;
4515 } 4515 }
4516 EXPORT_SYMBOL(_cond_resched); 4516 EXPORT_SYMBOL(_cond_resched);
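Typical use of the cond_resched() wrapper that funnels into _cond_resched() above (a sketch; struct item and handle_one() are hypothetical):

    struct item { struct list_head node; /* ...payload... */ };        /* hypothetical */

    static void process_all(struct list_head *items)
    {
            struct item *it;

            list_for_each_entry(it, items, node) {
                    handle_one(it);         /* hypothetical per-item work */
                    cond_resched();         /* yield if a reschedule is due */
            }
    }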
4517 4517
4518 /* 4518 /*
4519 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 4519 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
4520 * call schedule, and on return reacquire the lock. 4520 * call schedule, and on return reacquire the lock.
4521 * 4521 *
4522 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 4522 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4523 * operations here to prevent schedule() from being called twice (once via 4523 * operations here to prevent schedule() from being called twice (once via
4524 * spin_unlock(), once by hand). 4524 * spin_unlock(), once by hand).
4525 */ 4525 */
4526 int __cond_resched_lock(spinlock_t *lock) 4526 int __cond_resched_lock(spinlock_t *lock)
4527 { 4527 {
4528 int resched = should_resched(); 4528 int resched = should_resched();
4529 int ret = 0; 4529 int ret = 0;
4530 4530
4531 lockdep_assert_held(lock); 4531 lockdep_assert_held(lock);
4532 4532
4533 if (spin_needbreak(lock) || resched) { 4533 if (spin_needbreak(lock) || resched) {
4534 spin_unlock(lock); 4534 spin_unlock(lock);
4535 if (resched) 4535 if (resched)
4536 __cond_resched(); 4536 __cond_resched();
4537 else 4537 else
4538 cpu_relax(); 4538 cpu_relax();
4539 ret = 1; 4539 ret = 1;
4540 spin_lock(lock); 4540 spin_lock(lock);
4541 } 4541 }
4542 return ret; 4542 return ret;
4543 } 4543 }
4544 EXPORT_SYMBOL(__cond_resched_lock); 4544 EXPORT_SYMBOL(__cond_resched_lock);
4545 4545
4546 int __sched __cond_resched_softirq(void) 4546 int __sched __cond_resched_softirq(void)
4547 { 4547 {
4548 BUG_ON(!in_softirq()); 4548 BUG_ON(!in_softirq());
4549 4549
4550 if (should_resched()) { 4550 if (should_resched()) {
4551 local_bh_enable(); 4551 local_bh_enable();
4552 __cond_resched(); 4552 __cond_resched();
4553 local_bh_disable(); 4553 local_bh_disable();
4554 return 1; 4554 return 1;
4555 } 4555 }
4556 return 0; 4556 return 0;
4557 } 4557 }
4558 EXPORT_SYMBOL(__cond_resched_softirq); 4558 EXPORT_SYMBOL(__cond_resched_softirq);
4559 4559
4560 /** 4560 /**
4561 * yield - yield the current processor to other threads. 4561 * yield - yield the current processor to other threads.
4562 * 4562 *
4563 * This is a shortcut for kernel-space yielding - it marks the 4563 * This is a shortcut for kernel-space yielding - it marks the
4564 * thread runnable and calls sys_sched_yield(). 4564 * thread runnable and calls sys_sched_yield().
4565 */ 4565 */
4566 void __sched yield(void) 4566 void __sched yield(void)
4567 { 4567 {
4568 set_current_state(TASK_RUNNING); 4568 set_current_state(TASK_RUNNING);
4569 sys_sched_yield(); 4569 sys_sched_yield();
4570 } 4570 }
4571 EXPORT_SYMBOL(yield); 4571 EXPORT_SYMBOL(yield);
4572 4572
4573 /** 4573 /**
4574 * yield_to - yield the current processor to another thread in 4574 * yield_to - yield the current processor to another thread in
4575 * your thread group, or accelerate that thread toward the 4575 * your thread group, or accelerate that thread toward the
4576 * processor it's on. 4576 * processor it's on.
4577 * @p: target task 4577 * @p: target task
4578 * @preempt: whether task preemption is allowed or not 4578 * @preempt: whether task preemption is allowed or not
4579 * 4579 *
4580 * It's the caller's job to ensure that the target task struct 4580 * It's the caller's job to ensure that the target task struct
4581 * can't go away on us before we can do any checks. 4581 * can't go away on us before we can do any checks.
4582 * 4582 *
4583 * Returns true if we indeed boosted the target task. 4583 * Returns true if we indeed boosted the target task.
4584 */ 4584 */
4585 bool __sched yield_to(struct task_struct *p, bool preempt) 4585 bool __sched yield_to(struct task_struct *p, bool preempt)
4586 { 4586 {
4587 struct task_struct *curr = current; 4587 struct task_struct *curr = current;
4588 struct rq *rq, *p_rq; 4588 struct rq *rq, *p_rq;
4589 unsigned long flags; 4589 unsigned long flags;
4590 bool yielded = 0; 4590 bool yielded = 0;
4591 4591
4592 local_irq_save(flags); 4592 local_irq_save(flags);
4593 rq = this_rq(); 4593 rq = this_rq();
4594 4594
4595 again: 4595 again:
4596 p_rq = task_rq(p); 4596 p_rq = task_rq(p);
4597 double_rq_lock(rq, p_rq); 4597 double_rq_lock(rq, p_rq);
4598 while (task_rq(p) != p_rq) { 4598 while (task_rq(p) != p_rq) {
4599 double_rq_unlock(rq, p_rq); 4599 double_rq_unlock(rq, p_rq);
4600 goto again; 4600 goto again;
4601 } 4601 }
4602 4602
4603 if (!curr->sched_class->yield_to_task) 4603 if (!curr->sched_class->yield_to_task)
4604 goto out; 4604 goto out;
4605 4605
4606 if (curr->sched_class != p->sched_class) 4606 if (curr->sched_class != p->sched_class)
4607 goto out; 4607 goto out;
4608 4608
4609 if (task_running(p_rq, p) || p->state) 4609 if (task_running(p_rq, p) || p->state)
4610 goto out; 4610 goto out;
4611 4611
4612 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 4612 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4613 if (yielded) { 4613 if (yielded) {
4614 schedstat_inc(rq, yld_count); 4614 schedstat_inc(rq, yld_count);
4615 /* 4615 /*
4616 * Make p's CPU reschedule; pick_next_entity takes care of 4616 * Make p's CPU reschedule; pick_next_entity takes care of
4617 * fairness. 4617 * fairness.
4618 */ 4618 */
4619 if (preempt && rq != p_rq) 4619 if (preempt && rq != p_rq)
4620 resched_task(p_rq->curr); 4620 resched_task(p_rq->curr);
4621 } else { 4621 } else {
4622 /* 4622 /*
4623 * We might have set it in task_yield_fair(), but are 4623 * We might have set it in task_yield_fair(), but are
4624 * not going to schedule(), so don't want to skip 4624 * not going to schedule(), so don't want to skip
4625 * the next update. 4625 * the next update.
4626 */ 4626 */
4627 rq->skip_clock_update = 0; 4627 rq->skip_clock_update = 0;
4628 } 4628 }
4629 4629
4630 out: 4630 out:
4631 double_rq_unlock(rq, p_rq); 4631 double_rq_unlock(rq, p_rq);
4632 local_irq_restore(flags); 4632 local_irq_restore(flags);
4633 4633
4634 if (yielded) 4634 if (yielded)
4635 schedule(); 4635 schedule();
4636 4636
4637 return yielded; 4637 return yielded;
4638 } 4638 }
4639 EXPORT_SYMBOL_GPL(yield_to); 4639 EXPORT_SYMBOL_GPL(yield_to);
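A hedged caller sketch for yield_to(); virtualization code uses it to push the CPU toward a vCPU believed to hold a lock ("target" is hypothetical here):

    static void boost_probable_lock_holder(struct task_struct *target)
    {
            if (!yield_to(target, true))
                    cpu_relax();    /* target was not boosted; spin politely */
    }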
4640 4640
4641 /* 4641 /*
4642 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 4642 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4643 * that process accounting knows that this is a task in IO wait state. 4643 * that process accounting knows that this is a task in IO wait state.
4644 */ 4644 */
4645 void __sched io_schedule(void) 4645 void __sched io_schedule(void)
4646 { 4646 {
4647 struct rq *rq = raw_rq(); 4647 struct rq *rq = raw_rq();
4648 4648
4649 delayacct_blkio_start(); 4649 delayacct_blkio_start();
4650 atomic_inc(&rq->nr_iowait); 4650 atomic_inc(&rq->nr_iowait);
4651 blk_flush_plug(current); 4651 blk_flush_plug(current);
4652 current->in_iowait = 1; 4652 current->in_iowait = 1;
4653 schedule(); 4653 schedule();
4654 current->in_iowait = 0; 4654 current->in_iowait = 0;
4655 atomic_dec(&rq->nr_iowait); 4655 atomic_dec(&rq->nr_iowait);
4656 delayacct_blkio_end(); 4656 delayacct_blkio_end();
4657 } 4657 }
4658 EXPORT_SYMBOL(io_schedule); 4658 EXPORT_SYMBOL(io_schedule);
4659 4659
4660 long __sched io_schedule_timeout(long timeout) 4660 long __sched io_schedule_timeout(long timeout)
4661 { 4661 {
4662 struct rq *rq = raw_rq(); 4662 struct rq *rq = raw_rq();
4663 long ret; 4663 long ret;
4664 4664
4665 delayacct_blkio_start(); 4665 delayacct_blkio_start();
4666 atomic_inc(&rq->nr_iowait); 4666 atomic_inc(&rq->nr_iowait);
4667 blk_flush_plug(current); 4667 blk_flush_plug(current);
4668 current->in_iowait = 1; 4668 current->in_iowait = 1;
4669 ret = schedule_timeout(timeout); 4669 ret = schedule_timeout(timeout);
4670 current->in_iowait = 0; 4670 current->in_iowait = 0;
4671 atomic_dec(&rq->nr_iowait); 4671 atomic_dec(&rq->nr_iowait);
4672 delayacct_blkio_end(); 4672 delayacct_blkio_end();
4673 return ret; 4673 return ret;
4674 } 4674 }
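Hedged usage sketch: both io_schedule() and io_schedule_timeout() expect the caller to have set its task state first, exactly like schedule()/schedule_timeout():

    static long wait_for_disk(void)
    {
            set_current_state(TASK_UNINTERRUPTIBLE);
            return io_schedule_timeout(HZ);         /* up to 1s, accounted as iowait */
    }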
4675 4675
4676 /** 4676 /**
4677 * sys_sched_get_priority_max - return maximum RT priority. 4677 * sys_sched_get_priority_max - return maximum RT priority.
4678 * @policy: scheduling class. 4678 * @policy: scheduling class.
4679 * 4679 *
4680 * this syscall returns the maximum rt_priority that can be used 4680 * this syscall returns the maximum rt_priority that can be used
4681 * by a given scheduling class. 4681 * by a given scheduling class.
4682 */ 4682 */
4683 SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 4683 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4684 { 4684 {
4685 int ret = -EINVAL; 4685 int ret = -EINVAL;
4686 4686
4687 switch (policy) { 4687 switch (policy) {
4688 case SCHED_FIFO: 4688 case SCHED_FIFO:
4689 case SCHED_RR: 4689 case SCHED_RR:
4690 ret = MAX_USER_RT_PRIO-1; 4690 ret = MAX_USER_RT_PRIO-1;
4691 break; 4691 break;
4692 case SCHED_NORMAL: 4692 case SCHED_NORMAL:
4693 case SCHED_BATCH: 4693 case SCHED_BATCH:
4694 case SCHED_IDLE: 4694 case SCHED_IDLE:
4695 ret = 0; 4695 ret = 0;
4696 break; 4696 break;
4697 } 4697 }
4698 return ret; 4698 return ret;
4699 } 4699 }
4700 4700
4701 /** 4701 /**
4702 * sys_sched_get_priority_min - return minimum RT priority. 4702 * sys_sched_get_priority_min - return minimum RT priority.
4703 * @policy: scheduling class. 4703 * @policy: scheduling class.
4704 * 4704 *
4705 * this syscall returns the minimum rt_priority that can be used 4705 * this syscall returns the minimum rt_priority that can be used
4706 * by a given scheduling class. 4706 * by a given scheduling class.
4707 */ 4707 */
4708 SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 4708 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4709 { 4709 {
4710 int ret = -EINVAL; 4710 int ret = -EINVAL;
4711 4711
4712 switch (policy) { 4712 switch (policy) {
4713 case SCHED_FIFO: 4713 case SCHED_FIFO:
4714 case SCHED_RR: 4714 case SCHED_RR:
4715 ret = 1; 4715 ret = 1;
4716 break; 4716 break;
4717 case SCHED_NORMAL: 4717 case SCHED_NORMAL:
4718 case SCHED_BATCH: 4718 case SCHED_BATCH:
4719 case SCHED_IDLE: 4719 case SCHED_IDLE:
4720 ret = 0; 4720 ret = 0;
4721 } 4721 }
4722 return ret; 4722 return ret;
4723 } 4723 }
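From user space the two syscalls above are usually queried together (illustrative):

    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            printf("SCHED_FIFO priority range: %d..%d\n",
                   sched_get_priority_min(SCHED_FIFO),
                   sched_get_priority_max(SCHED_FIFO));  /* typically 1..99 */
            return 0;
    }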
4724 4724
4725 /** 4725 /**
4726 * sys_sched_rr_get_interval - return the default timeslice of a process. 4726 * sys_sched_rr_get_interval - return the default timeslice of a process.
4727 * @pid: pid of the process. 4727 * @pid: pid of the process.
4728 * @interval: userspace pointer to the timeslice value. 4728 * @interval: userspace pointer to the timeslice value.
4729 * 4729 *
4730 * this syscall writes the default timeslice value of a given process 4730 * this syscall writes the default timeslice value of a given process
4731 * into the user-space timespec buffer. A value of '0' means infinity. 4731 * into the user-space timespec buffer. A value of '0' means infinity.
4732 */ 4732 */
4733 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 4733 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
4734 struct timespec __user *, interval) 4734 struct timespec __user *, interval)
4735 { 4735 {
4736 struct task_struct *p; 4736 struct task_struct *p;
4737 unsigned int time_slice; 4737 unsigned int time_slice;
4738 unsigned long flags; 4738 unsigned long flags;
4739 struct rq *rq; 4739 struct rq *rq;
4740 int retval; 4740 int retval;
4741 struct timespec t; 4741 struct timespec t;
4742 4742
4743 if (pid < 0) 4743 if (pid < 0)
4744 return -EINVAL; 4744 return -EINVAL;
4745 4745
4746 retval = -ESRCH; 4746 retval = -ESRCH;
4747 rcu_read_lock(); 4747 rcu_read_lock();
4748 p = find_process_by_pid(pid); 4748 p = find_process_by_pid(pid);
4749 if (!p) 4749 if (!p)
4750 goto out_unlock; 4750 goto out_unlock;
4751 4751
4752 retval = security_task_getscheduler(p); 4752 retval = security_task_getscheduler(p);
4753 if (retval) 4753 if (retval)
4754 goto out_unlock; 4754 goto out_unlock;
4755 4755
4756 rq = task_rq_lock(p, &flags); 4756 rq = task_rq_lock(p, &flags);
4757 time_slice = p->sched_class->get_rr_interval(rq, p); 4757 time_slice = p->sched_class->get_rr_interval(rq, p);
4758 task_rq_unlock(rq, p, &flags); 4758 task_rq_unlock(rq, p, &flags);
4759 4759
4760 rcu_read_unlock(); 4760 rcu_read_unlock();
4761 jiffies_to_timespec(time_slice, &t); 4761 jiffies_to_timespec(time_slice, &t);
4762 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4762 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4763 return retval; 4763 return retval;
4764 4764
4765 out_unlock: 4765 out_unlock:
4766 rcu_read_unlock(); 4766 rcu_read_unlock();
4767 return retval; 4767 return retval;
4768 } 4768 }
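Illustrative user-space counterpart:

    #include <sched.h>
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            struct timespec ts;

            if (sched_rr_get_interval(0, &ts) == -1)
                    perror("sched_rr_get_interval");
            else
                    printf("timeslice: %ld.%09ld s\n",
                           (long)ts.tv_sec, ts.tv_nsec);
            return 0;
    }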
4769 4769
4770 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 4770 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
4771 4771
4772 void sched_show_task(struct task_struct *p) 4772 void sched_show_task(struct task_struct *p)
4773 { 4773 {
4774 unsigned long free = 0; 4774 unsigned long free = 0;
4775 unsigned state; 4775 unsigned state;
4776 4776
4777 state = p->state ? __ffs(p->state) + 1 : 0; 4777 state = p->state ? __ffs(p->state) + 1 : 0;
4778 printk(KERN_INFO "%-15.15s %c", p->comm, 4778 printk(KERN_INFO "%-15.15s %c", p->comm,
4779 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4779 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4780 #if BITS_PER_LONG == 32 4780 #if BITS_PER_LONG == 32
4781 if (state == TASK_RUNNING) 4781 if (state == TASK_RUNNING)
4782 printk(KERN_CONT " running "); 4782 printk(KERN_CONT " running ");
4783 else 4783 else
4784 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 4784 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
4785 #else 4785 #else
4786 if (state == TASK_RUNNING) 4786 if (state == TASK_RUNNING)
4787 printk(KERN_CONT " running task "); 4787 printk(KERN_CONT " running task ");
4788 else 4788 else
4789 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 4789 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
4790 #endif 4790 #endif
4791 #ifdef CONFIG_DEBUG_STACK_USAGE 4791 #ifdef CONFIG_DEBUG_STACK_USAGE
4792 free = stack_not_used(p); 4792 free = stack_not_used(p);
4793 #endif 4793 #endif
4794 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4794 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4795 task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), 4795 task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
4796 (unsigned long)task_thread_info(p)->flags); 4796 (unsigned long)task_thread_info(p)->flags);
4797 4797
4798 show_stack(p, NULL); 4798 show_stack(p, NULL);
4799 } 4799 }
4800 4800
4801 void show_state_filter(unsigned long state_filter) 4801 void show_state_filter(unsigned long state_filter)
4802 { 4802 {
4803 struct task_struct *g, *p; 4803 struct task_struct *g, *p;
4804 4804
4805 #if BITS_PER_LONG == 32 4805 #if BITS_PER_LONG == 32
4806 printk(KERN_INFO 4806 printk(KERN_INFO
4807 " task PC stack pid father\n"); 4807 " task PC stack pid father\n");
4808 #else 4808 #else
4809 printk(KERN_INFO 4809 printk(KERN_INFO
4810 " task PC stack pid father\n"); 4810 " task PC stack pid father\n");
4811 #endif 4811 #endif
4812 rcu_read_lock(); 4812 rcu_read_lock();
4813 do_each_thread(g, p) { 4813 do_each_thread(g, p) {
4814 /* 4814 /*
4815 * reset the NMI-timeout, listing all files on a slow 4815 * reset the NMI-timeout, listing all files on a slow
4816 * console might take a lot of time: 4816 * console might take a lot of time:
4817 */ 4817 */
4818 touch_nmi_watchdog(); 4818 touch_nmi_watchdog();
4819 if (!state_filter || (p->state & state_filter)) 4819 if (!state_filter || (p->state & state_filter))
4820 sched_show_task(p); 4820 sched_show_task(p);
4821 } while_each_thread(g, p); 4821 } while_each_thread(g, p);
4822 4822
4823 touch_all_softlockup_watchdogs(); 4823 touch_all_softlockup_watchdogs();
4824 4824
4825 #ifdef CONFIG_SCHED_DEBUG 4825 #ifdef CONFIG_SCHED_DEBUG
4826 sysrq_sched_debug_show(); 4826 sysrq_sched_debug_show();
4827 #endif 4827 #endif
4828 rcu_read_unlock(); 4828 rcu_read_unlock();
4829 /* 4829 /*
4830 * Only show locks if all tasks are dumped: 4830 * Only show locks if all tasks are dumped:
4831 */ 4831 */
4832 if (!state_filter) 4832 if (!state_filter)
4833 debug_show_all_locks(); 4833 debug_show_all_locks();
4834 } 4834 }
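Hedged note on the callers: the sysrq handlers are the usual entry points, dumping either every task or only the blocked ones:

    show_state_filter(0);                       /* all tasks */
    show_state_filter(TASK_UNINTERRUPTIBLE);    /* D-state tasks only */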
4835 4835
4836 void __cpuinit init_idle_bootup_task(struct task_struct *idle) 4836 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4837 { 4837 {
4838 idle->sched_class = &idle_sched_class; 4838 idle->sched_class = &idle_sched_class;
4839 } 4839 }
4840 4840
4841 /** 4841 /**
4842 * init_idle - set up an idle thread for a given CPU 4842 * init_idle - set up an idle thread for a given CPU
4843 * @idle: task in question 4843 * @idle: task in question
4844 * @cpu: cpu the idle task belongs to 4844 * @cpu: cpu the idle task belongs to
4845 * 4845 *
4846 * NOTE: this function does not set the idle thread's NEED_RESCHED 4846 * NOTE: this function does not set the idle thread's NEED_RESCHED
4847 * flag, to make booting more robust. 4847 * flag, to make booting more robust.
4848 */ 4848 */
4849 void __cpuinit init_idle(struct task_struct *idle, int cpu) 4849 void __cpuinit init_idle(struct task_struct *idle, int cpu)
4850 { 4850 {
4851 struct rq *rq = cpu_rq(cpu); 4851 struct rq *rq = cpu_rq(cpu);
4852 unsigned long flags; 4852 unsigned long flags;
4853 4853
4854 raw_spin_lock_irqsave(&rq->lock, flags); 4854 raw_spin_lock_irqsave(&rq->lock, flags);
4855 4855
4856 __sched_fork(idle); 4856 __sched_fork(idle);
4857 idle->state = TASK_RUNNING; 4857 idle->state = TASK_RUNNING;
4858 idle->se.exec_start = sched_clock(); 4858 idle->se.exec_start = sched_clock();
4859 4859
4860 do_set_cpus_allowed(idle, cpumask_of(cpu)); 4860 do_set_cpus_allowed(idle, cpumask_of(cpu));
4861 /* 4861 /*
4862 * We're having a chicken and egg problem, even though we are 4862 * We're having a chicken and egg problem, even though we are
4863 * holding rq->lock, the cpu isn't yet set to this cpu so the 4863 * holding rq->lock, the cpu isn't yet set to this cpu so the
4864 * lockdep check in task_group() will fail. 4864 * lockdep check in task_group() will fail.
4865 * 4865 *
4866 * Similar case to sched_fork(). / Alternatively we could 4866 * Similar case to sched_fork(). / Alternatively we could
4867 * use task_rq_lock() here and obtain the other rq->lock. 4867 * use task_rq_lock() here and obtain the other rq->lock.
4868 * 4868 *
4869 * Silence PROVE_RCU 4869 * Silence PROVE_RCU
4870 */ 4870 */
4871 rcu_read_lock(); 4871 rcu_read_lock();
4872 __set_task_cpu(idle, cpu); 4872 __set_task_cpu(idle, cpu);
4873 rcu_read_unlock(); 4873 rcu_read_unlock();
4874 4874
4875 rq->curr = rq->idle = idle; 4875 rq->curr = rq->idle = idle;
4876 #if defined(CONFIG_SMP) 4876 #if defined(CONFIG_SMP)
4877 idle->on_cpu = 1; 4877 idle->on_cpu = 1;
4878 #endif 4878 #endif
4879 raw_spin_unlock_irqrestore(&rq->lock, flags); 4879 raw_spin_unlock_irqrestore(&rq->lock, flags);
4880 4880
4881 /* Set the preempt count _outside_ the spinlocks! */ 4881 /* Set the preempt count _outside_ the spinlocks! */
4882 task_thread_info(idle)->preempt_count = 0; 4882 task_thread_info(idle)->preempt_count = 0;
4883 4883
4884 /* 4884 /*
4885 * The idle tasks have their own, simple scheduling class: 4885 * The idle tasks have their own, simple scheduling class:
4886 */ 4886 */
4887 idle->sched_class = &idle_sched_class; 4887 idle->sched_class = &idle_sched_class;
4888 ftrace_graph_init_idle_task(idle, cpu); 4888 ftrace_graph_init_idle_task(idle, cpu);
4889 #if defined(CONFIG_SMP) 4889 #if defined(CONFIG_SMP)
4890 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 4890 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4891 #endif 4891 #endif
4892 } 4892 }
4893 4893
4894 #ifdef CONFIG_SMP 4894 #ifdef CONFIG_SMP
4895 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4895 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4896 { 4896 {
4897 if (p->sched_class && p->sched_class->set_cpus_allowed) 4897 if (p->sched_class && p->sched_class->set_cpus_allowed)
4898 p->sched_class->set_cpus_allowed(p, new_mask); 4898 p->sched_class->set_cpus_allowed(p, new_mask);
4899 4899
4900 cpumask_copy(&p->cpus_allowed, new_mask); 4900 cpumask_copy(&p->cpus_allowed, new_mask);
4901 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 4901 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
4902 } 4902 }
4903 4903
4904 /* 4904 /*
4905 * This is how migration works: 4905 * This is how migration works:
4906 * 4906 *
4907 * 1) we invoke migration_cpu_stop() on the target CPU using 4907 * 1) we invoke migration_cpu_stop() on the target CPU using
4908 * stop_one_cpu(). 4908 * stop_one_cpu().
4909 * 2) stopper starts to run (implicitly forcing the migrated thread 4909 * 2) stopper starts to run (implicitly forcing the migrated thread
4910 * off the CPU) 4910 * off the CPU)
4911 * 3) it checks whether the migrated task is still in the wrong runqueue. 4911 * 3) it checks whether the migrated task is still in the wrong runqueue.
4912 * 4) if it's in the wrong runqueue then the migration thread removes 4912 * 4) if it's in the wrong runqueue then the migration thread removes
4913 * it and puts it into the right queue. 4913 * it and puts it into the right queue.
4914 * 5) stopper completes and stop_one_cpu() returns and the migration 4914 * 5) stopper completes and stop_one_cpu() returns and the migration
4915 * is done. 4915 * is done.
4916 */ 4916 */
4917 4917
4918 /* 4918 /*
4919 * Change a given task's CPU affinity. Migrate the thread to a 4919 * Change a given task's CPU affinity. Migrate the thread to a
4920 * proper CPU and schedule it away if the CPU it's executing on 4920 * proper CPU and schedule it away if the CPU it's executing on
4921 * is removed from the allowed bitmask. 4921 * is removed from the allowed bitmask.
4922 * 4922 *
4923 * NOTE: the caller must have a valid reference to the task, the 4923 * NOTE: the caller must have a valid reference to the task, the
4924 * task must not exit() & deallocate itself prematurely. The 4924 * task must not exit() & deallocate itself prematurely. The
4925 * call is not atomic; no spinlocks may be held. 4925 * call is not atomic; no spinlocks may be held.
4926 */ 4926 */
4927 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 4927 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4928 { 4928 {
4929 unsigned long flags; 4929 unsigned long flags;
4930 struct rq *rq; 4930 struct rq *rq;
4931 unsigned int dest_cpu; 4931 unsigned int dest_cpu;
4932 int ret = 0; 4932 int ret = 0;
4933 4933
4934 rq = task_rq_lock(p, &flags); 4934 rq = task_rq_lock(p, &flags);
4935 4935
4936 if (cpumask_equal(&p->cpus_allowed, new_mask)) 4936 if (cpumask_equal(&p->cpus_allowed, new_mask))
4937 goto out; 4937 goto out;
4938 4938
4939 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 4939 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
4940 ret = -EINVAL; 4940 ret = -EINVAL;
4941 goto out; 4941 goto out;
4942 } 4942 }
4943 4943
4944 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { 4944 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
4945 ret = -EINVAL; 4945 ret = -EINVAL;
4946 goto out; 4946 goto out;
4947 } 4947 }
4948 4948
4949 do_set_cpus_allowed(p, new_mask); 4949 do_set_cpus_allowed(p, new_mask);
4950 4950
4951 /* Can the task run on the task's current CPU? If so, we're done */ 4951 /* Can the task run on the task's current CPU? If so, we're done */
4952 if (cpumask_test_cpu(task_cpu(p), new_mask)) 4952 if (cpumask_test_cpu(task_cpu(p), new_mask))
4953 goto out; 4953 goto out;
4954 4954
4955 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 4955 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
4956 if (p->on_rq) { 4956 if (p->on_rq) {
4957 struct migration_arg arg = { p, dest_cpu }; 4957 struct migration_arg arg = { p, dest_cpu };
4958 /* Need help from migration thread: drop lock and wait. */ 4958 /* Need help from migration thread: drop lock and wait. */
4959 task_rq_unlock(rq, p, &flags); 4959 task_rq_unlock(rq, p, &flags);
4960 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 4960 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
4961 tlb_migrate_finish(p->mm); 4961 tlb_migrate_finish(p->mm);
4962 return 0; 4962 return 0;
4963 } 4963 }
4964 out: 4964 out:
4965 task_rq_unlock(rq, p, &flags); 4965 task_rq_unlock(rq, p, &flags);
4966 4966
4967 return ret; 4967 return ret;
4968 } 4968 }
4969 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 4969 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
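The five-step protocol described above is what set_cpus_allowed_ptr() drives when the task is runnable: it drops the rq lock and hands a migration_arg to the stopper via stop_one_cpu(). A minimal sketch of a caller, say a driver pinning one of its kernel threads to two CPUs (the helper name, the chosen mask and the error handling are illustrative assumptions, not part of this file):

#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/sched.h>

/* Restrict @worker to CPUs 0 and 1; may sleep, so no spinlocks held here. */
static int pin_worker(struct task_struct *worker)
{
	cpumask_var_t mask;
	int ret;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(mask);
	cpumask_set_cpu(0, mask);
	cpumask_set_cpu(1, mask);

	/* Returns -EINVAL if the mask contains no active CPU (see above). */
	ret = set_cpus_allowed_ptr(worker, mask);

	free_cpumask_var(mask);
	return ret;
}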
4970 4970
4971 /* 4971 /*
4972 * Move (not current) task off this cpu, onto dest cpu. We're doing 4972 * Move (not current) task off this cpu, onto dest cpu. We're doing
4973 * this because either it can't run here any more (set_cpus_allowed() 4973 * this because either it can't run here any more (set_cpus_allowed()
4974 * away from this CPU, or CPU going down), or because we're 4974 * away from this CPU, or CPU going down), or because we're
4975 * attempting to rebalance this task on exec (sched_exec). 4975 * attempting to rebalance this task on exec (sched_exec).
4976 * 4976 *
4977 * So we race with normal scheduler movements, but that's OK, as long 4977 * So we race with normal scheduler movements, but that's OK, as long
4978 * as the task is no longer on this CPU. 4978 * as the task is no longer on this CPU.
4979 * 4979 *
4980 * Returns non-zero if task was successfully migrated. 4980 * Returns non-zero if task was successfully migrated.
4981 */ 4981 */
4982 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4982 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4983 { 4983 {
4984 struct rq *rq_dest, *rq_src; 4984 struct rq *rq_dest, *rq_src;
4985 int ret = 0; 4985 int ret = 0;
4986 4986
4987 if (unlikely(!cpu_active(dest_cpu))) 4987 if (unlikely(!cpu_active(dest_cpu)))
4988 return ret; 4988 return ret;
4989 4989
4990 rq_src = cpu_rq(src_cpu); 4990 rq_src = cpu_rq(src_cpu);
4991 rq_dest = cpu_rq(dest_cpu); 4991 rq_dest = cpu_rq(dest_cpu);
4992 4992
4993 raw_spin_lock(&p->pi_lock); 4993 raw_spin_lock(&p->pi_lock);
4994 double_rq_lock(rq_src, rq_dest); 4994 double_rq_lock(rq_src, rq_dest);
4995 /* Already moved. */ 4995 /* Already moved. */
4996 if (task_cpu(p) != src_cpu) 4996 if (task_cpu(p) != src_cpu)
4997 goto done; 4997 goto done;
4998 /* Affinity changed (again). */ 4998 /* Affinity changed (again). */
4999 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 4999 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
5000 goto fail; 5000 goto fail;
5001 5001
5002 /* 5002 /*
5003 * If we're not on a rq, the next wake-up will ensure we're 5003 * If we're not on a rq, the next wake-up will ensure we're
5004 * placed properly. 5004 * placed properly.
5005 */ 5005 */
5006 if (p->on_rq) { 5006 if (p->on_rq) {
5007 dequeue_task(rq_src, p, 0); 5007 dequeue_task(rq_src, p, 0);
5008 set_task_cpu(p, dest_cpu); 5008 set_task_cpu(p, dest_cpu);
5009 enqueue_task(rq_dest, p, 0); 5009 enqueue_task(rq_dest, p, 0);
5010 check_preempt_curr(rq_dest, p, 0); 5010 check_preempt_curr(rq_dest, p, 0);
5011 } 5011 }
5012 done: 5012 done:
5013 ret = 1; 5013 ret = 1;
5014 fail: 5014 fail:
5015 double_rq_unlock(rq_src, rq_dest); 5015 double_rq_unlock(rq_src, rq_dest);
5016 raw_spin_unlock(&p->pi_lock); 5016 raw_spin_unlock(&p->pi_lock);
5017 return ret; 5017 return ret;
5018 } 5018 }
5019 5019
5020 /* 5020 /*
5021 * migration_cpu_stop - this will be executed by a highprio stopper thread 5021 * migration_cpu_stop - this will be executed by a highprio stopper thread
5022 * and performs thread migration by bumping thread off CPU then 5022 * and performs thread migration by bumping thread off CPU then
5023 * 'pushing' onto another runqueue. 5023 * 'pushing' onto another runqueue.
5024 */ 5024 */
5025 static int migration_cpu_stop(void *data) 5025 static int migration_cpu_stop(void *data)
5026 { 5026 {
5027 struct migration_arg *arg = data; 5027 struct migration_arg *arg = data;
5028 5028
5029 /* 5029 /*
5030 * The original target cpu might have gone down and we might 5030 * The original target cpu might have gone down and we might
5031 * be on another cpu but it doesn't matter. 5031 * be on another cpu but it doesn't matter.
5032 */ 5032 */
5033 local_irq_disable(); 5033 local_irq_disable();
5034 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); 5034 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
5035 local_irq_enable(); 5035 local_irq_enable();
5036 return 0; 5036 return 0;
5037 } 5037 }
5038 5038
5039 #ifdef CONFIG_HOTPLUG_CPU 5039 #ifdef CONFIG_HOTPLUG_CPU
5040 5040
5041 /* 5041 /*
5042 * Ensures that the idle task is using init_mm right before its cpu goes 5042 * Ensures that the idle task is using init_mm right before its cpu goes
5043 * offline. 5043 * offline.
5044 */ 5044 */
5045 void idle_task_exit(void) 5045 void idle_task_exit(void)
5046 { 5046 {
5047 struct mm_struct *mm = current->active_mm; 5047 struct mm_struct *mm = current->active_mm;
5048 5048
5049 BUG_ON(cpu_online(smp_processor_id())); 5049 BUG_ON(cpu_online(smp_processor_id()));
5050 5050
5051 if (mm != &init_mm) 5051 if (mm != &init_mm)
5052 switch_mm(mm, &init_mm, current); 5052 switch_mm(mm, &init_mm, current);
5053 mmdrop(mm); 5053 mmdrop(mm);
5054 } 5054 }
5055 5055
5056 /* 5056 /*
5057 * While a dead CPU has no uninterruptible tasks queued at this point, 5057 * While a dead CPU has no uninterruptible tasks queued at this point,
5058 * it might still have a nonzero ->nr_uninterruptible counter, because 5058 * it might still have a nonzero ->nr_uninterruptible counter, because
5059 * for performance reasons the counter is not strictly tracking tasks to 5059 * for performance reasons the counter is not strictly tracking tasks to
5060 * their home CPUs. So we just add the counter to another CPU's counter, 5060 * their home CPUs. So we just add the counter to another CPU's counter,
5061 * to keep the global sum constant after CPU-down: 5061 * to keep the global sum constant after CPU-down:
5062 */ 5062 */
5063 static void migrate_nr_uninterruptible(struct rq *rq_src) 5063 static void migrate_nr_uninterruptible(struct rq *rq_src)
5064 { 5064 {
5065 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 5065 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5066 5066
5067 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5067 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5068 rq_src->nr_uninterruptible = 0; 5068 rq_src->nr_uninterruptible = 0;
5069 } 5069 }
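A small worked example of the invariant being preserved (the counts are hypothetical):

/* Before CPU3 goes down:  nr_uninterruptible = { CPU0: 4, CPU3: -1 }, global sum = 3
 * migrate_nr_uninterruptible(cpu_rq(3)):  CPU0 += -1  ->  3,  CPU3 = 0
 * After:                   nr_uninterruptible = { CPU0: 3, CPU3:  0 }, global sum = 3
 */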
5070 5070
5071 /* 5071 /*
5072 * remove the tasks which were accounted by rq from calc_load_tasks. 5072 * remove the tasks which were accounted by rq from calc_load_tasks.
5073 */ 5073 */
5074 static void calc_global_load_remove(struct rq *rq) 5074 static void calc_global_load_remove(struct rq *rq)
5075 { 5075 {
5076 atomic_long_sub(rq->calc_load_active, &calc_load_tasks); 5076 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5077 rq->calc_load_active = 0; 5077 rq->calc_load_active = 0;
5078 } 5078 }
5079 5079
5080 /* 5080 /*
5081 * Migrate all tasks from the rq, sleeping tasks will be migrated by 5081 * Migrate all tasks from the rq, sleeping tasks will be migrated by
5082 * try_to_wake_up()->select_task_rq(). 5082 * try_to_wake_up()->select_task_rq().
5083 * 5083 *
5084 * Called with rq->lock held even though we're in stop_machine() and 5084 * Called with rq->lock held even though we're in stop_machine() and
5085 * there's no concurrency possible; we hold the required locks anyway 5085 * there's no concurrency possible; we hold the required locks anyway
5086 * because of lock validation efforts. 5086 * because of lock validation efforts.
5087 */ 5087 */
5088 static void migrate_tasks(unsigned int dead_cpu) 5088 static void migrate_tasks(unsigned int dead_cpu)
5089 { 5089 {
5090 struct rq *rq = cpu_rq(dead_cpu); 5090 struct rq *rq = cpu_rq(dead_cpu);
5091 struct task_struct *next, *stop = rq->stop; 5091 struct task_struct *next, *stop = rq->stop;
5092 int dest_cpu; 5092 int dest_cpu;
5093 5093
5094 /* 5094 /*
5095 * Fudge the rq selection such that the below task selection loop 5095 * Fudge the rq selection such that the below task selection loop
5096 * doesn't get stuck on the currently eligible stop task. 5096 * doesn't get stuck on the currently eligible stop task.
5097 * 5097 *
5098 * We're currently inside stop_machine() and the rq is either stuck 5098 * We're currently inside stop_machine() and the rq is either stuck
5099 * in the stop_machine_cpu_stop() loop, or we're executing this code, 5099 * in the stop_machine_cpu_stop() loop, or we're executing this code,
5100 * either way we should never end up calling schedule() until we're 5100 * either way we should never end up calling schedule() until we're
5101 * done here. 5101 * done here.
5102 */ 5102 */
5103 rq->stop = NULL; 5103 rq->stop = NULL;
5104 5104
5105 /* Ensure any throttled groups are reachable by pick_next_task */ 5105 /* Ensure any throttled groups are reachable by pick_next_task */
5106 unthrottle_offline_cfs_rqs(rq); 5106 unthrottle_offline_cfs_rqs(rq);
5107 5107
5108 for ( ; ; ) { 5108 for ( ; ; ) {
5109 /* 5109 /*
5110 * There's this thread running, bail when that's the only 5110 * There's this thread running, bail when that's the only
5111 * remaining thread. 5111 * remaining thread.
5112 */ 5112 */
5113 if (rq->nr_running == 1) 5113 if (rq->nr_running == 1)
5114 break; 5114 break;
5115 5115
5116 next = pick_next_task(rq); 5116 next = pick_next_task(rq);
5117 BUG_ON(!next); 5117 BUG_ON(!next);
5118 next->sched_class->put_prev_task(rq, next); 5118 next->sched_class->put_prev_task(rq, next);
5119 5119
5120 /* Find suitable destination for @next, with force if needed. */ 5120 /* Find suitable destination for @next, with force if needed. */
5121 dest_cpu = select_fallback_rq(dead_cpu, next); 5121 dest_cpu = select_fallback_rq(dead_cpu, next);
5122 raw_spin_unlock(&rq->lock); 5122 raw_spin_unlock(&rq->lock);
5123 5123
5124 __migrate_task(next, dead_cpu, dest_cpu); 5124 __migrate_task(next, dead_cpu, dest_cpu);
5125 5125
5126 raw_spin_lock(&rq->lock); 5126 raw_spin_lock(&rq->lock);
5127 } 5127 }
5128 5128
5129 rq->stop = stop; 5129 rq->stop = stop;
5130 } 5130 }
5131 5131
5132 #endif /* CONFIG_HOTPLUG_CPU */ 5132 #endif /* CONFIG_HOTPLUG_CPU */
5133 5133
5134 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5134 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
5135 5135
5136 static struct ctl_table sd_ctl_dir[] = { 5136 static struct ctl_table sd_ctl_dir[] = {
5137 { 5137 {
5138 .procname = "sched_domain", 5138 .procname = "sched_domain",
5139 .mode = 0555, 5139 .mode = 0555,
5140 }, 5140 },
5141 {} 5141 {}
5142 }; 5142 };
5143 5143
5144 static struct ctl_table sd_ctl_root[] = { 5144 static struct ctl_table sd_ctl_root[] = {
5145 { 5145 {
5146 .procname = "kernel", 5146 .procname = "kernel",
5147 .mode = 0555, 5147 .mode = 0555,
5148 .child = sd_ctl_dir, 5148 .child = sd_ctl_dir,
5149 }, 5149 },
5150 {} 5150 {}
5151 }; 5151 };
5152 5152
5153 static struct ctl_table *sd_alloc_ctl_entry(int n) 5153 static struct ctl_table *sd_alloc_ctl_entry(int n)
5154 { 5154 {
5155 struct ctl_table *entry = 5155 struct ctl_table *entry =
5156 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 5156 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
5157 5157
5158 return entry; 5158 return entry;
5159 } 5159 }
5160 5160
5161 static void sd_free_ctl_entry(struct ctl_table **tablep) 5161 static void sd_free_ctl_entry(struct ctl_table **tablep)
5162 { 5162 {
5163 struct ctl_table *entry; 5163 struct ctl_table *entry;
5164 5164
5165 /* 5165 /*
5166 * In the intermediate directories, both the child directory and 5166 * In the intermediate directories, both the child directory and
5167 * procname are dynamically allocated and could fail but the mode 5167 * procname are dynamically allocated and could fail but the mode
5168 * will always be set. In the lowest directory the names are 5168 * will always be set. In the lowest directory the names are
5169 * static strings and all have proc handlers. 5169 * static strings and all have proc handlers.
5170 */ 5170 */
5171 for (entry = *tablep; entry->mode; entry++) { 5171 for (entry = *tablep; entry->mode; entry++) {
5172 if (entry->child) 5172 if (entry->child)
5173 sd_free_ctl_entry(&entry->child); 5173 sd_free_ctl_entry(&entry->child);
5174 if (entry->proc_handler == NULL) 5174 if (entry->proc_handler == NULL)
5175 kfree(entry->procname); 5175 kfree(entry->procname);
5176 } 5176 }
5177 5177
5178 kfree(*tablep); 5178 kfree(*tablep);
5179 *tablep = NULL; 5179 *tablep = NULL;
5180 } 5180 }
5181 5181
5182 static void 5182 static void
5183 set_table_entry(struct ctl_table *entry, 5183 set_table_entry(struct ctl_table *entry,
5184 const char *procname, void *data, int maxlen, 5184 const char *procname, void *data, int maxlen,
5185 umode_t mode, proc_handler *proc_handler) 5185 umode_t mode, proc_handler *proc_handler)
5186 { 5186 {
5187 entry->procname = procname; 5187 entry->procname = procname;
5188 entry->data = data; 5188 entry->data = data;
5189 entry->maxlen = maxlen; 5189 entry->maxlen = maxlen;
5190 entry->mode = mode; 5190 entry->mode = mode;
5191 entry->proc_handler = proc_handler; 5191 entry->proc_handler = proc_handler;
5192 } 5192 }
5193 5193
5194 static struct ctl_table * 5194 static struct ctl_table *
5195 sd_alloc_ctl_domain_table(struct sched_domain *sd) 5195 sd_alloc_ctl_domain_table(struct sched_domain *sd)
5196 { 5196 {
5197 struct ctl_table *table = sd_alloc_ctl_entry(13); 5197 struct ctl_table *table = sd_alloc_ctl_entry(13);
5198 5198
5199 if (table == NULL) 5199 if (table == NULL)
5200 return NULL; 5200 return NULL;
5201 5201
5202 set_table_entry(&table[0], "min_interval", &sd->min_interval, 5202 set_table_entry(&table[0], "min_interval", &sd->min_interval,
5203 sizeof(long), 0644, proc_doulongvec_minmax); 5203 sizeof(long), 0644, proc_doulongvec_minmax);
5204 set_table_entry(&table[1], "max_interval", &sd->max_interval, 5204 set_table_entry(&table[1], "max_interval", &sd->max_interval,
5205 sizeof(long), 0644, proc_doulongvec_minmax); 5205 sizeof(long), 0644, proc_doulongvec_minmax);
5206 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 5206 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
5207 sizeof(int), 0644, proc_dointvec_minmax); 5207 sizeof(int), 0644, proc_dointvec_minmax);
5208 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 5208 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
5209 sizeof(int), 0644, proc_dointvec_minmax); 5209 sizeof(int), 0644, proc_dointvec_minmax);
5210 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 5210 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
5211 sizeof(int), 0644, proc_dointvec_minmax); 5211 sizeof(int), 0644, proc_dointvec_minmax);
5212 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 5212 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
5213 sizeof(int), 0644, proc_dointvec_minmax); 5213 sizeof(int), 0644, proc_dointvec_minmax);
5214 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 5214 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
5215 sizeof(int), 0644, proc_dointvec_minmax); 5215 sizeof(int), 0644, proc_dointvec_minmax);
5216 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 5216 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
5217 sizeof(int), 0644, proc_dointvec_minmax); 5217 sizeof(int), 0644, proc_dointvec_minmax);
5218 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 5218 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5219 sizeof(int), 0644, proc_dointvec_minmax); 5219 sizeof(int), 0644, proc_dointvec_minmax);
5220 set_table_entry(&table[9], "cache_nice_tries", 5220 set_table_entry(&table[9], "cache_nice_tries",
5221 &sd->cache_nice_tries, 5221 &sd->cache_nice_tries,
5222 sizeof(int), 0644, proc_dointvec_minmax); 5222 sizeof(int), 0644, proc_dointvec_minmax);
5223 set_table_entry(&table[10], "flags", &sd->flags, 5223 set_table_entry(&table[10], "flags", &sd->flags,
5224 sizeof(int), 0644, proc_dointvec_minmax); 5224 sizeof(int), 0644, proc_dointvec_minmax);
5225 set_table_entry(&table[11], "name", sd->name, 5225 set_table_entry(&table[11], "name", sd->name,
5226 CORENAME_MAX_SIZE, 0444, proc_dostring); 5226 CORENAME_MAX_SIZE, 0444, proc_dostring);
5227 /* &table[12] is terminator */ 5227 /* &table[12] is terminator */
5228 5228
5229 return table; 5229 return table;
5230 } 5230 }
5231 5231
5232 static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 5232 static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5233 { 5233 {
5234 struct ctl_table *entry, *table; 5234 struct ctl_table *entry, *table;
5235 struct sched_domain *sd; 5235 struct sched_domain *sd;
5236 int domain_num = 0, i; 5236 int domain_num = 0, i;
5237 char buf[32]; 5237 char buf[32];
5238 5238
5239 for_each_domain(cpu, sd) 5239 for_each_domain(cpu, sd)
5240 domain_num++; 5240 domain_num++;
5241 entry = table = sd_alloc_ctl_entry(domain_num + 1); 5241 entry = table = sd_alloc_ctl_entry(domain_num + 1);
5242 if (table == NULL) 5242 if (table == NULL)
5243 return NULL; 5243 return NULL;
5244 5244
5245 i = 0; 5245 i = 0;
5246 for_each_domain(cpu, sd) { 5246 for_each_domain(cpu, sd) {
5247 snprintf(buf, 32, "domain%d", i); 5247 snprintf(buf, 32, "domain%d", i);
5248 entry->procname = kstrdup(buf, GFP_KERNEL); 5248 entry->procname = kstrdup(buf, GFP_KERNEL);
5249 entry->mode = 0555; 5249 entry->mode = 0555;
5250 entry->child = sd_alloc_ctl_domain_table(sd); 5250 entry->child = sd_alloc_ctl_domain_table(sd);
5251 entry++; 5251 entry++;
5252 i++; 5252 i++;
5253 } 5253 }
5254 return table; 5254 return table;
5255 } 5255 }
5256 5256
5257 static struct ctl_table_header *sd_sysctl_header; 5257 static struct ctl_table_header *sd_sysctl_header;
5258 static void register_sched_domain_sysctl(void) 5258 static void register_sched_domain_sysctl(void)
5259 { 5259 {
5260 int i, cpu_num = num_possible_cpus(); 5260 int i, cpu_num = num_possible_cpus();
5261 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 5261 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5262 char buf[32]; 5262 char buf[32];
5263 5263
5264 WARN_ON(sd_ctl_dir[0].child); 5264 WARN_ON(sd_ctl_dir[0].child);
5265 sd_ctl_dir[0].child = entry; 5265 sd_ctl_dir[0].child = entry;
5266 5266
5267 if (entry == NULL) 5267 if (entry == NULL)
5268 return; 5268 return;
5269 5269
5270 for_each_possible_cpu(i) { 5270 for_each_possible_cpu(i) {
5271 snprintf(buf, 32, "cpu%d", i); 5271 snprintf(buf, 32, "cpu%d", i);
5272 entry->procname = kstrdup(buf, GFP_KERNEL); 5272 entry->procname = kstrdup(buf, GFP_KERNEL);
5273 entry->mode = 0555; 5273 entry->mode = 0555;
5274 entry->child = sd_alloc_ctl_cpu_table(i); 5274 entry->child = sd_alloc_ctl_cpu_table(i);
5275 entry++; 5275 entry++;
5276 } 5276 }
5277 5277
5278 WARN_ON(sd_sysctl_header); 5278 WARN_ON(sd_sysctl_header);
5279 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 5279 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5280 } 5280 }
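Taken together, sd_ctl_root/sd_ctl_dir and the per-cpu and per-domain tables built above show up under procfs as a tree of the following shape (how many cpuN/domainM directories exist depends on the machine's topology):

	/proc/sys/kernel/sched_domain/
		cpu0/
			domain0/{min_interval, max_interval, busy_idx, idle_idx,
				 newidle_idx, wake_idx, forkexec_idx, busy_factor,
				 imbalance_pct, cache_nice_tries, flags, name}
			domain1/...
		cpu1/
			...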
5281 5281
5282 /* may be called multiple times per register */ 5282 /* may be called multiple times per register */
5283 static void unregister_sched_domain_sysctl(void) 5283 static void unregister_sched_domain_sysctl(void)
5284 { 5284 {
5285 if (sd_sysctl_header) 5285 if (sd_sysctl_header)
5286 unregister_sysctl_table(sd_sysctl_header); 5286 unregister_sysctl_table(sd_sysctl_header);
5287 sd_sysctl_header = NULL; 5287 sd_sysctl_header = NULL;
5288 if (sd_ctl_dir[0].child) 5288 if (sd_ctl_dir[0].child)
5289 sd_free_ctl_entry(&sd_ctl_dir[0].child); 5289 sd_free_ctl_entry(&sd_ctl_dir[0].child);
5290 } 5290 }
5291 #else 5291 #else
5292 static void register_sched_domain_sysctl(void) 5292 static void register_sched_domain_sysctl(void)
5293 { 5293 {
5294 } 5294 }
5295 static void unregister_sched_domain_sysctl(void) 5295 static void unregister_sched_domain_sysctl(void)
5296 { 5296 {
5297 } 5297 }
5298 #endif 5298 #endif
5299 5299
5300 static void set_rq_online(struct rq *rq) 5300 static void set_rq_online(struct rq *rq)
5301 { 5301 {
5302 if (!rq->online) { 5302 if (!rq->online) {
5303 const struct sched_class *class; 5303 const struct sched_class *class;
5304 5304
5305 cpumask_set_cpu(rq->cpu, rq->rd->online); 5305 cpumask_set_cpu(rq->cpu, rq->rd->online);
5306 rq->online = 1; 5306 rq->online = 1;
5307 5307
5308 for_each_class(class) { 5308 for_each_class(class) {
5309 if (class->rq_online) 5309 if (class->rq_online)
5310 class->rq_online(rq); 5310 class->rq_online(rq);
5311 } 5311 }
5312 } 5312 }
5313 } 5313 }
5314 5314
5315 static void set_rq_offline(struct rq *rq) 5315 static void set_rq_offline(struct rq *rq)
5316 { 5316 {
5317 if (rq->online) { 5317 if (rq->online) {
5318 const struct sched_class *class; 5318 const struct sched_class *class;
5319 5319
5320 for_each_class(class) { 5320 for_each_class(class) {
5321 if (class->rq_offline) 5321 if (class->rq_offline)
5322 class->rq_offline(rq); 5322 class->rq_offline(rq);
5323 } 5323 }
5324 5324
5325 cpumask_clear_cpu(rq->cpu, rq->rd->online); 5325 cpumask_clear_cpu(rq->cpu, rq->rd->online);
5326 rq->online = 0; 5326 rq->online = 0;
5327 } 5327 }
5328 } 5328 }
5329 5329
5330 /* 5330 /*
5331 * migration_call - callback that gets triggered when a CPU is added. 5331 * migration_call - callback that gets triggered when a CPU is added.
5332 * Here we can start up the necessary migration thread for the new CPU. 5332 * Here we can start up the necessary migration thread for the new CPU.
5333 */ 5333 */
5334 static int __cpuinit 5334 static int __cpuinit
5335 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5335 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5336 { 5336 {
5337 int cpu = (long)hcpu; 5337 int cpu = (long)hcpu;
5338 unsigned long flags; 5338 unsigned long flags;
5339 struct rq *rq = cpu_rq(cpu); 5339 struct rq *rq = cpu_rq(cpu);
5340 5340
5341 switch (action & ~CPU_TASKS_FROZEN) { 5341 switch (action & ~CPU_TASKS_FROZEN) {
5342 5342
5343 case CPU_UP_PREPARE: 5343 case CPU_UP_PREPARE:
5344 rq->calc_load_update = calc_load_update; 5344 rq->calc_load_update = calc_load_update;
5345 break; 5345 break;
5346 5346
5347 case CPU_ONLINE: 5347 case CPU_ONLINE:
5348 /* Update our root-domain */ 5348 /* Update our root-domain */
5349 raw_spin_lock_irqsave(&rq->lock, flags); 5349 raw_spin_lock_irqsave(&rq->lock, flags);
5350 if (rq->rd) { 5350 if (rq->rd) {
5351 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5351 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5352 5352
5353 set_rq_online(rq); 5353 set_rq_online(rq);
5354 } 5354 }
5355 raw_spin_unlock_irqrestore(&rq->lock, flags); 5355 raw_spin_unlock_irqrestore(&rq->lock, flags);
5356 break; 5356 break;
5357 5357
5358 #ifdef CONFIG_HOTPLUG_CPU 5358 #ifdef CONFIG_HOTPLUG_CPU
5359 case CPU_DYING: 5359 case CPU_DYING:
5360 sched_ttwu_pending(); 5360 sched_ttwu_pending();
5361 /* Update our root-domain */ 5361 /* Update our root-domain */
5362 raw_spin_lock_irqsave(&rq->lock, flags); 5362 raw_spin_lock_irqsave(&rq->lock, flags);
5363 if (rq->rd) { 5363 if (rq->rd) {
5364 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5364 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5365 set_rq_offline(rq); 5365 set_rq_offline(rq);
5366 } 5366 }
5367 migrate_tasks(cpu); 5367 migrate_tasks(cpu);
5368 BUG_ON(rq->nr_running != 1); /* the migration thread */ 5368 BUG_ON(rq->nr_running != 1); /* the migration thread */
5369 raw_spin_unlock_irqrestore(&rq->lock, flags); 5369 raw_spin_unlock_irqrestore(&rq->lock, flags);
5370 5370
5371 migrate_nr_uninterruptible(rq); 5371 migrate_nr_uninterruptible(rq);
5372 calc_global_load_remove(rq); 5372 calc_global_load_remove(rq);
5373 break; 5373 break;
5374 #endif 5374 #endif
5375 } 5375 }
5376 5376
5377 update_max_interval(); 5377 update_max_interval();
5378 5378
5379 return NOTIFY_OK; 5379 return NOTIFY_OK;
5380 } 5380 }
5381 5381
5382 /* 5382 /*
5383 * Register at high priority so that task migration (migrate_all_tasks) 5383 * Register at high priority so that task migration (migrate_all_tasks)
5384 * happens before everything else. This has to be lower priority than 5384 * happens before everything else. This has to be lower priority than
5385 * the notifier in the perf_event subsystem, though. 5385 * the notifier in the perf_event subsystem, though.
5386 */ 5386 */
5387 static struct notifier_block __cpuinitdata migration_notifier = { 5387 static struct notifier_block __cpuinitdata migration_notifier = {
5388 .notifier_call = migration_call, 5388 .notifier_call = migration_call,
5389 .priority = CPU_PRI_MIGRATION, 5389 .priority = CPU_PRI_MIGRATION,
5390 }; 5390 };
5391 5391
5392 static int __cpuinit sched_cpu_active(struct notifier_block *nfb, 5392 static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
5393 unsigned long action, void *hcpu) 5393 unsigned long action, void *hcpu)
5394 { 5394 {
5395 switch (action & ~CPU_TASKS_FROZEN) { 5395 switch (action & ~CPU_TASKS_FROZEN) {
5396 case CPU_ONLINE: 5396 case CPU_ONLINE:
5397 case CPU_DOWN_FAILED: 5397 case CPU_DOWN_FAILED:
5398 set_cpu_active((long)hcpu, true); 5398 set_cpu_active((long)hcpu, true);
5399 return NOTIFY_OK; 5399 return NOTIFY_OK;
5400 default: 5400 default:
5401 return NOTIFY_DONE; 5401 return NOTIFY_DONE;
5402 } 5402 }
5403 } 5403 }
5404 5404
5405 static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, 5405 static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
5406 unsigned long action, void *hcpu) 5406 unsigned long action, void *hcpu)
5407 { 5407 {
5408 switch (action & ~CPU_TASKS_FROZEN) { 5408 switch (action & ~CPU_TASKS_FROZEN) {
5409 case CPU_DOWN_PREPARE: 5409 case CPU_DOWN_PREPARE:
5410 set_cpu_active((long)hcpu, false); 5410 set_cpu_active((long)hcpu, false);
5411 return NOTIFY_OK; 5411 return NOTIFY_OK;
5412 default: 5412 default:
5413 return NOTIFY_DONE; 5413 return NOTIFY_DONE;
5414 } 5414 }
5415 } 5415 }
5416 5416
5417 static int __init migration_init(void) 5417 static int __init migration_init(void)
5418 { 5418 {
5419 void *cpu = (void *)(long)smp_processor_id(); 5419 void *cpu = (void *)(long)smp_processor_id();
5420 int err; 5420 int err;
5421 5421
5422 /* Initialize migration for the boot CPU */ 5422 /* Initialize migration for the boot CPU */
5423 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 5423 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5424 BUG_ON(err == NOTIFY_BAD); 5424 BUG_ON(err == NOTIFY_BAD);
5425 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5425 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5426 register_cpu_notifier(&migration_notifier); 5426 register_cpu_notifier(&migration_notifier);
5427 5427
5428 /* Register cpu active notifiers */ 5428 /* Register cpu active notifiers */
5429 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); 5429 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
5430 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); 5430 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
5431 5431
5432 return 0; 5432 return 0;
5433 } 5433 }
5434 early_initcall(migration_init); 5434 early_initcall(migration_init);
5435 #endif 5435 #endif
5436 5436
5437 #ifdef CONFIG_SMP 5437 #ifdef CONFIG_SMP
5438 5438
5439 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ 5439 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
5440 5440
5441 #ifdef CONFIG_SCHED_DEBUG 5441 #ifdef CONFIG_SCHED_DEBUG
5442 5442
5443 static __read_mostly int sched_domain_debug_enabled; 5443 static __read_mostly int sched_domain_debug_enabled;
5444 5444
5445 static int __init sched_domain_debug_setup(char *str) 5445 static int __init sched_domain_debug_setup(char *str)
5446 { 5446 {
5447 sched_domain_debug_enabled = 1; 5447 sched_domain_debug_enabled = 1;
5448 5448
5449 return 0; 5449 return 0;
5450 } 5450 }
5451 early_param("sched_debug", sched_domain_debug_setup); 5451 early_param("sched_debug", sched_domain_debug_setup);
5452 5452
5453 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 5453 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5454 struct cpumask *groupmask) 5454 struct cpumask *groupmask)
5455 { 5455 {
5456 struct sched_group *group = sd->groups; 5456 struct sched_group *group = sd->groups;
5457 char str[256]; 5457 char str[256];
5458 5458
5459 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); 5459 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
5460 cpumask_clear(groupmask); 5460 cpumask_clear(groupmask);
5461 5461
5462 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 5462 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5463 5463
5464 if (!(sd->flags & SD_LOAD_BALANCE)) { 5464 if (!(sd->flags & SD_LOAD_BALANCE)) {
5465 printk("does not load-balance\n"); 5465 printk("does not load-balance\n");
5466 if (sd->parent) 5466 if (sd->parent)
5467 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 5467 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5468 " has parent"); 5468 " has parent");
5469 return -1; 5469 return -1;
5470 } 5470 }
5471 5471
5472 printk(KERN_CONT "span %s level %s\n", str, sd->name); 5472 printk(KERN_CONT "span %s level %s\n", str, sd->name);
5473 5473
5474 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 5474 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5475 printk(KERN_ERR "ERROR: domain->span does not contain " 5475 printk(KERN_ERR "ERROR: domain->span does not contain "
5476 "CPU%d\n", cpu); 5476 "CPU%d\n", cpu);
5477 } 5477 }
5478 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 5478 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
5479 printk(KERN_ERR "ERROR: domain->groups does not contain" 5479 printk(KERN_ERR "ERROR: domain->groups does not contain"
5480 " CPU%d\n", cpu); 5480 " CPU%d\n", cpu);
5481 } 5481 }
5482 5482
5483 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 5483 printk(KERN_DEBUG "%*s groups:", level + 1, "");
5484 do { 5484 do {
5485 if (!group) { 5485 if (!group) {
5486 printk("\n"); 5486 printk("\n");
5487 printk(KERN_ERR "ERROR: group is NULL\n"); 5487 printk(KERN_ERR "ERROR: group is NULL\n");
5488 break; 5488 break;
5489 } 5489 }
5490 5490
5491 if (!group->sgp->power) { 5491 if (!group->sgp->power) {
5492 printk(KERN_CONT "\n"); 5492 printk(KERN_CONT "\n");
5493 printk(KERN_ERR "ERROR: domain->cpu_power not " 5493 printk(KERN_ERR "ERROR: domain->cpu_power not "
5494 "set\n"); 5494 "set\n");
5495 break; 5495 break;
5496 } 5496 }
5497 5497
5498 if (!cpumask_weight(sched_group_cpus(group))) { 5498 if (!cpumask_weight(sched_group_cpus(group))) {
5499 printk(KERN_CONT "\n"); 5499 printk(KERN_CONT "\n");
5500 printk(KERN_ERR "ERROR: empty group\n"); 5500 printk(KERN_ERR "ERROR: empty group\n");
5501 break; 5501 break;
5502 } 5502 }
5503 5503
5504 if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 5504 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
5505 printk(KERN_CONT "\n"); 5505 printk(KERN_CONT "\n");
5506 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5506 printk(KERN_ERR "ERROR: repeated CPUs\n");
5507 break; 5507 break;
5508 } 5508 }
5509 5509
5510 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 5510 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
5511 5511
5512 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 5512 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
5513 5513
5514 printk(KERN_CONT " %s", str); 5514 printk(KERN_CONT " %s", str);
5515 if (group->sgp->power != SCHED_POWER_SCALE) { 5515 if (group->sgp->power != SCHED_POWER_SCALE) {
5516 printk(KERN_CONT " (cpu_power = %d)", 5516 printk(KERN_CONT " (cpu_power = %d)",
5517 group->sgp->power); 5517 group->sgp->power);
5518 } 5518 }
5519 5519
5520 group = group->next; 5520 group = group->next;
5521 } while (group != sd->groups); 5521 } while (group != sd->groups);
5522 printk(KERN_CONT "\n"); 5522 printk(KERN_CONT "\n");
5523 5523
5524 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 5524 if (!cpumask_equal(sched_domain_span(sd), groupmask))
5525 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 5525 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
5526 5526
5527 if (sd->parent && 5527 if (sd->parent &&
5528 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 5528 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
5529 printk(KERN_ERR "ERROR: parent span is not a superset " 5529 printk(KERN_ERR "ERROR: parent span is not a superset "
5530 "of domain->span\n"); 5530 "of domain->span\n");
5531 return 0; 5531 return 0;
5532 } 5532 }
5533 5533
5534 static void sched_domain_debug(struct sched_domain *sd, int cpu) 5534 static void sched_domain_debug(struct sched_domain *sd, int cpu)
5535 { 5535 {
5536 int level = 0; 5536 int level = 0;
5537 5537
5538 if (!sched_domain_debug_enabled) 5538 if (!sched_domain_debug_enabled)
5539 return; 5539 return;
5540 5540
5541 if (!sd) { 5541 if (!sd) {
5542 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 5542 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5543 return; 5543 return;
5544 } 5544 }
5545 5545
5546 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 5546 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5547 5547
5548 for (;;) { 5548 for (;;) {
5549 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) 5549 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
5550 break; 5550 break;
5551 level++; 5551 level++;
5552 sd = sd->parent; 5552 sd = sd->parent;
5553 if (!sd) 5553 if (!sd)
5554 break; 5554 break;
5555 } 5555 }
5556 } 5556 }
5557 #else /* !CONFIG_SCHED_DEBUG */ 5557 #else /* !CONFIG_SCHED_DEBUG */
5558 # define sched_domain_debug(sd, cpu) do { } while (0) 5558 # define sched_domain_debug(sd, cpu) do { } while (0)
5559 #endif /* CONFIG_SCHED_DEBUG */ 5559 #endif /* CONFIG_SCHED_DEBUG */
5560 5560
5561 static int sd_degenerate(struct sched_domain *sd) 5561 static int sd_degenerate(struct sched_domain *sd)
5562 { 5562 {
5563 if (cpumask_weight(sched_domain_span(sd)) == 1) 5563 if (cpumask_weight(sched_domain_span(sd)) == 1)
5564 return 1; 5564 return 1;
5565 5565
5566 /* Following flags need at least 2 groups */ 5566 /* Following flags need at least 2 groups */
5567 if (sd->flags & (SD_LOAD_BALANCE | 5567 if (sd->flags & (SD_LOAD_BALANCE |
5568 SD_BALANCE_NEWIDLE | 5568 SD_BALANCE_NEWIDLE |
5569 SD_BALANCE_FORK | 5569 SD_BALANCE_FORK |
5570 SD_BALANCE_EXEC | 5570 SD_BALANCE_EXEC |
5571 SD_SHARE_CPUPOWER | 5571 SD_SHARE_CPUPOWER |
5572 SD_SHARE_PKG_RESOURCES)) { 5572 SD_SHARE_PKG_RESOURCES)) {
5573 if (sd->groups != sd->groups->next) 5573 if (sd->groups != sd->groups->next)
5574 return 0; 5574 return 0;
5575 } 5575 }
5576 5576
5577 /* Following flags don't use groups */ 5577 /* Following flags don't use groups */
5578 if (sd->flags & (SD_WAKE_AFFINE)) 5578 if (sd->flags & (SD_WAKE_AFFINE))
5579 return 0; 5579 return 0;
5580 5580
5581 return 1; 5581 return 1;
5582 } 5582 }
5583 5583
5584 static int 5584 static int
5585 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 5585 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5586 { 5586 {
5587 unsigned long cflags = sd->flags, pflags = parent->flags; 5587 unsigned long cflags = sd->flags, pflags = parent->flags;
5588 5588
5589 if (sd_degenerate(parent)) 5589 if (sd_degenerate(parent))
5590 return 1; 5590 return 1;
5591 5591
5592 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 5592 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
5593 return 0; 5593 return 0;
5594 5594
5595 /* Flags needing groups don't count if only 1 group in parent */ 5595 /* Flags needing groups don't count if only 1 group in parent */
5596 if (parent->groups == parent->groups->next) { 5596 if (parent->groups == parent->groups->next) {
5597 pflags &= ~(SD_LOAD_BALANCE | 5597 pflags &= ~(SD_LOAD_BALANCE |
5598 SD_BALANCE_NEWIDLE | 5598 SD_BALANCE_NEWIDLE |
5599 SD_BALANCE_FORK | 5599 SD_BALANCE_FORK |
5600 SD_BALANCE_EXEC | 5600 SD_BALANCE_EXEC |
5601 SD_SHARE_CPUPOWER | 5601 SD_SHARE_CPUPOWER |
5602 SD_SHARE_PKG_RESOURCES); 5602 SD_SHARE_PKG_RESOURCES);
5603 if (nr_node_ids == 1) 5603 if (nr_node_ids == 1)
5604 pflags &= ~SD_SERIALIZE; 5604 pflags &= ~SD_SERIALIZE;
5605 } 5605 }
5606 if (~cflags & pflags) 5606 if (~cflags & pflags)
5607 return 0; 5607 return 0;
5608 5608
5609 return 1; 5609 return 1;
5610 } 5610 }
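Two concrete (hypothetical) cases of how these checks collapse the domain tree in cpu_attach_domain() below:

/* - A SIBLING level on a cpu without hyperthreading spans only that cpu,
 *   so cpumask_weight(span) == 1 and sd_degenerate() drops the level.
 * - If an MC domain and its parent CPU domain span the same cpus and the
 *   parent has a single group, every group-requiring flag is masked out
 *   of pflags; when no remaining parent flag is missing from the child,
 *   sd_parent_degenerate() returns 1 and the parent is unlinked.
 */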
5611 5611
5612 static void free_rootdomain(struct rcu_head *rcu) 5612 static void free_rootdomain(struct rcu_head *rcu)
5613 { 5613 {
5614 struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 5614 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
5615 5615
5616 cpupri_cleanup(&rd->cpupri); 5616 cpupri_cleanup(&rd->cpupri);
5617 free_cpumask_var(rd->rto_mask); 5617 free_cpumask_var(rd->rto_mask);
5618 free_cpumask_var(rd->online); 5618 free_cpumask_var(rd->online);
5619 free_cpumask_var(rd->span); 5619 free_cpumask_var(rd->span);
5620 kfree(rd); 5620 kfree(rd);
5621 } 5621 }
5622 5622
5623 static void rq_attach_root(struct rq *rq, struct root_domain *rd) 5623 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
5624 { 5624 {
5625 struct root_domain *old_rd = NULL; 5625 struct root_domain *old_rd = NULL;
5626 unsigned long flags; 5626 unsigned long flags;
5627 5627
5628 raw_spin_lock_irqsave(&rq->lock, flags); 5628 raw_spin_lock_irqsave(&rq->lock, flags);
5629 5629
5630 if (rq->rd) { 5630 if (rq->rd) {
5631 old_rd = rq->rd; 5631 old_rd = rq->rd;
5632 5632
5633 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 5633 if (cpumask_test_cpu(rq->cpu, old_rd->online))
5634 set_rq_offline(rq); 5634 set_rq_offline(rq);
5635 5635
5636 cpumask_clear_cpu(rq->cpu, old_rd->span); 5636 cpumask_clear_cpu(rq->cpu, old_rd->span);
5637 5637
5638 /* 5638 /*
5639 * If we don't want to free the old_rd yet then 5639 * If we don't want to free the old_rd yet then
5640 * set old_rd to NULL to skip the freeing later 5640 * set old_rd to NULL to skip the freeing later
5641 * in this function: 5641 * in this function:
5642 */ 5642 */
5643 if (!atomic_dec_and_test(&old_rd->refcount)) 5643 if (!atomic_dec_and_test(&old_rd->refcount))
5644 old_rd = NULL; 5644 old_rd = NULL;
5645 } 5645 }
5646 5646
5647 atomic_inc(&rd->refcount); 5647 atomic_inc(&rd->refcount);
5648 rq->rd = rd; 5648 rq->rd = rd;
5649 5649
5650 cpumask_set_cpu(rq->cpu, rd->span); 5650 cpumask_set_cpu(rq->cpu, rd->span);
5651 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 5651 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
5652 set_rq_online(rq); 5652 set_rq_online(rq);
5653 5653
5654 raw_spin_unlock_irqrestore(&rq->lock, flags); 5654 raw_spin_unlock_irqrestore(&rq->lock, flags);
5655 5655
5656 if (old_rd) 5656 if (old_rd)
5657 call_rcu_sched(&old_rd->rcu, free_rootdomain); 5657 call_rcu_sched(&old_rd->rcu, free_rootdomain);
5658 } 5658 }
5659 5659
5660 static int init_rootdomain(struct root_domain *rd) 5660 static int init_rootdomain(struct root_domain *rd)
5661 { 5661 {
5662 memset(rd, 0, sizeof(*rd)); 5662 memset(rd, 0, sizeof(*rd));
5663 5663
5664 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) 5664 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
5665 goto out; 5665 goto out;
5666 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 5666 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
5667 goto free_span; 5667 goto free_span;
5668 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 5668 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5669 goto free_online; 5669 goto free_online;
5670 5670
5671 if (cpupri_init(&rd->cpupri) != 0) 5671 if (cpupri_init(&rd->cpupri) != 0)
5672 goto free_rto_mask; 5672 goto free_rto_mask;
5673 return 0; 5673 return 0;
5674 5674
5675 free_rto_mask: 5675 free_rto_mask:
5676 free_cpumask_var(rd->rto_mask); 5676 free_cpumask_var(rd->rto_mask);
5677 free_online: 5677 free_online:
5678 free_cpumask_var(rd->online); 5678 free_cpumask_var(rd->online);
5679 free_span: 5679 free_span:
5680 free_cpumask_var(rd->span); 5680 free_cpumask_var(rd->span);
5681 out: 5681 out:
5682 return -ENOMEM; 5682 return -ENOMEM;
5683 } 5683 }
5684 5684
5685 /* 5685 /*
5686 * By default the system creates a single root-domain with all cpus as 5686 * By default the system creates a single root-domain with all cpus as
5687 * members (mimicking the global state we have today). 5687 * members (mimicking the global state we have today).
5688 */ 5688 */
5689 struct root_domain def_root_domain; 5689 struct root_domain def_root_domain;
5690 5690
5691 static void init_defrootdomain(void) 5691 static void init_defrootdomain(void)
5692 { 5692 {
5693 init_rootdomain(&def_root_domain); 5693 init_rootdomain(&def_root_domain);
5694 5694
5695 atomic_set(&def_root_domain.refcount, 1); 5695 atomic_set(&def_root_domain.refcount, 1);
5696 } 5696 }
5697 5697
5698 static struct root_domain *alloc_rootdomain(void) 5698 static struct root_domain *alloc_rootdomain(void)
5699 { 5699 {
5700 struct root_domain *rd; 5700 struct root_domain *rd;
5701 5701
5702 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 5702 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
5703 if (!rd) 5703 if (!rd)
5704 return NULL; 5704 return NULL;
5705 5705
5706 if (init_rootdomain(rd) != 0) { 5706 if (init_rootdomain(rd) != 0) {
5707 kfree(rd); 5707 kfree(rd);
5708 return NULL; 5708 return NULL;
5709 } 5709 }
5710 5710
5711 return rd; 5711 return rd;
5712 } 5712 }
5713 5713
5714 static void free_sched_groups(struct sched_group *sg, int free_sgp) 5714 static void free_sched_groups(struct sched_group *sg, int free_sgp)
5715 { 5715 {
5716 struct sched_group *tmp, *first; 5716 struct sched_group *tmp, *first;
5717 5717
5718 if (!sg) 5718 if (!sg)
5719 return; 5719 return;
5720 5720
5721 first = sg; 5721 first = sg;
5722 do { 5722 do {
5723 tmp = sg->next; 5723 tmp = sg->next;
5724 5724
5725 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) 5725 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
5726 kfree(sg->sgp); 5726 kfree(sg->sgp);
5727 5727
5728 kfree(sg); 5728 kfree(sg);
5729 sg = tmp; 5729 sg = tmp;
5730 } while (sg != first); 5730 } while (sg != first);
5731 } 5731 }
5732 5732
5733 static void free_sched_domain(struct rcu_head *rcu) 5733 static void free_sched_domain(struct rcu_head *rcu)
5734 { 5734 {
5735 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); 5735 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5736 5736
5737 /* 5737 /*
5738 * If it's an overlapping domain it has private groups, iterate and 5738 * If it's an overlapping domain it has private groups, iterate and
5739 * nuke them all. 5739 * nuke them all.
5740 */ 5740 */
5741 if (sd->flags & SD_OVERLAP) { 5741 if (sd->flags & SD_OVERLAP) {
5742 free_sched_groups(sd->groups, 1); 5742 free_sched_groups(sd->groups, 1);
5743 } else if (atomic_dec_and_test(&sd->groups->ref)) { 5743 } else if (atomic_dec_and_test(&sd->groups->ref)) {
5744 kfree(sd->groups->sgp); 5744 kfree(sd->groups->sgp);
5745 kfree(sd->groups); 5745 kfree(sd->groups);
5746 } 5746 }
5747 kfree(sd); 5747 kfree(sd);
5748 } 5748 }
5749 5749
5750 static void destroy_sched_domain(struct sched_domain *sd, int cpu) 5750 static void destroy_sched_domain(struct sched_domain *sd, int cpu)
5751 { 5751 {
5752 call_rcu(&sd->rcu, free_sched_domain); 5752 call_rcu(&sd->rcu, free_sched_domain);
5753 } 5753 }
5754 5754
5755 static void destroy_sched_domains(struct sched_domain *sd, int cpu) 5755 static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5756 { 5756 {
5757 for (; sd; sd = sd->parent) 5757 for (; sd; sd = sd->parent)
5758 destroy_sched_domain(sd, cpu); 5758 destroy_sched_domain(sd, cpu);
5759 } 5759 }
5760 5760
5761 /* 5761 /*
5762 * Keep a special pointer to the highest sched_domain that has 5762 * Keep a special pointer to the highest sched_domain that has
5763 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain) for this cpu; 5763 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain) for this cpu;
5764 * this allows us to avoid some pointer chasing in select_idle_sibling(). 5764 * this allows us to avoid some pointer chasing in select_idle_sibling().
5765 * 5765 *
5766 * Also keep a unique ID per domain (we use the first cpu number in 5766 * Also keep a unique ID per domain (we use the first cpu number in
5767 * the cpumask of the domain); this allows us to quickly tell if 5767 * the cpumask of the domain); this allows us to quickly tell if
5768 * two cpus are in the same cache domain, see cpus_share_cache(). 5768 * two cpus are in the same cache domain, see cpus_share_cache().
5769 */ 5769 */
5770 DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5770 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5771 DEFINE_PER_CPU(int, sd_llc_id); 5771 DEFINE_PER_CPU(int, sd_llc_id);
5772 5772
5773 static void update_top_cache_domain(int cpu) 5773 static void update_top_cache_domain(int cpu)
5774 { 5774 {
5775 struct sched_domain *sd; 5775 struct sched_domain *sd;
5776 int id = cpu; 5776 int id = cpu;
5777 5777
5778 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 5778 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5779 if (sd) 5779 if (sd)
5780 id = cpumask_first(sched_domain_span(sd)); 5780 id = cpumask_first(sched_domain_span(sd));
5781 5781
5782 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 5782 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5783 per_cpu(sd_llc_id, cpu) = id; 5783 per_cpu(sd_llc_id, cpu) = id;
5784 } 5784 }
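The id cached here is what makes the shared-cache test cheap; cpus_share_cache(), defined elsewhere in this file, reduces to an id comparison along these lines (a sketch, not part of the hunk above):

bool cpus_share_cache(int this_cpu, int that_cpu)
{
	/* Same first-cpu id of the LLC domain => same last-level cache. */
	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}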
5785 5785
5786 /* 5786 /*
5787 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5787 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5788 * hold the hotplug lock. 5788 * hold the hotplug lock.
5789 */ 5789 */
5790 static void 5790 static void
5791 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 5791 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5792 { 5792 {
5793 struct rq *rq = cpu_rq(cpu); 5793 struct rq *rq = cpu_rq(cpu);
5794 struct sched_domain *tmp; 5794 struct sched_domain *tmp;
5795 5795
5796 /* Remove the sched domains which do not contribute to scheduling. */ 5796 /* Remove the sched domains which do not contribute to scheduling. */
5797 for (tmp = sd; tmp; ) { 5797 for (tmp = sd; tmp; ) {
5798 struct sched_domain *parent = tmp->parent; 5798 struct sched_domain *parent = tmp->parent;
5799 if (!parent) 5799 if (!parent)
5800 break; 5800 break;
5801 5801
5802 if (sd_parent_degenerate(tmp, parent)) { 5802 if (sd_parent_degenerate(tmp, parent)) {
5803 tmp->parent = parent->parent; 5803 tmp->parent = parent->parent;
5804 if (parent->parent) 5804 if (parent->parent)
5805 parent->parent->child = tmp; 5805 parent->parent->child = tmp;
5806 destroy_sched_domain(parent, cpu); 5806 destroy_sched_domain(parent, cpu);
5807 } else 5807 } else
5808 tmp = tmp->parent; 5808 tmp = tmp->parent;
5809 } 5809 }
5810 5810
5811 if (sd && sd_degenerate(sd)) { 5811 if (sd && sd_degenerate(sd)) {
5812 tmp = sd; 5812 tmp = sd;
5813 sd = sd->parent; 5813 sd = sd->parent;
5814 destroy_sched_domain(tmp, cpu); 5814 destroy_sched_domain(tmp, cpu);
5815 if (sd) 5815 if (sd)
5816 sd->child = NULL; 5816 sd->child = NULL;
5817 } 5817 }
5818 5818
5819 sched_domain_debug(sd, cpu); 5819 sched_domain_debug(sd, cpu);
5820 5820
5821 rq_attach_root(rq, rd); 5821 rq_attach_root(rq, rd);
5822 tmp = rq->sd; 5822 tmp = rq->sd;
5823 rcu_assign_pointer(rq->sd, sd); 5823 rcu_assign_pointer(rq->sd, sd);
5824 destroy_sched_domains(tmp, cpu); 5824 destroy_sched_domains(tmp, cpu);
5825 5825
5826 update_top_cache_domain(cpu); 5826 update_top_cache_domain(cpu);
5827 } 5827 }
5828 5828
5829 /* cpus with isolated domains */ 5829 /* cpus with isolated domains */
5830 static cpumask_var_t cpu_isolated_map; 5830 static cpumask_var_t cpu_isolated_map;
5831 5831
5832 /* Setup the mask of cpus configured for isolated domains */ 5832 /* Setup the mask of cpus configured for isolated domains */
5833 static int __init isolated_cpu_setup(char *str) 5833 static int __init isolated_cpu_setup(char *str)
5834 { 5834 {
5835 alloc_bootmem_cpumask_var(&cpu_isolated_map); 5835 alloc_bootmem_cpumask_var(&cpu_isolated_map);
5836 cpulist_parse(str, cpu_isolated_map); 5836 cpulist_parse(str, cpu_isolated_map);
5837 return 1; 5837 return 1;
5838 } 5838 }
5839 5839
5840 __setup("isolcpus=", isolated_cpu_setup); 5840 __setup("isolcpus=", isolated_cpu_setup);
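The parameter takes a cpulist on the kernel command line, for example (values chosen only for illustration):

	isolcpus=1,5-7

cpulist_parse() accepts single cpus and ranges, so CPUs 1 and 5 through 7 end up in cpu_isolated_map and are kept out of the general sched domains built later.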
5841 5841
5842 #ifdef CONFIG_NUMA 5842 #ifdef CONFIG_NUMA
5843 5843
5844 /** 5844 /**
5845 * find_next_best_node - find the next node to include in a sched_domain 5845 * find_next_best_node - find the next node to include in a sched_domain
5846 * @node: node whose sched_domain we're building 5846 * @node: node whose sched_domain we're building
5847 * @used_nodes: nodes already in the sched_domain 5847 * @used_nodes: nodes already in the sched_domain
5848 * 5848 *
5849 * Find the next node to include in a given scheduling domain. Simply 5849 * Find the next node to include in a given scheduling domain. Simply
5850 * finds the closest node not already in the @used_nodes map. 5850 * finds the closest node not already in the @used_nodes map.
5851 * 5851 *
5852 * Should use nodemask_t. 5852 * Should use nodemask_t.
5853 */ 5853 */
5854 static int find_next_best_node(int node, nodemask_t *used_nodes) 5854 static int find_next_best_node(int node, nodemask_t *used_nodes)
5855 { 5855 {
5856 int i, n, val, min_val, best_node = -1; 5856 int i, n, val, min_val, best_node = -1;
5857 5857
5858 min_val = INT_MAX; 5858 min_val = INT_MAX;
5859 5859
5860 for (i = 0; i < nr_node_ids; i++) { 5860 for (i = 0; i < nr_node_ids; i++) {
5861 /* Start at @node */ 5861 /* Start at @node */
5862 n = (node + i) % nr_node_ids; 5862 n = (node + i) % nr_node_ids;
5863 5863
5864 if (!nr_cpus_node(n)) 5864 if (!nr_cpus_node(n))
5865 continue; 5865 continue;
5866 5866
5867 /* Skip already used nodes */ 5867 /* Skip already used nodes */
5868 if (node_isset(n, *used_nodes)) 5868 if (node_isset(n, *used_nodes))
5869 continue; 5869 continue;
5870 5870
5871 /* Simple min distance search */ 5871 /* Simple min distance search */
5872 val = node_distance(node, n); 5872 val = node_distance(node, n);
5873 5873
5874 if (val < min_val) { 5874 if (val < min_val) {
5875 min_val = val; 5875 min_val = val;
5876 best_node = n; 5876 best_node = n;
5877 } 5877 }
5878 } 5878 }
5879 5879
5880 if (best_node != -1) 5880 if (best_node != -1)
5881 node_set(best_node, *used_nodes); 5881 node_set(best_node, *used_nodes);
5882 return best_node; 5882 return best_node;
5883 } 5883 }
5884 5884
5885 /** 5885 /**
5886 * sched_domain_node_span - get a cpumask for a node's sched_domain 5886 * sched_domain_node_span - get a cpumask for a node's sched_domain
5887 * @node: node whose cpumask we're constructing 5887 * @node: node whose cpumask we're constructing
5888 * @span: resulting cpumask 5888 * @span: resulting cpumask
5889 * 5889 *
5890 * Given a node, construct a good cpumask for its sched_domain to span. It 5890 * Given a node, construct a good cpumask for its sched_domain to span. It
5891 * should be one that prevents unnecessary balancing, but also spreads tasks 5891 * should be one that prevents unnecessary balancing, but also spreads tasks
5892 * out optimally. 5892 * out optimally.
5893 */ 5893 */
5894 static void sched_domain_node_span(int node, struct cpumask *span) 5894 static void sched_domain_node_span(int node, struct cpumask *span)
5895 { 5895 {
5896 nodemask_t used_nodes; 5896 nodemask_t used_nodes;
5897 int i; 5897 int i;
5898 5898
5899 cpumask_clear(span); 5899 cpumask_clear(span);
5900 nodes_clear(used_nodes); 5900 nodes_clear(used_nodes);
5901 5901
5902 cpumask_or(span, span, cpumask_of_node(node)); 5902 cpumask_or(span, span, cpumask_of_node(node));
5903 node_set(node, used_nodes); 5903 node_set(node, used_nodes);
5904 5904
5905 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 5905 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5906 int next_node = find_next_best_node(node, &used_nodes); 5906 int next_node = find_next_best_node(node, &used_nodes);
5907 if (next_node < 0) 5907 if (next_node < 0)
5908 break; 5908 break;
5909 cpumask_or(span, span, cpumask_of_node(next_node)); 5909 cpumask_or(span, span, cpumask_of_node(next_node));
5910 } 5910 }
5911 } 5911 }
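A worked example on a hypothetical four-node box, with node_distance(0, n) = {10, 20, 40, 40} (distances invented for illustration); building the span for node 0 proceeds closest-first:

/* find_next_best_node(0, {0})    -> 1  (distance 20)
 * find_next_best_node(0, {0,1})  -> 2  (distance 40; ties go to the node
 *                                       reached first in the scan order)
 * span |= cpumask_of_node(0) | cpumask_of_node(1) | cpumask_of_node(2) ...
 * The loop adds at most SD_NODES_PER_DOMAIN - 1 extra nodes and stops
 * early once find_next_best_node() returns -1 (no unused node with cpus).
 */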
5912 5912
5913 static const struct cpumask *cpu_node_mask(int cpu) 5913 static const struct cpumask *cpu_node_mask(int cpu)
5914 { 5914 {
5915 lockdep_assert_held(&sched_domains_mutex); 5915 lockdep_assert_held(&sched_domains_mutex);
5916 5916
5917 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); 5917 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
5918 5918
5919 return sched_domains_tmpmask; 5919 return sched_domains_tmpmask;
5920 } 5920 }
5921 5921
5922 static const struct cpumask *cpu_allnodes_mask(int cpu) 5922 static const struct cpumask *cpu_allnodes_mask(int cpu)
5923 { 5923 {
5924 return cpu_possible_mask; 5924 return cpu_possible_mask;
5925 } 5925 }
5926 #endif /* CONFIG_NUMA */ 5926 #endif /* CONFIG_NUMA */
5927 5927
5928 static const struct cpumask *cpu_cpu_mask(int cpu) 5928 static const struct cpumask *cpu_cpu_mask(int cpu)
5929 { 5929 {
5930 return cpumask_of_node(cpu_to_node(cpu)); 5930 return cpumask_of_node(cpu_to_node(cpu));
5931 } 5931 }
5932 5932
5933 int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 5933 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5934 5934
5935 struct sd_data { 5935 struct sd_data {
5936 struct sched_domain **__percpu sd; 5936 struct sched_domain **__percpu sd;
5937 struct sched_group **__percpu sg; 5937 struct sched_group **__percpu sg;
5938 struct sched_group_power **__percpu sgp; 5938 struct sched_group_power **__percpu sgp;
5939 }; 5939 };
5940 5940
5941 struct s_data { 5941 struct s_data {
5942 struct sched_domain ** __percpu sd; 5942 struct sched_domain ** __percpu sd;
5943 struct root_domain *rd; 5943 struct root_domain *rd;
5944 }; 5944 };
5945 5945
5946 enum s_alloc { 5946 enum s_alloc {
5947 sa_rootdomain, 5947 sa_rootdomain,
5948 sa_sd, 5948 sa_sd,
5949 sa_sd_storage, 5949 sa_sd_storage,
5950 sa_none, 5950 sa_none,
5951 }; 5951 };
5952 5952
5953 struct sched_domain_topology_level; 5953 struct sched_domain_topology_level;
5954 5954
5955 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); 5955 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
5956 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); 5956 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
5957 5957
5958 #define SDTL_OVERLAP 0x01 5958 #define SDTL_OVERLAP 0x01
5959 5959
5960 struct sched_domain_topology_level { 5960 struct sched_domain_topology_level {
5961 sched_domain_init_f init; 5961 sched_domain_init_f init;
5962 sched_domain_mask_f mask; 5962 sched_domain_mask_f mask;
5963 int flags; 5963 int flags;
5964 struct sd_data data; 5964 struct sd_data data;
5965 }; 5965 };
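A topology is described as a NULL-terminated array of these levels, innermost first. The default table lives elsewhere in this file; a rough sketch of its shape (the entry names and config guards here are recalled from memory and may not match this kernel exactly):

static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ sd_init_SIBLING, cpu_smt_mask, },	/* hyperthread siblings */
#endif
#ifdef CONFIG_SCHED_MC
	{ sd_init_MC, cpu_coregroup_mask, },	/* cores sharing a cache */
#endif
	{ sd_init_CPU, cpu_cpu_mask, },		/* all cpus of one node */
#ifdef CONFIG_NUMA
	{ sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
	{ sd_init_ALLNODES, cpu_allnodes_mask, },
#endif
	{ NULL, },
};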
5966 5966
5967 static int 5967 static int
5968 build_overlap_sched_groups(struct sched_domain *sd, int cpu) 5968 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5969 { 5969 {
5970 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; 5970 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
5971 const struct cpumask *span = sched_domain_span(sd); 5971 const struct cpumask *span = sched_domain_span(sd);
5972 struct cpumask *covered = sched_domains_tmpmask; 5972 struct cpumask *covered = sched_domains_tmpmask;
5973 struct sd_data *sdd = sd->private; 5973 struct sd_data *sdd = sd->private;
5974 struct sched_domain *child; 5974 struct sched_domain *child;
5975 int i; 5975 int i;
5976 5976
5977 cpumask_clear(covered); 5977 cpumask_clear(covered);
5978 5978
5979 for_each_cpu(i, span) { 5979 for_each_cpu(i, span) {
5980 struct cpumask *sg_span; 5980 struct cpumask *sg_span;
5981 5981
5982 if (cpumask_test_cpu(i, covered)) 5982 if (cpumask_test_cpu(i, covered))
5983 continue; 5983 continue;
5984 5984
5985 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5985 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
5986 GFP_KERNEL, cpu_to_node(cpu)); 5986 GFP_KERNEL, cpu_to_node(cpu));
5987 5987
5988 if (!sg) 5988 if (!sg)
5989 goto fail; 5989 goto fail;
5990 5990
5991 sg_span = sched_group_cpus(sg); 5991 sg_span = sched_group_cpus(sg);
5992 5992
5993 child = *per_cpu_ptr(sdd->sd, i); 5993 child = *per_cpu_ptr(sdd->sd, i);
5994 if (child->child) { 5994 if (child->child) {
5995 child = child->child; 5995 child = child->child;
5996 cpumask_copy(sg_span, sched_domain_span(child)); 5996 cpumask_copy(sg_span, sched_domain_span(child));
5997 } else 5997 } else
5998 cpumask_set_cpu(i, sg_span); 5998 cpumask_set_cpu(i, sg_span);
5999 5999
6000 cpumask_or(covered, covered, sg_span); 6000 cpumask_or(covered, covered, sg_span);
6001 6001
6002 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); 6002 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
6003 atomic_inc(&sg->sgp->ref); 6003 atomic_inc(&sg->sgp->ref);
6004 6004
6005 if (cpumask_test_cpu(cpu, sg_span)) 6005 if (cpumask_test_cpu(cpu, sg_span))
6006 groups = sg; 6006 groups = sg;
6007 6007
6008 if (!first) 6008 if (!first)
6009 first = sg; 6009 first = sg;
6010 if (last) 6010 if (last)
6011 last->next = sg; 6011 last->next = sg;
6012 last = sg; 6012 last = sg;
6013 last->next = first; 6013 last->next = first;
6014 } 6014 }
6015 sd->groups = groups; 6015 sd->groups = groups;
6016 6016
6017 return 0; 6017 return 0;
6018 6018
6019 fail: 6019 fail:
6020 free_sched_groups(first, 0); 6020 free_sched_groups(first, 0);
6021 6021
6022 return -ENOMEM; 6022 return -ENOMEM;
6023 } 6023 }
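As a side note on the data layout: build_overlap_sched_groups() above allocates each struct sched_group with cpumask_size() extra bytes, so the group's cpumask lives in trailing storage of the same allocation and sched_group_cpus() simply points into it. A minimal userspace sketch of that trailing-storage idiom follows; struct node and MASK_BYTES are invented stand-ins, not kernel names.

#include <stdlib.h>

#define MASK_BYTES 16			/* stand-in for cpumask_size() */

struct node {
	struct node *next;
	unsigned long mask[];		/* mask storage follows the struct */
};

static unsigned long *node_mask(struct node *n)
{
	return n->mask;			/* analogue of sched_group_cpus() */
}

int main(void)
{
	/* one allocation covers the struct plus its variable-sized mask */
	struct node *n = calloc(1, sizeof(*n) + MASK_BYTES);

	if (!n)
		return 1;

	node_mask(n)[0] |= 1UL;		/* set a bit in the trailing storage */
	free(n);
	return 0;
}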
6024 6024
6025 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) 6025 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6026 { 6026 {
6027 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 6027 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6028 struct sched_domain *child = sd->child; 6028 struct sched_domain *child = sd->child;
6029 6029
6030 if (child) 6030 if (child)
6031 cpu = cpumask_first(sched_domain_span(child)); 6031 cpu = cpumask_first(sched_domain_span(child));
6032 6032
6033 if (sg) { 6033 if (sg) {
6034 *sg = *per_cpu_ptr(sdd->sg, cpu); 6034 *sg = *per_cpu_ptr(sdd->sg, cpu);
6035 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); 6035 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
6036 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ 6036 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
6037 } 6037 }
6038 6038
6039 return cpu; 6039 return cpu;
6040 } 6040 }
6041 6041
6042 /* 6042 /*
6043 * build_sched_groups will build a circular linked list of the groups 6043 * build_sched_groups will build a circular linked list of the groups
6044 * covered by the given span, and will set each group's ->cpumask correctly, 6044 * covered by the given span, and will set each group's ->cpumask correctly,
6045 * and ->cpu_power to 0. 6045 * and ->cpu_power to 0.
6046 * 6046 *
6047 * Assumes the sched_domain tree is fully constructed 6047 * Assumes the sched_domain tree is fully constructed
6048 */ 6048 */
6049 static int 6049 static int
6050 build_sched_groups(struct sched_domain *sd, int cpu) 6050 build_sched_groups(struct sched_domain *sd, int cpu)
6051 { 6051 {
6052 struct sched_group *first = NULL, *last = NULL; 6052 struct sched_group *first = NULL, *last = NULL;
6053 struct sd_data *sdd = sd->private; 6053 struct sd_data *sdd = sd->private;
6054 const struct cpumask *span = sched_domain_span(sd); 6054 const struct cpumask *span = sched_domain_span(sd);
6055 struct cpumask *covered; 6055 struct cpumask *covered;
6056 int i; 6056 int i;
6057 6057
6058 get_group(cpu, sdd, &sd->groups); 6058 get_group(cpu, sdd, &sd->groups);
6059 atomic_inc(&sd->groups->ref); 6059 atomic_inc(&sd->groups->ref);
6060 6060
6061 if (cpu != cpumask_first(sched_domain_span(sd))) 6061 if (cpu != cpumask_first(sched_domain_span(sd)))
6062 return 0; 6062 return 0;
6063 6063
6064 lockdep_assert_held(&sched_domains_mutex); 6064 lockdep_assert_held(&sched_domains_mutex);
6065 covered = sched_domains_tmpmask; 6065 covered = sched_domains_tmpmask;
6066 6066
6067 cpumask_clear(covered); 6067 cpumask_clear(covered);
6068 6068
6069 for_each_cpu(i, span) { 6069 for_each_cpu(i, span) {
6070 struct sched_group *sg; 6070 struct sched_group *sg;
6071 int group = get_group(i, sdd, &sg); 6071 int group = get_group(i, sdd, &sg);
6072 int j; 6072 int j;
6073 6073
6074 if (cpumask_test_cpu(i, covered)) 6074 if (cpumask_test_cpu(i, covered))
6075 continue; 6075 continue;
6076 6076
6077 cpumask_clear(sched_group_cpus(sg)); 6077 cpumask_clear(sched_group_cpus(sg));
6078 sg->sgp->power = 0; 6078 sg->sgp->power = 0;
6079 6079
6080 for_each_cpu(j, span) { 6080 for_each_cpu(j, span) {
6081 if (get_group(j, sdd, NULL) != group) 6081 if (get_group(j, sdd, NULL) != group)
6082 continue; 6082 continue;
6083 6083
6084 cpumask_set_cpu(j, covered); 6084 cpumask_set_cpu(j, covered);
6085 cpumask_set_cpu(j, sched_group_cpus(sg)); 6085 cpumask_set_cpu(j, sched_group_cpus(sg));
6086 } 6086 }
6087 6087
6088 if (!first) 6088 if (!first)
6089 first = sg; 6089 first = sg;
6090 if (last) 6090 if (last)
6091 last->next = sg; 6091 last->next = sg;
6092 last = sg; 6092 last = sg;
6093 } 6093 }
6094 last->next = first; 6094 last->next = first;
6095 6095
6096 return 0; 6096 return 0;
6097 } 6097 }
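As the comment before build_sched_groups() notes, the groups end up on a circular singly linked list (last->next = first). Below is a small standalone sketch of walking such a ring with the do/while idiom that init_sched_groups_power() uses just after this; struct ring_node is invented for illustration and is not a kernel type.

#include <stdio.h>

struct ring_node {
	int id;
	struct ring_node *next;
};

static void walk_ring(struct ring_node *head)
{
	struct ring_node *n = head;

	if (!head)
		return;

	do {				/* visits every node exactly once */
		printf("group %d\n", n->id);
		n = n->next;
	} while (n != head);		/* stop once we are back at the start */
}

int main(void)
{
	struct ring_node c = { 2, NULL }, b = { 1, &c }, a = { 0, &b };

	c.next = &a;			/* close the ring: last->next = first */
	walk_ring(&a);
	return 0;
}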
6098 6098
6099 /* 6099 /*
6100 * Initialize sched groups cpu_power. 6100 * Initialize sched groups cpu_power.
6101 * 6101 *
6102 * cpu_power indicates the capacity of sched group, which is used while 6102 * cpu_power indicates the capacity of sched group, which is used while
6103 * distributing the load between different sched groups in a sched domain. 6103 * distributing the load between different sched groups in a sched domain.
6104 * Typically cpu_power for all the groups in a sched domain will be the same 6104 * Typically cpu_power for all the groups in a sched domain will be the same
6105 * unless there are asymmetries in the topology. If there are asymmetries, a 6105 * unless there are asymmetries in the topology. If there are asymmetries, a
6106 * group with more cpu_power will pick up more load than a group with 6106 * group with more cpu_power will pick up more load than a group with
6107 * less cpu_power. 6107 * less cpu_power.
6108 */ 6108 */
6109 static void init_sched_groups_power(int cpu, struct sched_domain *sd) 6109 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6110 { 6110 {
6111 struct sched_group *sg = sd->groups; 6111 struct sched_group *sg = sd->groups;
6112 6112
6113 WARN_ON(!sd || !sg); 6113 WARN_ON(!sd || !sg);
6114 6114
6115 do { 6115 do {
6116 sg->group_weight = cpumask_weight(sched_group_cpus(sg)); 6116 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
6117 sg = sg->next; 6117 sg = sg->next;
6118 } while (sg != sd->groups); 6118 } while (sg != sd->groups);
6119 6119
6120 if (cpu != group_first_cpu(sg)) 6120 if (cpu != group_first_cpu(sg))
6121 return; 6121 return;
6122 6122
6123 update_group_power(sd, cpu); 6123 update_group_power(sd, cpu);
6124 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); 6124 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
6125 } 6125 }
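The comment above says load is distributed between groups in proportion to their cpu_power. Here is a tiny worked example of that proportionality only, not of the real load-balancer arithmetic: with powers 1024 and 2048, the second group should carry roughly two thirds of the load.

#include <stdio.h>

int main(void)
{
	unsigned long power[] = { 1024, 2048 };
	unsigned long total = power[0] + power[1];	/* 3072 */
	unsigned long load = 3000;			/* arbitrary load units */
	int i;

	for (i = 0; i < 2; i++)				/* prints 1000, then 2000 */
		printf("group %d picks up ~%lu of %lu\n",
		       i, load * power[i] / total, load);
	return 0;
}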
6126 6126
6127 int __weak arch_sd_sibling_asym_packing(void) 6127 int __weak arch_sd_sibling_asym_packing(void)
6128 { 6128 {
6129 return 0*SD_ASYM_PACKING; 6129 return 0*SD_ASYM_PACKING;
6130 } 6130 }
6131 6131
6132 /* 6132 /*
6133 * Initializers for schedule domains 6133 * Initializers for schedule domains
6134 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 6134 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
6135 */ 6135 */
6136 6136
6137 #ifdef CONFIG_SCHED_DEBUG 6137 #ifdef CONFIG_SCHED_DEBUG
6138 # define SD_INIT_NAME(sd, type) sd->name = #type 6138 # define SD_INIT_NAME(sd, type) sd->name = #type
6139 #else 6139 #else
6140 # define SD_INIT_NAME(sd, type) do { } while (0) 6140 # define SD_INIT_NAME(sd, type) do { } while (0)
6141 #endif 6141 #endif
6142 6142
6143 #define SD_INIT_FUNC(type) \ 6143 #define SD_INIT_FUNC(type) \
6144 static noinline struct sched_domain * \ 6144 static noinline struct sched_domain * \
6145 sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ 6145 sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
6146 { \ 6146 { \
6147 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ 6147 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
6148 *sd = SD_##type##_INIT; \ 6148 *sd = SD_##type##_INIT; \
6149 SD_INIT_NAME(sd, type); \ 6149 SD_INIT_NAME(sd, type); \
6150 sd->private = &tl->data; \ 6150 sd->private = &tl->data; \
6151 return sd; \ 6151 return sd; \
6152 } 6152 }
6153 6153
6154 SD_INIT_FUNC(CPU) 6154 SD_INIT_FUNC(CPU)
6155 #ifdef CONFIG_NUMA 6155 #ifdef CONFIG_NUMA
6156 SD_INIT_FUNC(ALLNODES) 6156 SD_INIT_FUNC(ALLNODES)
6157 SD_INIT_FUNC(NODE) 6157 SD_INIT_FUNC(NODE)
6158 #endif 6158 #endif
6159 #ifdef CONFIG_SCHED_SMT 6159 #ifdef CONFIG_SCHED_SMT
6160 SD_INIT_FUNC(SIBLING) 6160 SD_INIT_FUNC(SIBLING)
6161 #endif 6161 #endif
6162 #ifdef CONFIG_SCHED_MC 6162 #ifdef CONFIG_SCHED_MC
6163 SD_INIT_FUNC(MC) 6163 SD_INIT_FUNC(MC)
6164 #endif 6164 #endif
6165 #ifdef CONFIG_SCHED_BOOK 6165 #ifdef CONFIG_SCHED_BOOK
6166 SD_INIT_FUNC(BOOK) 6166 SD_INIT_FUNC(BOOK)
6167 #endif 6167 #endif
6168 6168
6169 static int default_relax_domain_level = -1; 6169 static int default_relax_domain_level = -1;
6170 int sched_domain_level_max; 6170 int sched_domain_level_max;
6171 6171
6172 static int __init setup_relax_domain_level(char *str) 6172 static int __init setup_relax_domain_level(char *str)
6173 { 6173 {
6174 unsigned long val; 6174 unsigned long val;
6175 6175
6176 val = simple_strtoul(str, NULL, 0); 6176 val = simple_strtoul(str, NULL, 0);
6177 if (val < sched_domain_level_max) 6177 if (val < sched_domain_level_max)
6178 default_relax_domain_level = val; 6178 default_relax_domain_level = val;
6179 6179
6180 return 1; 6180 return 1;
6181 } 6181 }
6182 __setup("relax_domain_level=", setup_relax_domain_level); 6182 __setup("relax_domain_level=", setup_relax_domain_level);
6183 6183
6184 static void set_domain_attribute(struct sched_domain *sd, 6184 static void set_domain_attribute(struct sched_domain *sd,
6185 struct sched_domain_attr *attr) 6185 struct sched_domain_attr *attr)
6186 { 6186 {
6187 int request; 6187 int request;
6188 6188
6189 if (!attr || attr->relax_domain_level < 0) { 6189 if (!attr || attr->relax_domain_level < 0) {
6190 if (default_relax_domain_level < 0) 6190 if (default_relax_domain_level < 0)
6191 return; 6191 return;
6192 else 6192 else
6193 request = default_relax_domain_level; 6193 request = default_relax_domain_level;
6194 } else 6194 } else
6195 request = attr->relax_domain_level; 6195 request = attr->relax_domain_level;
6196 if (request < sd->level) { 6196 if (request < sd->level) {
6197 /* turn off idle balance on this domain */ 6197 /* turn off idle balance on this domain */
6198 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 6198 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6199 } else { 6199 } else {
6200 /* turn on idle balance on this domain */ 6200 /* turn on idle balance on this domain */
6201 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 6201 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6202 } 6202 }
6203 } 6203 }
6204 6204
6205 static void __sdt_free(const struct cpumask *cpu_map); 6205 static void __sdt_free(const struct cpumask *cpu_map);
6206 static int __sdt_alloc(const struct cpumask *cpu_map); 6206 static int __sdt_alloc(const struct cpumask *cpu_map);
6207 6207
6208 static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 6208 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6209 const struct cpumask *cpu_map) 6209 const struct cpumask *cpu_map)
6210 { 6210 {
6211 switch (what) { 6211 switch (what) {
6212 case sa_rootdomain: 6212 case sa_rootdomain:
6213 if (!atomic_read(&d->rd->refcount)) 6213 if (!atomic_read(&d->rd->refcount))
6214 free_rootdomain(&d->rd->rcu); /* fall through */ 6214 free_rootdomain(&d->rd->rcu); /* fall through */
6215 case sa_sd: 6215 case sa_sd:
6216 free_percpu(d->sd); /* fall through */ 6216 free_percpu(d->sd); /* fall through */
6217 case sa_sd_storage: 6217 case sa_sd_storage:
6218 __sdt_free(cpu_map); /* fall through */ 6218 __sdt_free(cpu_map); /* fall through */
6219 case sa_none: 6219 case sa_none:
6220 break; 6220 break;
6221 } 6221 }
6222 } 6222 }
6223 6223
6224 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 6224 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
6225 const struct cpumask *cpu_map) 6225 const struct cpumask *cpu_map)
6226 { 6226 {
6227 memset(d, 0, sizeof(*d)); 6227 memset(d, 0, sizeof(*d));
6228 6228
6229 if (__sdt_alloc(cpu_map)) 6229 if (__sdt_alloc(cpu_map))
6230 return sa_sd_storage; 6230 return sa_sd_storage;
6231 d->sd = alloc_percpu(struct sched_domain *); 6231 d->sd = alloc_percpu(struct sched_domain *);
6232 if (!d->sd) 6232 if (!d->sd)
6233 return sa_sd_storage; 6233 return sa_sd_storage;
6234 d->rd = alloc_rootdomain(); 6234 d->rd = alloc_rootdomain();
6235 if (!d->rd) 6235 if (!d->rd)
6236 return sa_sd; 6236 return sa_sd;
6237 return sa_rootdomain; 6237 return sa_rootdomain;
6238 } 6238 }
6239 6239
6240 /* 6240 /*
6241 * NULL the sd_data elements we've used to build the sched_domain and 6241 * NULL the sd_data elements we've used to build the sched_domain and
6242 * sched_group structure so that the subsequent __free_domain_allocs() 6242 * sched_group structure so that the subsequent __free_domain_allocs()
6243 * will not free the data we're using. 6243 * will not free the data we're using.
6244 */ 6244 */
6245 static void claim_allocations(int cpu, struct sched_domain *sd) 6245 static void claim_allocations(int cpu, struct sched_domain *sd)
6246 { 6246 {
6247 struct sd_data *sdd = sd->private; 6247 struct sd_data *sdd = sd->private;
6248 6248
6249 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 6249 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
6250 *per_cpu_ptr(sdd->sd, cpu) = NULL; 6250 *per_cpu_ptr(sdd->sd, cpu) = NULL;
6251 6251
6252 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 6252 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
6253 *per_cpu_ptr(sdd->sg, cpu) = NULL; 6253 *per_cpu_ptr(sdd->sg, cpu) = NULL;
6254 6254
6255 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) 6255 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
6256 *per_cpu_ptr(sdd->sgp, cpu) = NULL; 6256 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
6257 } 6257 }
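claim_allocations() realizes the pattern its comment describes: ownership is claimed by NULLing the per-cpu slot, and the later blanket teardown frees only what is still in the slots, relying on kfree(NULL) being safe (as the partition_sched_domains() code notes further down). A standalone sketch of the same claim-by-NULL pattern, with invented names:

#include <stdlib.h>

#define NSLOTS 4

static void *slots[NSLOTS];

static void *claim_slot(int i)
{
	void *p = slots[i];

	slots[i] = NULL;		/* the cleanup pass will skip this one */
	return p;
}

static void cleanup_all(void)
{
	int i;

	for (i = 0; i < NSLOTS; i++)
		free(slots[i]);		/* free(NULL) is a no-op */
}

int main(void)
{
	void *kept;
	int i;

	for (i = 0; i < NSLOTS; i++)
		slots[i] = malloc(32);

	kept = claim_slot(1);		/* ownership moves to the caller */
	cleanup_all();			/* frees slots 0, 2 and 3 only */
	free(kept);
	return 0;
}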
6258 6258
6259 #ifdef CONFIG_SCHED_SMT 6259 #ifdef CONFIG_SCHED_SMT
6260 static const struct cpumask *cpu_smt_mask(int cpu) 6260 static const struct cpumask *cpu_smt_mask(int cpu)
6261 { 6261 {
6262 return topology_thread_cpumask(cpu); 6262 return topology_thread_cpumask(cpu);
6263 } 6263 }
6264 #endif 6264 #endif
6265 6265
6266 /* 6266 /*
6267 * Topology list, bottom-up. 6267 * Topology list, bottom-up.
6268 */ 6268 */
6269 static struct sched_domain_topology_level default_topology[] = { 6269 static struct sched_domain_topology_level default_topology[] = {
6270 #ifdef CONFIG_SCHED_SMT 6270 #ifdef CONFIG_SCHED_SMT
6271 { sd_init_SIBLING, cpu_smt_mask, }, 6271 { sd_init_SIBLING, cpu_smt_mask, },
6272 #endif 6272 #endif
6273 #ifdef CONFIG_SCHED_MC 6273 #ifdef CONFIG_SCHED_MC
6274 { sd_init_MC, cpu_coregroup_mask, }, 6274 { sd_init_MC, cpu_coregroup_mask, },
6275 #endif 6275 #endif
6276 #ifdef CONFIG_SCHED_BOOK 6276 #ifdef CONFIG_SCHED_BOOK
6277 { sd_init_BOOK, cpu_book_mask, }, 6277 { sd_init_BOOK, cpu_book_mask, },
6278 #endif 6278 #endif
6279 { sd_init_CPU, cpu_cpu_mask, }, 6279 { sd_init_CPU, cpu_cpu_mask, },
6280 #ifdef CONFIG_NUMA 6280 #ifdef CONFIG_NUMA
6281 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, 6281 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
6282 { sd_init_ALLNODES, cpu_allnodes_mask, }, 6282 { sd_init_ALLNODES, cpu_allnodes_mask, },
6283 #endif 6283 #endif
6284 { NULL, }, 6284 { NULL, },
6285 }; 6285 };
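The table above is ordered bottom-up and terminated by an entry whose ->init is NULL, which is what the for (tl = sched_domain_topology; tl->init; tl++) loops below key off. Purely as an illustration, and not something this patch does, a hypothetical alternative table reusing the initializers and mask helpers defined earlier in this file could look like:

static struct sched_domain_topology_level small_box_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ sd_init_SIBLING, cpu_smt_mask, },
#endif
	{ sd_init_CPU, cpu_cpu_mask, },
	{ NULL, },
};

/* hypothetically installed early, e.g.: sched_domain_topology = small_box_topology; */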
6286 6286
6287 static struct sched_domain_topology_level *sched_domain_topology = default_topology; 6287 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6288 6288
6289 static int __sdt_alloc(const struct cpumask *cpu_map) 6289 static int __sdt_alloc(const struct cpumask *cpu_map)
6290 { 6290 {
6291 struct sched_domain_topology_level *tl; 6291 struct sched_domain_topology_level *tl;
6292 int j; 6292 int j;
6293 6293
6294 for (tl = sched_domain_topology; tl->init; tl++) { 6294 for (tl = sched_domain_topology; tl->init; tl++) {
6295 struct sd_data *sdd = &tl->data; 6295 struct sd_data *sdd = &tl->data;
6296 6296
6297 sdd->sd = alloc_percpu(struct sched_domain *); 6297 sdd->sd = alloc_percpu(struct sched_domain *);
6298 if (!sdd->sd) 6298 if (!sdd->sd)
6299 return -ENOMEM; 6299 return -ENOMEM;
6300 6300
6301 sdd->sg = alloc_percpu(struct sched_group *); 6301 sdd->sg = alloc_percpu(struct sched_group *);
6302 if (!sdd->sg) 6302 if (!sdd->sg)
6303 return -ENOMEM; 6303 return -ENOMEM;
6304 6304
6305 sdd->sgp = alloc_percpu(struct sched_group_power *); 6305 sdd->sgp = alloc_percpu(struct sched_group_power *);
6306 if (!sdd->sgp) 6306 if (!sdd->sgp)
6307 return -ENOMEM; 6307 return -ENOMEM;
6308 6308
6309 for_each_cpu(j, cpu_map) { 6309 for_each_cpu(j, cpu_map) {
6310 struct sched_domain *sd; 6310 struct sched_domain *sd;
6311 struct sched_group *sg; 6311 struct sched_group *sg;
6312 struct sched_group_power *sgp; 6312 struct sched_group_power *sgp;
6313 6313
6314 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), 6314 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
6315 GFP_KERNEL, cpu_to_node(j)); 6315 GFP_KERNEL, cpu_to_node(j));
6316 if (!sd) 6316 if (!sd)
6317 return -ENOMEM; 6317 return -ENOMEM;
6318 6318
6319 *per_cpu_ptr(sdd->sd, j) = sd; 6319 *per_cpu_ptr(sdd->sd, j) = sd;
6320 6320
6321 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6321 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6322 GFP_KERNEL, cpu_to_node(j)); 6322 GFP_KERNEL, cpu_to_node(j));
6323 if (!sg) 6323 if (!sg)
6324 return -ENOMEM; 6324 return -ENOMEM;
6325 6325
6326 *per_cpu_ptr(sdd->sg, j) = sg; 6326 *per_cpu_ptr(sdd->sg, j) = sg;
6327 6327
6328 sgp = kzalloc_node(sizeof(struct sched_group_power), 6328 sgp = kzalloc_node(sizeof(struct sched_group_power),
6329 GFP_KERNEL, cpu_to_node(j)); 6329 GFP_KERNEL, cpu_to_node(j));
6330 if (!sgp) 6330 if (!sgp)
6331 return -ENOMEM; 6331 return -ENOMEM;
6332 6332
6333 *per_cpu_ptr(sdd->sgp, j) = sgp; 6333 *per_cpu_ptr(sdd->sgp, j) = sgp;
6334 } 6334 }
6335 } 6335 }
6336 6336
6337 return 0; 6337 return 0;
6338 } 6338 }
6339 6339
6340 static void __sdt_free(const struct cpumask *cpu_map) 6340 static void __sdt_free(const struct cpumask *cpu_map)
6341 { 6341 {
6342 struct sched_domain_topology_level *tl; 6342 struct sched_domain_topology_level *tl;
6343 int j; 6343 int j;
6344 6344
6345 for (tl = sched_domain_topology; tl->init; tl++) { 6345 for (tl = sched_domain_topology; tl->init; tl++) {
6346 struct sd_data *sdd = &tl->data; 6346 struct sd_data *sdd = &tl->data;
6347 6347
6348 for_each_cpu(j, cpu_map) { 6348 for_each_cpu(j, cpu_map) {
6349 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); 6349 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
6350 if (sd && (sd->flags & SD_OVERLAP)) 6350 if (sd && (sd->flags & SD_OVERLAP))
6351 free_sched_groups(sd->groups, 0); 6351 free_sched_groups(sd->groups, 0);
6352 kfree(*per_cpu_ptr(sdd->sd, j)); 6352 kfree(*per_cpu_ptr(sdd->sd, j));
6353 kfree(*per_cpu_ptr(sdd->sg, j)); 6353 kfree(*per_cpu_ptr(sdd->sg, j));
6354 kfree(*per_cpu_ptr(sdd->sgp, j)); 6354 kfree(*per_cpu_ptr(sdd->sgp, j));
6355 } 6355 }
6356 free_percpu(sdd->sd); 6356 free_percpu(sdd->sd);
6357 free_percpu(sdd->sg); 6357 free_percpu(sdd->sg);
6358 free_percpu(sdd->sgp); 6358 free_percpu(sdd->sgp);
6359 } 6359 }
6360 } 6360 }
6361 6361
6362 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, 6362 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6363 struct s_data *d, const struct cpumask *cpu_map, 6363 struct s_data *d, const struct cpumask *cpu_map,
6364 struct sched_domain_attr *attr, struct sched_domain *child, 6364 struct sched_domain_attr *attr, struct sched_domain *child,
6365 int cpu) 6365 int cpu)
6366 { 6366 {
6367 struct sched_domain *sd = tl->init(tl, cpu); 6367 struct sched_domain *sd = tl->init(tl, cpu);
6368 if (!sd) 6368 if (!sd)
6369 return child; 6369 return child;
6370 6370
6371 set_domain_attribute(sd, attr); 6371 set_domain_attribute(sd, attr);
6372 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 6372 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6373 if (child) { 6373 if (child) {
6374 sd->level = child->level + 1; 6374 sd->level = child->level + 1;
6375 sched_domain_level_max = max(sched_domain_level_max, sd->level); 6375 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6376 child->parent = sd; 6376 child->parent = sd;
6377 } 6377 }
6378 sd->child = child; 6378 sd->child = child;
6379 6379
6380 return sd; 6380 return sd;
6381 } 6381 }
6382 6382
6383 /* 6383 /*
6384 * Build sched domains for a given set of cpus and attach the sched domains 6384 * Build sched domains for a given set of cpus and attach the sched domains
6385 * to the individual cpus 6385 * to the individual cpus
6386 */ 6386 */
6387 static int build_sched_domains(const struct cpumask *cpu_map, 6387 static int build_sched_domains(const struct cpumask *cpu_map,
6388 struct sched_domain_attr *attr) 6388 struct sched_domain_attr *attr)
6389 { 6389 {
6390 enum s_alloc alloc_state = sa_none; 6390 enum s_alloc alloc_state = sa_none;
6391 struct sched_domain *sd; 6391 struct sched_domain *sd;
6392 struct s_data d; 6392 struct s_data d;
6393 int i, ret = -ENOMEM; 6393 int i, ret = -ENOMEM;
6394 6394
6395 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 6395 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
6396 if (alloc_state != sa_rootdomain) 6396 if (alloc_state != sa_rootdomain)
6397 goto error; 6397 goto error;
6398 6398
6399 /* Set up domains for cpus specified by the cpu_map. */ 6399 /* Set up domains for cpus specified by the cpu_map. */
6400 for_each_cpu(i, cpu_map) { 6400 for_each_cpu(i, cpu_map) {
6401 struct sched_domain_topology_level *tl; 6401 struct sched_domain_topology_level *tl;
6402 6402
6403 sd = NULL; 6403 sd = NULL;
6404 for (tl = sched_domain_topology; tl->init; tl++) { 6404 for (tl = sched_domain_topology; tl->init; tl++) {
6405 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); 6405 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
6406 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) 6406 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
6407 sd->flags |= SD_OVERLAP; 6407 sd->flags |= SD_OVERLAP;
6408 if (cpumask_equal(cpu_map, sched_domain_span(sd))) 6408 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6409 break; 6409 break;
6410 } 6410 }
6411 6411
6412 while (sd->child) 6412 while (sd->child)
6413 sd = sd->child; 6413 sd = sd->child;
6414 6414
6415 *per_cpu_ptr(d.sd, i) = sd; 6415 *per_cpu_ptr(d.sd, i) = sd;
6416 } 6416 }
6417 6417
6418 /* Build the groups for the domains */ 6418 /* Build the groups for the domains */
6419 for_each_cpu(i, cpu_map) { 6419 for_each_cpu(i, cpu_map) {
6420 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6420 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6421 sd->span_weight = cpumask_weight(sched_domain_span(sd)); 6421 sd->span_weight = cpumask_weight(sched_domain_span(sd));
6422 if (sd->flags & SD_OVERLAP) { 6422 if (sd->flags & SD_OVERLAP) {
6423 if (build_overlap_sched_groups(sd, i)) 6423 if (build_overlap_sched_groups(sd, i))
6424 goto error; 6424 goto error;
6425 } else { 6425 } else {
6426 if (build_sched_groups(sd, i)) 6426 if (build_sched_groups(sd, i))
6427 goto error; 6427 goto error;
6428 } 6428 }
6429 } 6429 }
6430 } 6430 }
6431 6431
6432 /* Calculate CPU power for physical packages and nodes */ 6432 /* Calculate CPU power for physical packages and nodes */
6433 for (i = nr_cpumask_bits-1; i >= 0; i--) { 6433 for (i = nr_cpumask_bits-1; i >= 0; i--) {
6434 if (!cpumask_test_cpu(i, cpu_map)) 6434 if (!cpumask_test_cpu(i, cpu_map))
6435 continue; 6435 continue;
6436 6436
6437 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6437 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6438 claim_allocations(i, sd); 6438 claim_allocations(i, sd);
6439 init_sched_groups_power(i, sd); 6439 init_sched_groups_power(i, sd);
6440 } 6440 }
6441 } 6441 }
6442 6442
6443 /* Attach the domains */ 6443 /* Attach the domains */
6444 rcu_read_lock(); 6444 rcu_read_lock();
6445 for_each_cpu(i, cpu_map) { 6445 for_each_cpu(i, cpu_map) {
6446 sd = *per_cpu_ptr(d.sd, i); 6446 sd = *per_cpu_ptr(d.sd, i);
6447 cpu_attach_domain(sd, d.rd, i); 6447 cpu_attach_domain(sd, d.rd, i);
6448 } 6448 }
6449 rcu_read_unlock(); 6449 rcu_read_unlock();
6450 6450
6451 ret = 0; 6451 ret = 0;
6452 error: 6452 error:
6453 __free_domain_allocs(&d, alloc_state, cpu_map); 6453 __free_domain_allocs(&d, alloc_state, cpu_map);
6454 return ret; 6454 return ret;
6455 } 6455 }
6456 6456
6457 static cpumask_var_t *doms_cur; /* current sched domains */ 6457 static cpumask_var_t *doms_cur; /* current sched domains */
6458 static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 6458 static int ndoms_cur; /* number of sched domains in 'doms_cur' */
6459 static struct sched_domain_attr *dattr_cur; 6459 static struct sched_domain_attr *dattr_cur;
6460 /* attributes of custom domains in 'doms_cur' */ 6460 /* attributes of custom domains in 'doms_cur' */
6461 6461
6462 /* 6462 /*
6463 * Special case: If a kmalloc of a doms_cur partition (array of 6463 * Special case: If a kmalloc of a doms_cur partition (array of
6464 * cpumask) fails, then fall back to a single sched domain, 6464 * cpumask) fails, then fall back to a single sched domain,
6465 * as determined by the single cpumask fallback_doms. 6465 * as determined by the single cpumask fallback_doms.
6466 */ 6466 */
6467 static cpumask_var_t fallback_doms; 6467 static cpumask_var_t fallback_doms;
6468 6468
6469 /* 6469 /*
6470 * arch_update_cpu_topology lets virtualized architectures update the 6470 * arch_update_cpu_topology lets virtualized architectures update the
6471 * cpu core maps. It is supposed to return 1 if the topology changed 6471 * cpu core maps. It is supposed to return 1 if the topology changed
6472 * or 0 if it stayed the same. 6472 * or 0 if it stayed the same.
6473 */ 6473 */
6474 int __attribute__((weak)) arch_update_cpu_topology(void) 6474 int __attribute__((weak)) arch_update_cpu_topology(void)
6475 { 6475 {
6476 return 0; 6476 return 0;
6477 } 6477 }
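The weak definition above documents the contract: return 1 when the cpu core maps changed, 0 when they stayed the same. A hypothetical strong definition an architecture might provide is sketched below; topology_generation is invented purely to illustrate the change tracking, and real implementations keep such state in arch-specific ways.

/* hypothetical arch-side sketch, not part of this patch */
static unsigned long topology_generation;	/* bumped by (hypothetical) arch hotplug code */

int arch_update_cpu_topology(void)
{
	static unsigned long seen;

	if (topology_generation == seen)
		return 0;			/* topology stayed the same */

	seen = topology_generation;
	return 1;				/* tell callers to rebuild the domains */
}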
6478 6478
6479 cpumask_var_t *alloc_sched_domains(unsigned int ndoms) 6479 cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
6480 { 6480 {
6481 int i; 6481 int i;
6482 cpumask_var_t *doms; 6482 cpumask_var_t *doms;
6483 6483
6484 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); 6484 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
6485 if (!doms) 6485 if (!doms)
6486 return NULL; 6486 return NULL;
6487 for (i = 0; i < ndoms; i++) { 6487 for (i = 0; i < ndoms; i++) {
6488 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { 6488 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
6489 free_sched_domains(doms, i); 6489 free_sched_domains(doms, i);
6490 return NULL; 6490 return NULL;
6491 } 6491 }
6492 } 6492 }
6493 return doms; 6493 return doms;
6494 } 6494 }
6495 6495
6496 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) 6496 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
6497 { 6497 {
6498 unsigned int i; 6498 unsigned int i;
6499 for (i = 0; i < ndoms; i++) 6499 for (i = 0; i < ndoms; i++)
6500 free_cpumask_var(doms[i]); 6500 free_cpumask_var(doms[i]);
6501 kfree(doms); 6501 kfree(doms);
6502 } 6502 }
6503 6503
6504 /* 6504 /*
6505 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6505 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6506 * For now this just excludes isolated cpus, but could be used to 6506 * For now this just excludes isolated cpus, but could be used to
6507 * exclude other special cases in the future. 6507 * exclude other special cases in the future.
6508 */ 6508 */
6509 static int init_sched_domains(const struct cpumask *cpu_map) 6509 static int init_sched_domains(const struct cpumask *cpu_map)
6510 { 6510 {
6511 int err; 6511 int err;
6512 6512
6513 arch_update_cpu_topology(); 6513 arch_update_cpu_topology();
6514 ndoms_cur = 1; 6514 ndoms_cur = 1;
6515 doms_cur = alloc_sched_domains(ndoms_cur); 6515 doms_cur = alloc_sched_domains(ndoms_cur);
6516 if (!doms_cur) 6516 if (!doms_cur)
6517 doms_cur = &fallback_doms; 6517 doms_cur = &fallback_doms;
6518 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 6518 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6519 dattr_cur = NULL; 6519 dattr_cur = NULL;
6520 err = build_sched_domains(doms_cur[0], NULL); 6520 err = build_sched_domains(doms_cur[0], NULL);
6521 register_sched_domain_sysctl(); 6521 register_sched_domain_sysctl();
6522 6522
6523 return err; 6523 return err;
6524 } 6524 }
6525 6525
6526 /* 6526 /*
6527 * Detach sched domains from a group of cpus specified in cpu_map 6527 * Detach sched domains from a group of cpus specified in cpu_map
6528 * These cpus will now be attached to the NULL domain 6528 * These cpus will now be attached to the NULL domain
6529 */ 6529 */
6530 static void detach_destroy_domains(const struct cpumask *cpu_map) 6530 static void detach_destroy_domains(const struct cpumask *cpu_map)
6531 { 6531 {
6532 int i; 6532 int i;
6533 6533
6534 rcu_read_lock(); 6534 rcu_read_lock();
6535 for_each_cpu(i, cpu_map) 6535 for_each_cpu(i, cpu_map)
6536 cpu_attach_domain(NULL, &def_root_domain, i); 6536 cpu_attach_domain(NULL, &def_root_domain, i);
6537 rcu_read_unlock(); 6537 rcu_read_unlock();
6538 } 6538 }
6539 6539
6540 /* handle null as "default" */ 6540 /* handle null as "default" */
6541 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 6541 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
6542 struct sched_domain_attr *new, int idx_new) 6542 struct sched_domain_attr *new, int idx_new)
6543 { 6543 {
6544 struct sched_domain_attr tmp; 6544 struct sched_domain_attr tmp;
6545 6545
6546 /* fast path */ 6546 /* fast path */
6547 if (!new && !cur) 6547 if (!new && !cur)
6548 return 1; 6548 return 1;
6549 6549
6550 tmp = SD_ATTR_INIT; 6550 tmp = SD_ATTR_INIT;
6551 return !memcmp(cur ? (cur + idx_cur) : &tmp, 6551 return !memcmp(cur ? (cur + idx_cur) : &tmp,
6552 new ? (new + idx_new) : &tmp, 6552 new ? (new + idx_new) : &tmp,
6553 sizeof(struct sched_domain_attr)); 6553 sizeof(struct sched_domain_attr));
6554 } 6554 }
6555 6555
6556 /* 6556 /*
6557 * Partition sched domains as specified by the 'ndoms_new' 6557 * Partition sched domains as specified by the 'ndoms_new'
6558 * cpumasks in the array doms_new[] of cpumasks. This compares 6558 * cpumasks in the array doms_new[] of cpumasks. This compares
6559 * doms_new[] to the current sched domain partitioning, doms_cur[]. 6559 * doms_new[] to the current sched domain partitioning, doms_cur[].
6560 * It destroys each deleted domain and builds each new domain. 6560 * It destroys each deleted domain and builds each new domain.
6561 * 6561 *
6562 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. 6562 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
6563 * The masks don't intersect (don't overlap). We should set up one 6563 * The masks don't intersect (don't overlap). We should set up one
6564 * sched domain for each mask. CPUs not in any of the cpumasks will 6564 * sched domain for each mask. CPUs not in any of the cpumasks will
6565 * not be load balanced. If the same cpumask appears both in the 6565 * not be load balanced. If the same cpumask appears both in the
6566 * current 'doms_cur' domains and in the new 'doms_new', we can leave 6566 * current 'doms_cur' domains and in the new 'doms_new', we can leave
6567 * it as it is. 6567 * it as it is.
6568 * 6568 *
6569 * The passed in 'doms_new' should be allocated using 6569 * The passed in 'doms_new' should be allocated using
6570 * alloc_sched_domains. This routine takes ownership of it and will 6570 * alloc_sched_domains. This routine takes ownership of it and will
6571 * free_sched_domains it when done with it. If the caller failed the 6571 * free_sched_domains it when done with it. If the caller failed the
6572 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, 6572 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
6573 * and partition_sched_domains() will fall back to the single partition 6573 * and partition_sched_domains() will fall back to the single partition
6574 * 'fallback_doms'; this also forces the domains to be rebuilt. 6574 * 'fallback_doms'; this also forces the domains to be rebuilt.
6575 * 6575 *
6576 * If doms_new == NULL it will be replaced with cpu_online_mask. 6576 * If doms_new == NULL it will be replaced with cpu_online_mask.
6577 * ndoms_new == 0 is a special case for destroying existing domains, 6577 * ndoms_new == 0 is a special case for destroying existing domains,
6578 * and it will not create the default domain. 6578 * and it will not create the default domain.
6579 * 6579 *
6580 * Call with hotplug lock held 6580 * Call with hotplug lock held
6581 */ 6581 */
6582 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 6582 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
6583 struct sched_domain_attr *dattr_new) 6583 struct sched_domain_attr *dattr_new)
6584 { 6584 {
6585 int i, j, n; 6585 int i, j, n;
6586 int new_topology; 6586 int new_topology;
6587 6587
6588 mutex_lock(&sched_domains_mutex); 6588 mutex_lock(&sched_domains_mutex);
6589 6589
6590 /* always unregister in case we don't destroy any domains */ 6590 /* always unregister in case we don't destroy any domains */
6591 unregister_sched_domain_sysctl(); 6591 unregister_sched_domain_sysctl();
6592 6592
6593 /* Let architecture update cpu core mappings. */ 6593 /* Let architecture update cpu core mappings. */
6594 new_topology = arch_update_cpu_topology(); 6594 new_topology = arch_update_cpu_topology();
6595 6595
6596 n = doms_new ? ndoms_new : 0; 6596 n = doms_new ? ndoms_new : 0;
6597 6597
6598 /* Destroy deleted domains */ 6598 /* Destroy deleted domains */
6599 for (i = 0; i < ndoms_cur; i++) { 6599 for (i = 0; i < ndoms_cur; i++) {
6600 for (j = 0; j < n && !new_topology; j++) { 6600 for (j = 0; j < n && !new_topology; j++) {
6601 if (cpumask_equal(doms_cur[i], doms_new[j]) 6601 if (cpumask_equal(doms_cur[i], doms_new[j])
6602 && dattrs_equal(dattr_cur, i, dattr_new, j)) 6602 && dattrs_equal(dattr_cur, i, dattr_new, j))
6603 goto match1; 6603 goto match1;
6604 } 6604 }
6605 /* no match - a current sched domain not in new doms_new[] */ 6605 /* no match - a current sched domain not in new doms_new[] */
6606 detach_destroy_domains(doms_cur[i]); 6606 detach_destroy_domains(doms_cur[i]);
6607 match1: 6607 match1:
6608 ; 6608 ;
6609 } 6609 }
6610 6610
6611 if (doms_new == NULL) { 6611 if (doms_new == NULL) {
6612 ndoms_cur = 0; 6612 ndoms_cur = 0;
6613 doms_new = &fallback_doms; 6613 doms_new = &fallback_doms;
6614 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 6614 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
6615 WARN_ON_ONCE(dattr_new); 6615 WARN_ON_ONCE(dattr_new);
6616 } 6616 }
6617 6617
6618 /* Build new domains */ 6618 /* Build new domains */
6619 for (i = 0; i < ndoms_new; i++) { 6619 for (i = 0; i < ndoms_new; i++) {
6620 for (j = 0; j < ndoms_cur && !new_topology; j++) { 6620 for (j = 0; j < ndoms_cur && !new_topology; j++) {
6621 if (cpumask_equal(doms_new[i], doms_cur[j]) 6621 if (cpumask_equal(doms_new[i], doms_cur[j])
6622 && dattrs_equal(dattr_new, i, dattr_cur, j)) 6622 && dattrs_equal(dattr_new, i, dattr_cur, j))
6623 goto match2; 6623 goto match2;
6624 } 6624 }
6625 /* no match - add a new doms_new */ 6625 /* no match - add a new doms_new */
6626 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); 6626 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
6627 match2: 6627 match2:
6628 ; 6628 ;
6629 } 6629 }
6630 6630
6631 /* Remember the new sched domains */ 6631 /* Remember the new sched domains */
6632 if (doms_cur != &fallback_doms) 6632 if (doms_cur != &fallback_doms)
6633 free_sched_domains(doms_cur, ndoms_cur); 6633 free_sched_domains(doms_cur, ndoms_cur);
6634 kfree(dattr_cur); /* kfree(NULL) is safe */ 6634 kfree(dattr_cur); /* kfree(NULL) is safe */
6635 doms_cur = doms_new; 6635 doms_cur = doms_new;
6636 dattr_cur = dattr_new; 6636 dattr_cur = dattr_new;
6637 ndoms_cur = ndoms_new; 6637 ndoms_cur = ndoms_new;
6638 6638
6639 register_sched_domain_sysctl(); 6639 register_sched_domain_sysctl();
6640 6640
6641 mutex_unlock(&sched_domains_mutex); 6641 mutex_unlock(&sched_domains_mutex);
6642 } 6642 }
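A minimal caller sketch for the contract spelled out in the comment above partition_sched_domains(): the array comes from alloc_sched_domains(), ownership passes to partition_sched_domains(), and doms_new == NULL with ndoms_new == 1 is the documented fallback when allocation fails. The function name and the choice of mask are illustrative only.

static void example_repartition(void)
{
	cpumask_var_t *doms = alloc_sched_domains(1);

	if (doms)
		cpumask_copy(doms[0], cpu_active_mask);

	get_online_cpus();				/* "Call with hotplug lock held" */
	if (doms)
		partition_sched_domains(1, doms, NULL);	/* takes ownership of doms */
	else
		partition_sched_domains(1, NULL, NULL);	/* documented fallback path */
	put_online_cpus();
}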
6643 6643
6644 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 6644 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6645 static void reinit_sched_domains(void) 6645 static void reinit_sched_domains(void)
6646 { 6646 {
6647 get_online_cpus(); 6647 get_online_cpus();
6648 6648
6649 /* Destroy domains first to force the rebuild */ 6649 /* Destroy domains first to force the rebuild */
6650 partition_sched_domains(0, NULL, NULL); 6650 partition_sched_domains(0, NULL, NULL);
6651 6651
6652 rebuild_sched_domains(); 6652 rebuild_sched_domains();
6653 put_online_cpus(); 6653 put_online_cpus();
6654 } 6654 }
6655 6655
6656 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 6656 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6657 { 6657 {
6658 unsigned int level = 0; 6658 unsigned int level = 0;
6659 6659
6660 if (sscanf(buf, "%u", &level) != 1) 6660 if (sscanf(buf, "%u", &level) != 1)
6661 return -EINVAL; 6661 return -EINVAL;
6662 6662
6663 /* 6663 /*
6664 * level is always positive, so don't check for 6664 * level is always positive, so don't check for
6665 * level < POWERSAVINGS_BALANCE_NONE, which is 0. 6665 * level < POWERSAVINGS_BALANCE_NONE, which is 0.
6666 * What happens on a 0- or 1-byte write? Do we 6666 * What happens on a 0- or 1-byte write? Do we
6667 * need to check count as well? 6667 * need to check count as well?
6668 */ 6668 */
6669 6669
6670 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) 6670 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
6671 return -EINVAL; 6671 return -EINVAL;
6672 6672
6673 if (smt) 6673 if (smt)
6674 sched_smt_power_savings = level; 6674 sched_smt_power_savings = level;
6675 else 6675 else
6676 sched_mc_power_savings = level; 6676 sched_mc_power_savings = level;
6677 6677
6678 reinit_sched_domains(); 6678 reinit_sched_domains();
6679 6679
6680 return count; 6680 return count;
6681 } 6681 }
6682 6682
6683 #ifdef CONFIG_SCHED_MC 6683 #ifdef CONFIG_SCHED_MC
6684 static ssize_t sched_mc_power_savings_show(struct device *dev, 6684 static ssize_t sched_mc_power_savings_show(struct device *dev,
6685 struct device_attribute *attr, 6685 struct device_attribute *attr,
6686 char *buf) 6686 char *buf)
6687 { 6687 {
6688 return sprintf(buf, "%u\n", sched_mc_power_savings); 6688 return sprintf(buf, "%u\n", sched_mc_power_savings);
6689 } 6689 }
6690 static ssize_t sched_mc_power_savings_store(struct device *dev, 6690 static ssize_t sched_mc_power_savings_store(struct device *dev,
6691 struct device_attribute *attr, 6691 struct device_attribute *attr,
6692 const char *buf, size_t count) 6692 const char *buf, size_t count)
6693 { 6693 {
6694 return sched_power_savings_store(buf, count, 0); 6694 return sched_power_savings_store(buf, count, 0);
6695 } 6695 }
6696 static DEVICE_ATTR(sched_mc_power_savings, 0644, 6696 static DEVICE_ATTR(sched_mc_power_savings, 0644,
6697 sched_mc_power_savings_show, 6697 sched_mc_power_savings_show,
6698 sched_mc_power_savings_store); 6698 sched_mc_power_savings_store);
6699 #endif 6699 #endif
6700 6700
6701 #ifdef CONFIG_SCHED_SMT 6701 #ifdef CONFIG_SCHED_SMT
6702 static ssize_t sched_smt_power_savings_show(struct device *dev, 6702 static ssize_t sched_smt_power_savings_show(struct device *dev,
6703 struct device_attribute *attr, 6703 struct device_attribute *attr,
6704 char *buf) 6704 char *buf)
6705 { 6705 {
6706 return sprintf(buf, "%u\n", sched_smt_power_savings); 6706 return sprintf(buf, "%u\n", sched_smt_power_savings);
6707 } 6707 }
6708 static ssize_t sched_smt_power_savings_store(struct device *dev, 6708 static ssize_t sched_smt_power_savings_store(struct device *dev,
6709 struct device_attribute *attr, 6709 struct device_attribute *attr,
6710 const char *buf, size_t count) 6710 const char *buf, size_t count)
6711 { 6711 {
6712 return sched_power_savings_store(buf, count, 1); 6712 return sched_power_savings_store(buf, count, 1);
6713 } 6713 }
6714 static DEVICE_ATTR(sched_smt_power_savings, 0644, 6714 static DEVICE_ATTR(sched_smt_power_savings, 0644,
6715 sched_smt_power_savings_show, 6715 sched_smt_power_savings_show,
6716 sched_smt_power_savings_store); 6716 sched_smt_power_savings_store);
6717 #endif 6717 #endif
6718 6718
6719 int __init sched_create_sysfs_power_savings_entries(struct device *dev) 6719 int __init sched_create_sysfs_power_savings_entries(struct device *dev)
6720 { 6720 {
6721 int err = 0; 6721 int err = 0;
6722 6722
6723 #ifdef CONFIG_SCHED_SMT 6723 #ifdef CONFIG_SCHED_SMT
6724 if (smt_capable()) 6724 if (smt_capable())
6725 err = device_create_file(dev, &dev_attr_sched_smt_power_savings); 6725 err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
6726 #endif 6726 #endif
6727 #ifdef CONFIG_SCHED_MC 6727 #ifdef CONFIG_SCHED_MC
6728 if (!err && mc_capable()) 6728 if (!err && mc_capable())
6729 err = device_create_file(dev, &dev_attr_sched_mc_power_savings); 6729 err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
6730 #endif 6730 #endif
6731 return err; 6731 return err;
6732 } 6732 }
6733 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 6733 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
6734 6734
6735 /* 6735 /*
6736 * Update cpusets according to cpu_active mask. If cpusets are 6736 * Update cpusets according to cpu_active mask. If cpusets are
6737 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 6737 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
6738 * around partition_sched_domains(). 6738 * around partition_sched_domains().
6739 */ 6739 */
6740 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, 6740 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
6741 void *hcpu) 6741 void *hcpu)
6742 { 6742 {
6743 switch (action & ~CPU_TASKS_FROZEN) { 6743 switch (action & ~CPU_TASKS_FROZEN) {
6744 case CPU_ONLINE: 6744 case CPU_ONLINE:
6745 case CPU_DOWN_FAILED: 6745 case CPU_DOWN_FAILED:
6746 cpuset_update_active_cpus(); 6746 cpuset_update_active_cpus();
6747 return NOTIFY_OK; 6747 return NOTIFY_OK;
6748 default: 6748 default:
6749 return NOTIFY_DONE; 6749 return NOTIFY_DONE;
6750 } 6750 }
6751 } 6751 }
6752 6752
6753 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 6753 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
6754 void *hcpu) 6754 void *hcpu)
6755 { 6755 {
6756 switch (action & ~CPU_TASKS_FROZEN) { 6756 switch (action & ~CPU_TASKS_FROZEN) {
6757 case CPU_DOWN_PREPARE: 6757 case CPU_DOWN_PREPARE:
6758 cpuset_update_active_cpus(); 6758 cpuset_update_active_cpus();
6759 return NOTIFY_OK; 6759 return NOTIFY_OK;
6760 default: 6760 default:
6761 return NOTIFY_DONE; 6761 return NOTIFY_DONE;
6762 } 6762 }
6763 } 6763 }
6764 6764
6765 void __init sched_init_smp(void) 6765 void __init sched_init_smp(void)
6766 { 6766 {
6767 cpumask_var_t non_isolated_cpus; 6767 cpumask_var_t non_isolated_cpus;
6768 6768
6769 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 6769 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6770 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 6770 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6771 6771
6772 get_online_cpus(); 6772 get_online_cpus();
6773 mutex_lock(&sched_domains_mutex); 6773 mutex_lock(&sched_domains_mutex);
6774 init_sched_domains(cpu_active_mask); 6774 init_sched_domains(cpu_active_mask);
6775 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 6775 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6776 if (cpumask_empty(non_isolated_cpus)) 6776 if (cpumask_empty(non_isolated_cpus))
6777 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 6777 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6778 mutex_unlock(&sched_domains_mutex); 6778 mutex_unlock(&sched_domains_mutex);
6779 put_online_cpus(); 6779 put_online_cpus();
6780 6780
6781 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6781 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
6782 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 6782 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
6783 6783
6784 /* RT runtime code needs to handle some hotplug events */ 6784 /* RT runtime code needs to handle some hotplug events */
6785 hotcpu_notifier(update_runtime, 0); 6785 hotcpu_notifier(update_runtime, 0);
6786 6786
6787 init_hrtick(); 6787 init_hrtick();
6788 6788
6789 /* Move init over to a non-isolated CPU */ 6789 /* Move init over to a non-isolated CPU */
6790 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 6790 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
6791 BUG(); 6791 BUG();
6792 sched_init_granularity(); 6792 sched_init_granularity();
6793 free_cpumask_var(non_isolated_cpus); 6793 free_cpumask_var(non_isolated_cpus);
6794 6794
6795 init_sched_rt_class(); 6795 init_sched_rt_class();
6796 } 6796 }
6797 #else 6797 #else
6798 void __init sched_init_smp(void) 6798 void __init sched_init_smp(void)
6799 { 6799 {
6800 sched_init_granularity(); 6800 sched_init_granularity();
6801 } 6801 }
6802 #endif /* CONFIG_SMP */ 6802 #endif /* CONFIG_SMP */
6803 6803
6804 const_debug unsigned int sysctl_timer_migration = 1; 6804 const_debug unsigned int sysctl_timer_migration = 1;
6805 6805
6806 int in_sched_functions(unsigned long addr) 6806 int in_sched_functions(unsigned long addr)
6807 { 6807 {
6808 return in_lock_functions(addr) || 6808 return in_lock_functions(addr) ||
6809 (addr >= (unsigned long)__sched_text_start 6809 (addr >= (unsigned long)__sched_text_start
6810 && addr < (unsigned long)__sched_text_end); 6810 && addr < (unsigned long)__sched_text_end);
6811 } 6811 }
6812 6812
6813 #ifdef CONFIG_CGROUP_SCHED 6813 #ifdef CONFIG_CGROUP_SCHED
6814 struct task_group root_task_group; 6814 struct task_group root_task_group;
6815 #endif 6815 #endif
6816 6816
6817 DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 6817 DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
6818 6818
6819 void __init sched_init(void) 6819 void __init sched_init(void)
6820 { 6820 {
6821 int i, j; 6821 int i, j;
6822 unsigned long alloc_size = 0, ptr; 6822 unsigned long alloc_size = 0, ptr;
6823 6823
6824 #ifdef CONFIG_FAIR_GROUP_SCHED 6824 #ifdef CONFIG_FAIR_GROUP_SCHED
6825 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 6825 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6826 #endif 6826 #endif
6827 #ifdef CONFIG_RT_GROUP_SCHED 6827 #ifdef CONFIG_RT_GROUP_SCHED
6828 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 6828 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6829 #endif 6829 #endif
6830 #ifdef CONFIG_CPUMASK_OFFSTACK 6830 #ifdef CONFIG_CPUMASK_OFFSTACK
6831 alloc_size += num_possible_cpus() * cpumask_size(); 6831 alloc_size += num_possible_cpus() * cpumask_size();
6832 #endif 6832 #endif
6833 if (alloc_size) { 6833 if (alloc_size) {
6834 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 6834 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
6835 6835
6836 #ifdef CONFIG_FAIR_GROUP_SCHED 6836 #ifdef CONFIG_FAIR_GROUP_SCHED
6837 root_task_group.se = (struct sched_entity **)ptr; 6837 root_task_group.se = (struct sched_entity **)ptr;
6838 ptr += nr_cpu_ids * sizeof(void **); 6838 ptr += nr_cpu_ids * sizeof(void **);
6839 6839
6840 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 6840 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
6841 ptr += nr_cpu_ids * sizeof(void **); 6841 ptr += nr_cpu_ids * sizeof(void **);
6842 6842
6843 #endif /* CONFIG_FAIR_GROUP_SCHED */ 6843 #endif /* CONFIG_FAIR_GROUP_SCHED */
6844 #ifdef CONFIG_RT_GROUP_SCHED 6844 #ifdef CONFIG_RT_GROUP_SCHED
6845 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 6845 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
6846 ptr += nr_cpu_ids * sizeof(void **); 6846 ptr += nr_cpu_ids * sizeof(void **);
6847 6847
6848 root_task_group.rt_rq = (struct rt_rq **)ptr; 6848 root_task_group.rt_rq = (struct rt_rq **)ptr;
6849 ptr += nr_cpu_ids * sizeof(void **); 6849 ptr += nr_cpu_ids * sizeof(void **);
6850 6850
6851 #endif /* CONFIG_RT_GROUP_SCHED */ 6851 #endif /* CONFIG_RT_GROUP_SCHED */
6852 #ifdef CONFIG_CPUMASK_OFFSTACK 6852 #ifdef CONFIG_CPUMASK_OFFSTACK
6853 for_each_possible_cpu(i) { 6853 for_each_possible_cpu(i) {
6854 per_cpu(load_balance_tmpmask, i) = (void *)ptr; 6854 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
6855 ptr += cpumask_size(); 6855 ptr += cpumask_size();
6856 } 6856 }
6857 #endif /* CONFIG_CPUMASK_OFFSTACK */ 6857 #endif /* CONFIG_CPUMASK_OFFSTACK */
6858 } 6858 }
6859 6859
6860 #ifdef CONFIG_SMP 6860 #ifdef CONFIG_SMP
6861 init_defrootdomain(); 6861 init_defrootdomain();
6862 #endif 6862 #endif
6863 6863
6864 init_rt_bandwidth(&def_rt_bandwidth, 6864 init_rt_bandwidth(&def_rt_bandwidth,
6865 global_rt_period(), global_rt_runtime()); 6865 global_rt_period(), global_rt_runtime());
6866 6866
6867 #ifdef CONFIG_RT_GROUP_SCHED 6867 #ifdef CONFIG_RT_GROUP_SCHED
6868 init_rt_bandwidth(&root_task_group.rt_bandwidth, 6868 init_rt_bandwidth(&root_task_group.rt_bandwidth,
6869 global_rt_period(), global_rt_runtime()); 6869 global_rt_period(), global_rt_runtime());
6870 #endif /* CONFIG_RT_GROUP_SCHED */ 6870 #endif /* CONFIG_RT_GROUP_SCHED */
6871 6871
6872 #ifdef CONFIG_CGROUP_SCHED 6872 #ifdef CONFIG_CGROUP_SCHED
6873 list_add(&root_task_group.list, &task_groups); 6873 list_add(&root_task_group.list, &task_groups);
6874 INIT_LIST_HEAD(&root_task_group.children); 6874 INIT_LIST_HEAD(&root_task_group.children);
6875 INIT_LIST_HEAD(&root_task_group.siblings); 6875 INIT_LIST_HEAD(&root_task_group.siblings);
6876 autogroup_init(&init_task); 6876 autogroup_init(&init_task);
6877 6877
6878 #endif /* CONFIG_CGROUP_SCHED */ 6878 #endif /* CONFIG_CGROUP_SCHED */
6879 6879
6880 #ifdef CONFIG_CGROUP_CPUACCT 6880 #ifdef CONFIG_CGROUP_CPUACCT
6881 root_cpuacct.cpustat = &kernel_cpustat; 6881 root_cpuacct.cpustat = &kernel_cpustat;
6882 root_cpuacct.cpuusage = alloc_percpu(u64); 6882 root_cpuacct.cpuusage = alloc_percpu(u64);
6883 /* Too early, not expected to fail */ 6883 /* Too early, not expected to fail */
6884 BUG_ON(!root_cpuacct.cpuusage); 6884 BUG_ON(!root_cpuacct.cpuusage);
6885 #endif 6885 #endif
6886 for_each_possible_cpu(i) { 6886 for_each_possible_cpu(i) {
6887 struct rq *rq; 6887 struct rq *rq;
6888 6888
6889 rq = cpu_rq(i); 6889 rq = cpu_rq(i);
6890 raw_spin_lock_init(&rq->lock); 6890 raw_spin_lock_init(&rq->lock);
6891 rq->nr_running = 0; 6891 rq->nr_running = 0;
6892 rq->calc_load_active = 0; 6892 rq->calc_load_active = 0;
6893 rq->calc_load_update = jiffies + LOAD_FREQ; 6893 rq->calc_load_update = jiffies + LOAD_FREQ;
6894 init_cfs_rq(&rq->cfs); 6894 init_cfs_rq(&rq->cfs);
6895 init_rt_rq(&rq->rt, rq); 6895 init_rt_rq(&rq->rt, rq);
6896 #ifdef CONFIG_FAIR_GROUP_SCHED 6896 #ifdef CONFIG_FAIR_GROUP_SCHED
6897 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 6897 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
6898 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6898 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6899 /* 6899 /*
6900 * How much cpu bandwidth does root_task_group get? 6900 * How much cpu bandwidth does root_task_group get?
6901 * 6901 *
6902 * In case of task-groups formed through the cgroup filesystem, it 6902 * In case of task-groups formed through the cgroup filesystem, it
6903 * gets 100% of the cpu resources in the system. This overall 6903 * gets 100% of the cpu resources in the system. This overall
6904 * system cpu resource is divided among the tasks of 6904 * system cpu resource is divided among the tasks of
6905 * root_task_group and its child task-groups in a fair manner, 6905 * root_task_group and its child task-groups in a fair manner,
6906 * based on each entity's (task or task-group's) weight 6906 * based on each entity's (task or task-group's) weight
6907 * (se->load.weight). 6907 * (se->load.weight).
6908 * 6908 *
6909 * In other words, if root_task_group has 10 tasks (each of weight 6909 * In other words, if root_task_group has 10 tasks (each of weight
6910 * 1024) and two child groups A0 and A1 (of weight 1024 each), 6910 * 1024) and two child groups A0 and A1 (of weight 1024 each),
6911 * then A0's share of the cpu resource is: 6911 * then A0's share of the cpu resource is:
6912 * 6912 *
6913 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 6913 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
6914 * 6914 *
6915 * We achieve this by letting root_task_group's tasks sit 6915 * We achieve this by letting root_task_group's tasks sit
6916 * directly in rq->cfs (i.e root_task_group->se[] = NULL). 6916 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
6917 */ 6917 */
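		/*
		 * A worked-through aside on the example above: the total
		 * weight is 10*1024 + 1024 + 1024 = 12288, so A0's bandwidth
		 * is 1024/12288 = 1/12, i.e. roughly 8.33%, matching the
		 * figure quoted.
		 */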
6918 init_cfs_bandwidth(&root_task_group.cfs_bandwidth); 6918 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
6919 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 6919 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
6920 #endif /* CONFIG_FAIR_GROUP_SCHED */ 6920 #endif /* CONFIG_FAIR_GROUP_SCHED */
6921 6921
6922 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 6922 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
6923 #ifdef CONFIG_RT_GROUP_SCHED 6923 #ifdef CONFIG_RT_GROUP_SCHED
6924 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 6924 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
6925 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 6925 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6926 #endif 6926 #endif
6927 6927
6928 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 6928 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6929 rq->cpu_load[j] = 0; 6929 rq->cpu_load[j] = 0;
6930 6930
6931 rq->last_load_update_tick = jiffies; 6931 rq->last_load_update_tick = jiffies;
6932 6932
6933 #ifdef CONFIG_SMP 6933 #ifdef CONFIG_SMP
6934 rq->sd = NULL; 6934 rq->sd = NULL;
6935 rq->rd = NULL; 6935 rq->rd = NULL;
6936 rq->cpu_power = SCHED_POWER_SCALE; 6936 rq->cpu_power = SCHED_POWER_SCALE;
6937 rq->post_schedule = 0; 6937 rq->post_schedule = 0;
6938 rq->active_balance = 0; 6938 rq->active_balance = 0;
6939 rq->next_balance = jiffies; 6939 rq->next_balance = jiffies;
6940 rq->push_cpu = 0; 6940 rq->push_cpu = 0;
6941 rq->cpu = i; 6941 rq->cpu = i;
6942 rq->online = 0; 6942 rq->online = 0;
6943 rq->idle_stamp = 0; 6943 rq->idle_stamp = 0;
6944 rq->avg_idle = 2*sysctl_sched_migration_cost; 6944 rq->avg_idle = 2*sysctl_sched_migration_cost;
6945 rq_attach_root(rq, &def_root_domain); 6945 rq_attach_root(rq, &def_root_domain);
6946 #ifdef CONFIG_NO_HZ 6946 #ifdef CONFIG_NO_HZ
6947 rq->nohz_flags = 0; 6947 rq->nohz_flags = 0;
6948 #endif 6948 #endif
6949 #endif 6949 #endif
6950 init_rq_hrtick(rq); 6950 init_rq_hrtick(rq);
6951 atomic_set(&rq->nr_iowait, 0); 6951 atomic_set(&rq->nr_iowait, 0);
6952 } 6952 }
6953 6953
6954 set_load_weight(&init_task); 6954 set_load_weight(&init_task);
6955 6955
6956 #ifdef CONFIG_PREEMPT_NOTIFIERS 6956 #ifdef CONFIG_PREEMPT_NOTIFIERS
6957 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6957 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6958 #endif 6958 #endif
6959 6959
6960 #ifdef CONFIG_RT_MUTEXES 6960 #ifdef CONFIG_RT_MUTEXES
6961 plist_head_init(&init_task.pi_waiters); 6961 plist_head_init(&init_task.pi_waiters);
6962 #endif 6962 #endif
6963 6963
6964 /* 6964 /*
6965 * The boot idle thread does lazy MMU switching as well: 6965 * The boot idle thread does lazy MMU switching as well:
6966 */ 6966 */
6967 atomic_inc(&init_mm.mm_count); 6967 atomic_inc(&init_mm.mm_count);
6968 enter_lazy_tlb(&init_mm, current); 6968 enter_lazy_tlb(&init_mm, current);
6969 6969
6970 /* 6970 /*
6971 * Make us the idle thread. Technically, schedule() should not be 6971 * Make us the idle thread. Technically, schedule() should not be
6972 * called from this thread; however, somewhere below it might be, 6972 * called from this thread; however, somewhere below it might be,
6973 * but because we are the idle thread, we just pick up running again 6973 * but because we are the idle thread, we just pick up running again
6974 * when this runqueue becomes "idle". 6974 * when this runqueue becomes "idle".
6975 */ 6975 */
6976 init_idle(current, smp_processor_id()); 6976 init_idle(current, smp_processor_id());
6977 6977
6978 calc_load_update = jiffies + LOAD_FREQ; 6978 calc_load_update = jiffies + LOAD_FREQ;
6979 6979
6980 /* 6980 /*
6981 * During early bootup we pretend to be a normal task: 6981 * During early bootup we pretend to be a normal task:
6982 */ 6982 */
6983 current->sched_class = &fair_sched_class; 6983 current->sched_class = &fair_sched_class;
6984 6984
6985 #ifdef CONFIG_SMP 6985 #ifdef CONFIG_SMP
6986 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 6986 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
6987 /* May be allocated at isolcpus cmdline parse time */ 6987 /* May be allocated at isolcpus cmdline parse time */
6988 if (cpu_isolated_map == NULL) 6988 if (cpu_isolated_map == NULL)
6989 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 6989 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
6990 #endif 6990 #endif
6991 init_sched_fair_class(); 6991 init_sched_fair_class();
6992 6992
6993 scheduler_running = 1; 6993 scheduler_running = 1;
6994 } 6994 }
6995 6995
6996 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 6996 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
6997 static inline int preempt_count_equals(int preempt_offset) 6997 static inline int preempt_count_equals(int preempt_offset)
6998 { 6998 {
6999 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 6999 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
7000 7000
7001 return (nested == preempt_offset); 7001 return (nested == preempt_offset);
7002 } 7002 }
7003 7003
7004 void __might_sleep(const char *file, int line, int preempt_offset) 7004 void __might_sleep(const char *file, int line, int preempt_offset)
7005 { 7005 {
7006 static unsigned long prev_jiffy; /* ratelimiting */ 7006 static unsigned long prev_jiffy; /* ratelimiting */
7007 7007
7008 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ 7008 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
7009 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 7009 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
7010 system_state != SYSTEM_RUNNING || oops_in_progress) 7010 system_state != SYSTEM_RUNNING || oops_in_progress)
7011 return; 7011 return;
7012 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 7012 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7013 return; 7013 return;
7014 prev_jiffy = jiffies; 7014 prev_jiffy = jiffies;
7015 7015
7016 printk(KERN_ERR 7016 printk(KERN_ERR
7017 "BUG: sleeping function called from invalid context at %s:%d\n", 7017 "BUG: sleeping function called from invalid context at %s:%d\n",
7018 file, line); 7018 file, line);
7019 printk(KERN_ERR 7019 printk(KERN_ERR
7020 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 7020 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7021 in_atomic(), irqs_disabled(), 7021 in_atomic(), irqs_disabled(),
7022 current->pid, current->comm); 7022 current->pid, current->comm);
7023 7023
7024 debug_show_held_locks(current); 7024 debug_show_held_locks(current);
7025 if (irqs_disabled()) 7025 if (irqs_disabled())
7026 print_irqtrace_events(current); 7026 print_irqtrace_events(current);
7027 dump_stack(); 7027 dump_stack();
7028 } 7028 }
7029 EXPORT_SYMBOL(__might_sleep); 7029 EXPORT_SYMBOL(__might_sleep);
7030 #endif 7030 #endif
7031 7031
7032 #ifdef CONFIG_MAGIC_SYSRQ 7032 #ifdef CONFIG_MAGIC_SYSRQ
7033 static void normalize_task(struct rq *rq, struct task_struct *p) 7033 static void normalize_task(struct rq *rq, struct task_struct *p)
7034 { 7034 {
7035 const struct sched_class *prev_class = p->sched_class; 7035 const struct sched_class *prev_class = p->sched_class;
7036 int old_prio = p->prio; 7036 int old_prio = p->prio;
7037 int on_rq; 7037 int on_rq;
7038 7038
7039 on_rq = p->on_rq; 7039 on_rq = p->on_rq;
7040 if (on_rq) 7040 if (on_rq)
7041 dequeue_task(rq, p, 0); 7041 dequeue_task(rq, p, 0);
7042 __setscheduler(rq, p, SCHED_NORMAL, 0); 7042 __setscheduler(rq, p, SCHED_NORMAL, 0);
7043 if (on_rq) { 7043 if (on_rq) {
7044 enqueue_task(rq, p, 0); 7044 enqueue_task(rq, p, 0);
7045 resched_task(rq->curr); 7045 resched_task(rq->curr);
7046 } 7046 }
7047 7047
7048 check_class_changed(rq, p, prev_class, old_prio); 7048 check_class_changed(rq, p, prev_class, old_prio);
7049 } 7049 }
7050 7050
7051 void normalize_rt_tasks(void) 7051 void normalize_rt_tasks(void)
7052 { 7052 {
7053 struct task_struct *g, *p; 7053 struct task_struct *g, *p;
7054 unsigned long flags; 7054 unsigned long flags;
7055 struct rq *rq; 7055 struct rq *rq;
7056 7056
7057 read_lock_irqsave(&tasklist_lock, flags); 7057 read_lock_irqsave(&tasklist_lock, flags);
7058 do_each_thread(g, p) { 7058 do_each_thread(g, p) {
7059 /* 7059 /*
7060 * Only normalize user tasks: 7060 * Only normalize user tasks:
7061 */ 7061 */
7062 if (!p->mm) 7062 if (!p->mm)
7063 continue; 7063 continue;
7064 7064
7065 p->se.exec_start = 0; 7065 p->se.exec_start = 0;
7066 #ifdef CONFIG_SCHEDSTATS 7066 #ifdef CONFIG_SCHEDSTATS
7067 p->se.statistics.wait_start = 0; 7067 p->se.statistics.wait_start = 0;
7068 p->se.statistics.sleep_start = 0; 7068 p->se.statistics.sleep_start = 0;
7069 p->se.statistics.block_start = 0; 7069 p->se.statistics.block_start = 0;
7070 #endif 7070 #endif
7071 7071
7072 if (!rt_task(p)) { 7072 if (!rt_task(p)) {
7073 /* 7073 /*
7074 * Renice negative nice level userspace 7074 * Renice negative nice level userspace
7075 * tasks back to 0: 7075 * tasks back to 0:
7076 */ 7076 */
7077 if (TASK_NICE(p) < 0 && p->mm) 7077 if (TASK_NICE(p) < 0 && p->mm)
7078 set_user_nice(p, 0); 7078 set_user_nice(p, 0);
7079 continue; 7079 continue;
7080 } 7080 }
7081 7081
7082 raw_spin_lock(&p->pi_lock); 7082 raw_spin_lock(&p->pi_lock);
7083 rq = __task_rq_lock(p); 7083 rq = __task_rq_lock(p);
7084 7084
7085 normalize_task(rq, p); 7085 normalize_task(rq, p);
7086 7086
7087 __task_rq_unlock(rq); 7087 __task_rq_unlock(rq);
7088 raw_spin_unlock(&p->pi_lock); 7088 raw_spin_unlock(&p->pi_lock);
7089 } while_each_thread(g, p); 7089 } while_each_thread(g, p);
7090 7090
7091 read_unlock_irqrestore(&tasklist_lock, flags); 7091 read_unlock_irqrestore(&tasklist_lock, flags);
7092 } 7092 }
7093 7093
7094 #endif /* CONFIG_MAGIC_SYSRQ */ 7094 #endif /* CONFIG_MAGIC_SYSRQ */
7095 7095
7096 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) 7096 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
7097 /* 7097 /*
7098 * These functions are only useful for the IA64 MCA handling, or kdb. 7098 * These functions are only useful for the IA64 MCA handling, or kdb.
7099 * 7099 *
7100 * They can only be called when the whole system has been 7100 * They can only be called when the whole system has been
7101 * stopped - every CPU needs to be quiescent, and no scheduling 7101 * stopped - every CPU needs to be quiescent, and no scheduling
7102 * activity can take place. Using them for anything else would 7102 * activity can take place. Using them for anything else would
7103 * be a serious bug, and as a result, they aren't even visible 7103 * be a serious bug, and as a result, they aren't even visible
7104 * under any other configuration. 7104 * under any other configuration.
7105 */ 7105 */
7106 7106
7107 /** 7107 /**
7108 * curr_task - return the current task for a given cpu. 7108 * curr_task - return the current task for a given cpu.
7109 * @cpu: the processor in question. 7109 * @cpu: the processor in question.
7110 * 7110 *
7111 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7111 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7112 */ 7112 */
7113 struct task_struct *curr_task(int cpu) 7113 struct task_struct *curr_task(int cpu)
7114 { 7114 {
7115 return cpu_curr(cpu); 7115 return cpu_curr(cpu);
7116 } 7116 }
7117 7117
7118 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ 7118 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
7119 7119
7120 #ifdef CONFIG_IA64 7120 #ifdef CONFIG_IA64
7121 /** 7121 /**
7122 * set_curr_task - set the current task for a given cpu. 7122 * set_curr_task - set the current task for a given cpu.
7123 * @cpu: the processor in question. 7123 * @cpu: the processor in question.
7124 * @p: the task pointer to set. 7124 * @p: the task pointer to set.
7125 * 7125 *
7126 * Description: This function must only be used when non-maskable interrupts 7126 * Description: This function must only be used when non-maskable interrupts
7127 * are serviced on a separate stack. It allows the architecture to switch the 7127 * are serviced on a separate stack. It allows the architecture to switch the
7128 * notion of the current task on a cpu in a non-blocking manner. This function 7128 * notion of the current task on a cpu in a non-blocking manner. This function
7129 * must be called with all CPUs synchronized and interrupts disabled; the 7129 * must be called with all CPUs synchronized and interrupts disabled; the
7130 * caller must save the original value of the current task (see 7130 * caller must save the original value of the current task (see
7131 * curr_task() above) and restore that value before reenabling interrupts and 7131 * curr_task() above) and restore that value before reenabling interrupts and
7132 * re-starting the system. 7132 * re-starting the system.
7133 * 7133 *
7134 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7134 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7135 */ 7135 */
7136 void set_curr_task(int cpu, struct task_struct *p) 7136 void set_curr_task(int cpu, struct task_struct *p)
7137 { 7137 {
7138 cpu_curr(cpu) = p; 7138 cpu_curr(cpu) = p;
7139 } 7139 }
7140 7140
7141 #endif 7141 #endif
7142 7142
7143 #ifdef CONFIG_CGROUP_SCHED 7143 #ifdef CONFIG_CGROUP_SCHED
7144 /* task_group_lock serializes the addition/removal of task groups */ 7144 /* task_group_lock serializes the addition/removal of task groups */
7145 static DEFINE_SPINLOCK(task_group_lock); 7145 static DEFINE_SPINLOCK(task_group_lock);
7146 7146
7147 static void free_sched_group(struct task_group *tg) 7147 static void free_sched_group(struct task_group *tg)
7148 { 7148 {
7149 free_fair_sched_group(tg); 7149 free_fair_sched_group(tg);
7150 free_rt_sched_group(tg); 7150 free_rt_sched_group(tg);
7151 autogroup_free(tg); 7151 autogroup_free(tg);
7152 kfree(tg); 7152 kfree(tg);
7153 } 7153 }
7154 7154
7155 /* allocate runqueue etc for a new task group */ 7155 /* allocate runqueue etc for a new task group */
7156 struct task_group *sched_create_group(struct task_group *parent) 7156 struct task_group *sched_create_group(struct task_group *parent)
7157 { 7157 {
7158 struct task_group *tg; 7158 struct task_group *tg;
7159 unsigned long flags; 7159 unsigned long flags;
7160 7160
7161 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 7161 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7162 if (!tg) 7162 if (!tg)
7163 return ERR_PTR(-ENOMEM); 7163 return ERR_PTR(-ENOMEM);
7164 7164
7165 if (!alloc_fair_sched_group(tg, parent)) 7165 if (!alloc_fair_sched_group(tg, parent))
7166 goto err; 7166 goto err;
7167 7167
7168 if (!alloc_rt_sched_group(tg, parent)) 7168 if (!alloc_rt_sched_group(tg, parent))
7169 goto err; 7169 goto err;
7170 7170
7171 spin_lock_irqsave(&task_group_lock, flags); 7171 spin_lock_irqsave(&task_group_lock, flags);
7172 list_add_rcu(&tg->list, &task_groups); 7172 list_add_rcu(&tg->list, &task_groups);
7173 7173
7174 WARN_ON(!parent); /* root should already exist */ 7174 WARN_ON(!parent); /* root should already exist */
7175 7175
7176 tg->parent = parent; 7176 tg->parent = parent;
7177 INIT_LIST_HEAD(&tg->children); 7177 INIT_LIST_HEAD(&tg->children);
7178 list_add_rcu(&tg->siblings, &parent->children); 7178 list_add_rcu(&tg->siblings, &parent->children);
7179 spin_unlock_irqrestore(&task_group_lock, flags); 7179 spin_unlock_irqrestore(&task_group_lock, flags);
7180 7180
7181 return tg; 7181 return tg;
7182 7182
7183 err: 7183 err:
7184 free_sched_group(tg); 7184 free_sched_group(tg);
7185 return ERR_PTR(-ENOMEM); 7185 return ERR_PTR(-ENOMEM);
7186 } 7186 }
7187 7187
7188 /* rcu callback to free various structures associated with a task group */ 7188 /* rcu callback to free various structures associated with a task group */
7189 static void free_sched_group_rcu(struct rcu_head *rhp) 7189 static void free_sched_group_rcu(struct rcu_head *rhp)
7190 { 7190 {
7191 /* now it should be safe to free those cfs_rqs */ 7191 /* now it should be safe to free those cfs_rqs */
7192 free_sched_group(container_of(rhp, struct task_group, rcu)); 7192 free_sched_group(container_of(rhp, struct task_group, rcu));
7193 } 7193 }
7194 7194
7195 /* Destroy runqueue etc associated with a task group */ 7195 /* Destroy runqueue etc associated with a task group */
7196 void sched_destroy_group(struct task_group *tg) 7196 void sched_destroy_group(struct task_group *tg)
7197 { 7197 {
7198 unsigned long flags; 7198 unsigned long flags;
7199 int i; 7199 int i;
7200 7200
7201 /* end participation in shares distribution */ 7201 /* end participation in shares distribution */
7202 for_each_possible_cpu(i) 7202 for_each_possible_cpu(i)
7203 unregister_fair_sched_group(tg, i); 7203 unregister_fair_sched_group(tg, i);
7204 7204
7205 spin_lock_irqsave(&task_group_lock, flags); 7205 spin_lock_irqsave(&task_group_lock, flags);
7206 list_del_rcu(&tg->list); 7206 list_del_rcu(&tg->list);
7207 list_del_rcu(&tg->siblings); 7207 list_del_rcu(&tg->siblings);
7208 spin_unlock_irqrestore(&task_group_lock, flags); 7208 spin_unlock_irqrestore(&task_group_lock, flags);
7209 7209
7210 /* wait for possible concurrent references to cfs_rqs to complete */ 7210 /* wait for possible concurrent references to cfs_rqs to complete */
7211 call_rcu(&tg->rcu, free_sched_group_rcu); 7211 call_rcu(&tg->rcu, free_sched_group_rcu);
7212 } 7212 }
7213 7213
7214 /* change task's runqueue when it moves between groups. 7214 /* change task's runqueue when it moves between groups.
7215 * The caller of this function should have put the task in its new group 7215 * The caller of this function should have put the task in its new group
7216 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 7216 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
7217 * reflect its new group. 7217 * reflect its new group.
7218 */ 7218 */
7219 void sched_move_task(struct task_struct *tsk) 7219 void sched_move_task(struct task_struct *tsk)
7220 { 7220 {
7221 int on_rq, running; 7221 int on_rq, running;
7222 unsigned long flags; 7222 unsigned long flags;
7223 struct rq *rq; 7223 struct rq *rq;
7224 7224
7225 rq = task_rq_lock(tsk, &flags); 7225 rq = task_rq_lock(tsk, &flags);
7226 7226
7227 running = task_current(rq, tsk); 7227 running = task_current(rq, tsk);
7228 on_rq = tsk->on_rq; 7228 on_rq = tsk->on_rq;
7229 7229
7230 if (on_rq) 7230 if (on_rq)
7231 dequeue_task(rq, tsk, 0); 7231 dequeue_task(rq, tsk, 0);
7232 if (unlikely(running)) 7232 if (unlikely(running))
7233 tsk->sched_class->put_prev_task(rq, tsk); 7233 tsk->sched_class->put_prev_task(rq, tsk);
7234 7234
7235 #ifdef CONFIG_FAIR_GROUP_SCHED 7235 #ifdef CONFIG_FAIR_GROUP_SCHED
7236 if (tsk->sched_class->task_move_group) 7236 if (tsk->sched_class->task_move_group)
7237 tsk->sched_class->task_move_group(tsk, on_rq); 7237 tsk->sched_class->task_move_group(tsk, on_rq);
7238 else 7238 else
7239 #endif 7239 #endif
7240 set_task_rq(tsk, task_cpu(tsk)); 7240 set_task_rq(tsk, task_cpu(tsk));
7241 7241
7242 if (unlikely(running)) 7242 if (unlikely(running))
7243 tsk->sched_class->set_curr_task(rq); 7243 tsk->sched_class->set_curr_task(rq);
7244 if (on_rq) 7244 if (on_rq)
7245 enqueue_task(rq, tsk, 0); 7245 enqueue_task(rq, tsk, 0);
7246 7246
7247 task_rq_unlock(rq, tsk, &flags); 7247 task_rq_unlock(rq, tsk, &flags);
7248 } 7248 }
7249 #endif /* CONFIG_CGROUP_SCHED */ 7249 #endif /* CONFIG_CGROUP_SCHED */
7250 7250
7251 #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) 7251 #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
7252 static unsigned long to_ratio(u64 period, u64 runtime) 7252 static unsigned long to_ratio(u64 period, u64 runtime)
7253 { 7253 {
7254 if (runtime == RUNTIME_INF) 7254 if (runtime == RUNTIME_INF)
7255 return 1ULL << 20; 7255 return 1ULL << 20;
7256 7256
7257 return div64_u64(runtime << 20, period); 7257 return div64_u64(runtime << 20, period);
7258 } 7258 }
7259 #endif 7259 #endif
7260 7260
7261 #ifdef CONFIG_RT_GROUP_SCHED 7261 #ifdef CONFIG_RT_GROUP_SCHED
7262 /* 7262 /*
7263 * Ensure that the real time constraints are schedulable. 7263 * Ensure that the real time constraints are schedulable.
7264 */ 7264 */
7265 static DEFINE_MUTEX(rt_constraints_mutex); 7265 static DEFINE_MUTEX(rt_constraints_mutex);
7266 7266
7267 /* Must be called with tasklist_lock held */ 7267 /* Must be called with tasklist_lock held */
7268 static inline int tg_has_rt_tasks(struct task_group *tg) 7268 static inline int tg_has_rt_tasks(struct task_group *tg)
7269 { 7269 {
7270 struct task_struct *g, *p; 7270 struct task_struct *g, *p;
7271 7271
7272 do_each_thread(g, p) { 7272 do_each_thread(g, p) {
7273 if (rt_task(p) && task_rq(p)->rt.tg == tg) 7273 if (rt_task(p) && task_rq(p)->rt.tg == tg)
7274 return 1; 7274 return 1;
7275 } while_each_thread(g, p); 7275 } while_each_thread(g, p);
7276 7276
7277 return 0; 7277 return 0;
7278 } 7278 }
7279 7279
7280 struct rt_schedulable_data { 7280 struct rt_schedulable_data {
7281 struct task_group *tg; 7281 struct task_group *tg;
7282 u64 rt_period; 7282 u64 rt_period;
7283 u64 rt_runtime; 7283 u64 rt_runtime;
7284 }; 7284 };
7285 7285
7286 static int tg_rt_schedulable(struct task_group *tg, void *data) 7286 static int tg_rt_schedulable(struct task_group *tg, void *data)
7287 { 7287 {
7288 struct rt_schedulable_data *d = data; 7288 struct rt_schedulable_data *d = data;
7289 struct task_group *child; 7289 struct task_group *child;
7290 unsigned long total, sum = 0; 7290 unsigned long total, sum = 0;
7291 u64 period, runtime; 7291 u64 period, runtime;
7292 7292
7293 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 7293 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7294 runtime = tg->rt_bandwidth.rt_runtime; 7294 runtime = tg->rt_bandwidth.rt_runtime;
7295 7295
7296 if (tg == d->tg) { 7296 if (tg == d->tg) {
7297 period = d->rt_period; 7297 period = d->rt_period;
7298 runtime = d->rt_runtime; 7298 runtime = d->rt_runtime;
7299 } 7299 }
7300 7300
7301 /* 7301 /*
7302 * Cannot have more runtime than the period. 7302 * Cannot have more runtime than the period.
7303 */ 7303 */
7304 if (runtime > period && runtime != RUNTIME_INF) 7304 if (runtime > period && runtime != RUNTIME_INF)
7305 return -EINVAL; 7305 return -EINVAL;
7306 7306
7307 /* 7307 /*
7308 * Ensure we don't starve existing RT tasks. 7308 * Ensure we don't starve existing RT tasks.
7309 */ 7309 */
7310 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 7310 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
7311 return -EBUSY; 7311 return -EBUSY;
7312 7312
7313 total = to_ratio(period, runtime); 7313 total = to_ratio(period, runtime);
7314 7314
7315 /* 7315 /*
7316 * Nobody can have more than the global setting allows. 7316 * Nobody can have more than the global setting allows.
7317 */ 7317 */
7318 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 7318 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
7319 return -EINVAL; 7319 return -EINVAL;
7320 7320
7321 /* 7321 /*
7322 * The sum of our children's runtime should not exceed our own. 7322 * The sum of our children's runtime should not exceed our own.
7323 */ 7323 */
7324 list_for_each_entry_rcu(child, &tg->children, siblings) { 7324 list_for_each_entry_rcu(child, &tg->children, siblings) {
7325 period = ktime_to_ns(child->rt_bandwidth.rt_period); 7325 period = ktime_to_ns(child->rt_bandwidth.rt_period);
7326 runtime = child->rt_bandwidth.rt_runtime; 7326 runtime = child->rt_bandwidth.rt_runtime;
7327 7327
7328 if (child == d->tg) { 7328 if (child == d->tg) {
7329 period = d->rt_period; 7329 period = d->rt_period;
7330 runtime = d->rt_runtime; 7330 runtime = d->rt_runtime;
7331 } 7331 }
7332 7332
7333 sum += to_ratio(period, runtime); 7333 sum += to_ratio(period, runtime);
7334 } 7334 }
7335 7335
7336 if (sum > total) 7336 if (sum > total)
7337 return -EINVAL; 7337 return -EINVAL;
7338 7338
7339 return 0; 7339 return 0;
7340 } 7340 }
7341 7341
7342 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 7342 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7343 { 7343 {
7344 int ret; 7344 int ret;
7345 7345
7346 struct rt_schedulable_data data = { 7346 struct rt_schedulable_data data = {
7347 .tg = tg, 7347 .tg = tg,
7348 .rt_period = period, 7348 .rt_period = period,
7349 .rt_runtime = runtime, 7349 .rt_runtime = runtime,
7350 }; 7350 };
7351 7351
7352 rcu_read_lock(); 7352 rcu_read_lock();
7353 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); 7353 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
7354 rcu_read_unlock(); 7354 rcu_read_unlock();
7355 7355
7356 return ret; 7356 return ret;
7357 } 7357 }
7358 7358
7359 static int tg_set_rt_bandwidth(struct task_group *tg, 7359 static int tg_set_rt_bandwidth(struct task_group *tg,
7360 u64 rt_period, u64 rt_runtime) 7360 u64 rt_period, u64 rt_runtime)
7361 { 7361 {
7362 int i, err = 0; 7362 int i, err = 0;
7363 7363
7364 mutex_lock(&rt_constraints_mutex); 7364 mutex_lock(&rt_constraints_mutex);
7365 read_lock(&tasklist_lock); 7365 read_lock(&tasklist_lock);
7366 err = __rt_schedulable(tg, rt_period, rt_runtime); 7366 err = __rt_schedulable(tg, rt_period, rt_runtime);
7367 if (err) 7367 if (err)
7368 goto unlock; 7368 goto unlock;
7369 7369
7370 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 7370 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7371 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 7371 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
7372 tg->rt_bandwidth.rt_runtime = rt_runtime; 7372 tg->rt_bandwidth.rt_runtime = rt_runtime;
7373 7373
7374 for_each_possible_cpu(i) { 7374 for_each_possible_cpu(i) {
7375 struct rt_rq *rt_rq = tg->rt_rq[i]; 7375 struct rt_rq *rt_rq = tg->rt_rq[i];
7376 7376
7377 raw_spin_lock(&rt_rq->rt_runtime_lock); 7377 raw_spin_lock(&rt_rq->rt_runtime_lock);
7378 rt_rq->rt_runtime = rt_runtime; 7378 rt_rq->rt_runtime = rt_runtime;
7379 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7379 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7380 } 7380 }
7381 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 7381 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7382 unlock: 7382 unlock:
7383 read_unlock(&tasklist_lock); 7383 read_unlock(&tasklist_lock);
7384 mutex_unlock(&rt_constraints_mutex); 7384 mutex_unlock(&rt_constraints_mutex);
7385 7385
7386 return err; 7386 return err;
7387 } 7387 }
7388 7388
7389 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 7389 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7390 { 7390 {
7391 u64 rt_runtime, rt_period; 7391 u64 rt_runtime, rt_period;
7392 7392
7393 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 7393 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7394 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 7394 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7395 if (rt_runtime_us < 0) 7395 if (rt_runtime_us < 0)
7396 rt_runtime = RUNTIME_INF; 7396 rt_runtime = RUNTIME_INF;
7397 7397
7398 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7398 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7399 } 7399 }
7400 7400
7401 long sched_group_rt_runtime(struct task_group *tg) 7401 long sched_group_rt_runtime(struct task_group *tg)
7402 { 7402 {
7403 u64 rt_runtime_us; 7403 u64 rt_runtime_us;
7404 7404
7405 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 7405 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
7406 return -1; 7406 return -1;
7407 7407
7408 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 7408 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
7409 do_div(rt_runtime_us, NSEC_PER_USEC); 7409 do_div(rt_runtime_us, NSEC_PER_USEC);
7410 return rt_runtime_us; 7410 return rt_runtime_us;
7411 } 7411 }
7412 7412
7413 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 7413 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7414 { 7414 {
7415 u64 rt_runtime, rt_period; 7415 u64 rt_runtime, rt_period;
7416 7416
7417 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 7417 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
7418 rt_runtime = tg->rt_bandwidth.rt_runtime; 7418 rt_runtime = tg->rt_bandwidth.rt_runtime;
7419 7419
7420 if (rt_period == 0) 7420 if (rt_period == 0)
7421 return -EINVAL; 7421 return -EINVAL;
7422 7422
7423 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7423 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7424 } 7424 }
7425 7425
7426 long sched_group_rt_period(struct task_group *tg) 7426 long sched_group_rt_period(struct task_group *tg)
7427 { 7427 {
7428 u64 rt_period_us; 7428 u64 rt_period_us;
7429 7429
7430 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 7430 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
7431 do_div(rt_period_us, NSEC_PER_USEC); 7431 do_div(rt_period_us, NSEC_PER_USEC);
7432 return rt_period_us; 7432 return rt_period_us;
7433 } 7433 }
7434 7434
7435 static int sched_rt_global_constraints(void) 7435 static int sched_rt_global_constraints(void)
7436 { 7436 {
7437 u64 runtime, period; 7437 u64 runtime, period;
7438 int ret = 0; 7438 int ret = 0;
7439 7439
7440 if (sysctl_sched_rt_period <= 0) 7440 if (sysctl_sched_rt_period <= 0)
7441 return -EINVAL; 7441 return -EINVAL;
7442 7442
7443 runtime = global_rt_runtime(); 7443 runtime = global_rt_runtime();
7444 period = global_rt_period(); 7444 period = global_rt_period();
7445 7445
7446 /* 7446 /*
7447 * Sanity check on the sysctl variables. 7447 * Sanity check on the sysctl variables.
7448 */ 7448 */
7449 if (runtime > period && runtime != RUNTIME_INF) 7449 if (runtime > period && runtime != RUNTIME_INF)
7450 return -EINVAL; 7450 return -EINVAL;
7451 7451
7452 mutex_lock(&rt_constraints_mutex); 7452 mutex_lock(&rt_constraints_mutex);
7453 read_lock(&tasklist_lock); 7453 read_lock(&tasklist_lock);
7454 ret = __rt_schedulable(NULL, 0, 0); 7454 ret = __rt_schedulable(NULL, 0, 0);
7455 read_unlock(&tasklist_lock); 7455 read_unlock(&tasklist_lock);
7456 mutex_unlock(&rt_constraints_mutex); 7456 mutex_unlock(&rt_constraints_mutex);
7457 7457
7458 return ret; 7458 return ret;
7459 } 7459 }
7460 7460
7461 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 7461 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
7462 { 7462 {
7463 /* Don't accept realtime tasks when there is no way for them to run */ 7463 /* Don't accept realtime tasks when there is no way for them to run */
7464 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 7464 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
7465 return 0; 7465 return 0;
7466 7466
7467 return 1; 7467 return 1;
7468 } 7468 }
7469 7469
7470 #else /* !CONFIG_RT_GROUP_SCHED */ 7470 #else /* !CONFIG_RT_GROUP_SCHED */
7471 static int sched_rt_global_constraints(void) 7471 static int sched_rt_global_constraints(void)
7472 { 7472 {
7473 unsigned long flags; 7473 unsigned long flags;
7474 int i; 7474 int i;
7475 7475
7476 if (sysctl_sched_rt_period <= 0) 7476 if (sysctl_sched_rt_period <= 0)
7477 return -EINVAL; 7477 return -EINVAL;
7478 7478
7479 /* 7479 /*
7480 * There are always some RT tasks in the root group 7480 * There are always some RT tasks in the root group
7481 * -- migration, kstopmachine etc. 7481 * -- migration, kstopmachine etc.
7482 */ 7482 */
7483 if (sysctl_sched_rt_runtime == 0) 7483 if (sysctl_sched_rt_runtime == 0)
7484 return -EBUSY; 7484 return -EBUSY;
7485 7485
7486 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 7486 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
7487 for_each_possible_cpu(i) { 7487 for_each_possible_cpu(i) {
7488 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 7488 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
7489 7489
7490 raw_spin_lock(&rt_rq->rt_runtime_lock); 7490 raw_spin_lock(&rt_rq->rt_runtime_lock);
7491 rt_rq->rt_runtime = global_rt_runtime(); 7491 rt_rq->rt_runtime = global_rt_runtime();
7492 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7492 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7493 } 7493 }
7494 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 7494 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
7495 7495
7496 return 0; 7496 return 0;
7497 } 7497 }
7498 #endif /* CONFIG_RT_GROUP_SCHED */ 7498 #endif /* CONFIG_RT_GROUP_SCHED */
7499 7499
7500 int sched_rt_handler(struct ctl_table *table, int write, 7500 int sched_rt_handler(struct ctl_table *table, int write,
7501 void __user *buffer, size_t *lenp, 7501 void __user *buffer, size_t *lenp,
7502 loff_t *ppos) 7502 loff_t *ppos)
7503 { 7503 {
7504 int ret; 7504 int ret;
7505 int old_period, old_runtime; 7505 int old_period, old_runtime;
7506 static DEFINE_MUTEX(mutex); 7506 static DEFINE_MUTEX(mutex);
7507 7507
7508 mutex_lock(&mutex); 7508 mutex_lock(&mutex);
7509 old_period = sysctl_sched_rt_period; 7509 old_period = sysctl_sched_rt_period;
7510 old_runtime = sysctl_sched_rt_runtime; 7510 old_runtime = sysctl_sched_rt_runtime;
7511 7511
7512 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7512 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7513 7513
7514 if (!ret && write) { 7514 if (!ret && write) {
7515 ret = sched_rt_global_constraints(); 7515 ret = sched_rt_global_constraints();
7516 if (ret) { 7516 if (ret) {
7517 sysctl_sched_rt_period = old_period; 7517 sysctl_sched_rt_period = old_period;
7518 sysctl_sched_rt_runtime = old_runtime; 7518 sysctl_sched_rt_runtime = old_runtime;
7519 } else { 7519 } else {
7520 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 7520 def_rt_bandwidth.rt_runtime = global_rt_runtime();
7521 def_rt_bandwidth.rt_period = 7521 def_rt_bandwidth.rt_period =
7522 ns_to_ktime(global_rt_period()); 7522 ns_to_ktime(global_rt_period());
7523 } 7523 }
7524 } 7524 }
7525 mutex_unlock(&mutex); 7525 mutex_unlock(&mutex);
7526 7526
7527 return ret; 7527 return ret;
7528 } 7528 }
7529 7529
7530 #ifdef CONFIG_CGROUP_SCHED 7530 #ifdef CONFIG_CGROUP_SCHED
7531 7531
7532 /* return corresponding task_group object of a cgroup */ 7532 /* return corresponding task_group object of a cgroup */
7533 static inline struct task_group *cgroup_tg(struct cgroup *cgrp) 7533 static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7534 { 7534 {
7535 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 7535 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
7536 struct task_group, css); 7536 struct task_group, css);
7537 } 7537 }
7538 7538
7539 static struct cgroup_subsys_state * 7539 static struct cgroup_subsys_state *
7540 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) 7540 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
7541 { 7541 {
7542 struct task_group *tg, *parent; 7542 struct task_group *tg, *parent;
7543 7543
7544 if (!cgrp->parent) { 7544 if (!cgrp->parent) {
7545 /* This is early initialization for the top cgroup */ 7545 /* This is early initialization for the top cgroup */
7546 return &root_task_group.css; 7546 return &root_task_group.css;
7547 } 7547 }
7548 7548
7549 parent = cgroup_tg(cgrp->parent); 7549 parent = cgroup_tg(cgrp->parent);
7550 tg = sched_create_group(parent); 7550 tg = sched_create_group(parent);
7551 if (IS_ERR(tg)) 7551 if (IS_ERR(tg))
7552 return ERR_PTR(-ENOMEM); 7552 return ERR_PTR(-ENOMEM);
7553 7553
7554 return &tg->css; 7554 return &tg->css;
7555 } 7555 }
7556 7556
7557 static void 7557 static void
7558 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 7558 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
7559 { 7559 {
7560 struct task_group *tg = cgroup_tg(cgrp); 7560 struct task_group *tg = cgroup_tg(cgrp);
7561 7561
7562 sched_destroy_group(tg); 7562 sched_destroy_group(tg);
7563 } 7563 }
7564 7564
7565 static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 7565 static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7566 struct cgroup_taskset *tset) 7566 struct cgroup_taskset *tset)
7567 { 7567 {
7568 struct task_struct *task; 7568 struct task_struct *task;
7569 7569
7570 cgroup_taskset_for_each(task, cgrp, tset) { 7570 cgroup_taskset_for_each(task, cgrp, tset) {
7571 #ifdef CONFIG_RT_GROUP_SCHED 7571 #ifdef CONFIG_RT_GROUP_SCHED
7572 if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) 7572 if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
7573 return -EINVAL; 7573 return -EINVAL;
7574 #else 7574 #else
7575 /* We don't support RT-tasks being in separate groups */ 7575 /* We don't support RT-tasks being in separate groups */
7576 if (task->sched_class != &fair_sched_class) 7576 if (task->sched_class != &fair_sched_class)
7577 return -EINVAL; 7577 return -EINVAL;
7578 #endif 7578 #endif
7579 } 7579 }
7580 return 0; 7580 return 0;
7581 } 7581 }
7582 7582
7583 static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 7583 static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7584 struct cgroup_taskset *tset) 7584 struct cgroup_taskset *tset)
7585 { 7585 {
7586 struct task_struct *task; 7586 struct task_struct *task;
7587 7587
7588 cgroup_taskset_for_each(task, cgrp, tset) 7588 cgroup_taskset_for_each(task, cgrp, tset)
7589 sched_move_task(task); 7589 sched_move_task(task);
7590 } 7590 }
7591 7591
7592 static void 7592 static void
7593 cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 7593 cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7594 struct cgroup *old_cgrp, struct task_struct *task) 7594 struct cgroup *old_cgrp, struct task_struct *task)
7595 { 7595 {
7596 /* 7596 /*
7597 * cgroup_exit() is called in the copy_process() failure path. 7597 * cgroup_exit() is called in the copy_process() failure path.
7598 * Ignore this case since the task hasn't run yet; this avoids 7598 * Ignore this case since the task hasn't run yet; this avoids
7599 * trying to poke a half freed task state from generic code. 7599 * trying to poke a half freed task state from generic code.
7600 */ 7600 */
7601 if (!(task->flags & PF_EXITING)) 7601 if (!(task->flags & PF_EXITING))
7602 return; 7602 return;
7603 7603
7604 sched_move_task(task); 7604 sched_move_task(task);
7605 } 7605 }
7606 7606
7607 #ifdef CONFIG_FAIR_GROUP_SCHED 7607 #ifdef CONFIG_FAIR_GROUP_SCHED
7608 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7608 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
7609 u64 shareval) 7609 u64 shareval)
7610 { 7610 {
7611 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); 7611 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
7612 } 7612 }
7613 7613
7614 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 7614 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
7615 { 7615 {
7616 struct task_group *tg = cgroup_tg(cgrp); 7616 struct task_group *tg = cgroup_tg(cgrp);
7617 7617
7618 return (u64) scale_load_down(tg->shares); 7618 return (u64) scale_load_down(tg->shares);
7619 } 7619 }
7620 7620
7621 #ifdef CONFIG_CFS_BANDWIDTH 7621 #ifdef CONFIG_CFS_BANDWIDTH
7622 static DEFINE_MUTEX(cfs_constraints_mutex); 7622 static DEFINE_MUTEX(cfs_constraints_mutex);
7623 7623
7624 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ 7624 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
7625 const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ 7625 const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
7626 7626
7627 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); 7627 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
7628 7628
7629 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 7629 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7630 { 7630 {
7631 int i, ret = 0, runtime_enabled, runtime_was_enabled; 7631 int i, ret = 0, runtime_enabled, runtime_was_enabled;
7632 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7632 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7633 7633
7634 if (tg == &root_task_group) 7634 if (tg == &root_task_group)
7635 return -EINVAL; 7635 return -EINVAL;
7636 7636
7637 /* 7637 /*
7638 * Ensure we have at least some amount of bandwidth every period. This is 7638 * Ensure we have at least some amount of bandwidth every period. This is
7639 * to prevent reaching a state of large arrears when throttled via 7639 * to prevent reaching a state of large arrears when throttled via
7640 * entity_tick() resulting in prolonged exit starvation. 7640 * entity_tick() resulting in prolonged exit starvation.
7641 */ 7641 */
7642 if (quota < min_cfs_quota_period || period < min_cfs_quota_period) 7642 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
7643 return -EINVAL; 7643 return -EINVAL;
7644 7644
7645 /* 7645 /*
7646 * Likewise, bound things on the other side by preventing insane quota 7646 * Likewise, bound things on the other side by preventing insane quota
7647 * periods. This also allows us to normalize in computing quota 7647 * periods. This also allows us to normalize in computing quota
7648 * feasibility. 7648 * feasibility.
7649 */ 7649 */
7650 if (period > max_cfs_quota_period) 7650 if (period > max_cfs_quota_period)
7651 return -EINVAL; 7651 return -EINVAL;
7652 7652
7653 mutex_lock(&cfs_constraints_mutex); 7653 mutex_lock(&cfs_constraints_mutex);
7654 ret = __cfs_schedulable(tg, period, quota); 7654 ret = __cfs_schedulable(tg, period, quota);
7655 if (ret) 7655 if (ret)
7656 goto out_unlock; 7656 goto out_unlock;
7657 7657
7658 runtime_enabled = quota != RUNTIME_INF; 7658 runtime_enabled = quota != RUNTIME_INF;
7659 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 7659 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7660 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); 7660 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
7661 raw_spin_lock_irq(&cfs_b->lock); 7661 raw_spin_lock_irq(&cfs_b->lock);
7662 cfs_b->period = ns_to_ktime(period); 7662 cfs_b->period = ns_to_ktime(period);
7663 cfs_b->quota = quota; 7663 cfs_b->quota = quota;
7664 7664
7665 __refill_cfs_bandwidth_runtime(cfs_b); 7665 __refill_cfs_bandwidth_runtime(cfs_b);
7666 /* restart the period timer (if active) to handle new period expiry */ 7666 /* restart the period timer (if active) to handle new period expiry */
7667 if (runtime_enabled && cfs_b->timer_active) { 7667 if (runtime_enabled && cfs_b->timer_active) {
7668 /* force a reprogram */ 7668 /* force a reprogram */
7669 cfs_b->timer_active = 0; 7669 cfs_b->timer_active = 0;
7670 __start_cfs_bandwidth(cfs_b); 7670 __start_cfs_bandwidth(cfs_b);
7671 } 7671 }
7672 raw_spin_unlock_irq(&cfs_b->lock); 7672 raw_spin_unlock_irq(&cfs_b->lock);
7673 7673
7674 for_each_possible_cpu(i) { 7674 for_each_possible_cpu(i) {
7675 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7675 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7676 struct rq *rq = cfs_rq->rq; 7676 struct rq *rq = cfs_rq->rq;
7677 7677
7678 raw_spin_lock_irq(&rq->lock); 7678 raw_spin_lock_irq(&rq->lock);
7679 cfs_rq->runtime_enabled = runtime_enabled; 7679 cfs_rq->runtime_enabled = runtime_enabled;
7680 cfs_rq->runtime_remaining = 0; 7680 cfs_rq->runtime_remaining = 0;
7681 7681
7682 if (cfs_rq->throttled) 7682 if (cfs_rq->throttled)
7683 unthrottle_cfs_rq(cfs_rq); 7683 unthrottle_cfs_rq(cfs_rq);
7684 raw_spin_unlock_irq(&rq->lock); 7684 raw_spin_unlock_irq(&rq->lock);
7685 } 7685 }
7686 out_unlock: 7686 out_unlock:
7687 mutex_unlock(&cfs_constraints_mutex); 7687 mutex_unlock(&cfs_constraints_mutex);
7688 7688
7689 return ret; 7689 return ret;
7690 } 7690 }
7691 7691
7692 int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) 7692 int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
7693 { 7693 {
7694 u64 quota, period; 7694 u64 quota, period;
7695 7695
7696 period = ktime_to_ns(tg->cfs_bandwidth.period); 7696 period = ktime_to_ns(tg->cfs_bandwidth.period);
7697 if (cfs_quota_us < 0) 7697 if (cfs_quota_us < 0)
7698 quota = RUNTIME_INF; 7698 quota = RUNTIME_INF;
7699 else 7699 else
7700 quota = (u64)cfs_quota_us * NSEC_PER_USEC; 7700 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
7701 7701
7702 return tg_set_cfs_bandwidth(tg, period, quota); 7702 return tg_set_cfs_bandwidth(tg, period, quota);
7703 } 7703 }
7704 7704
7705 long tg_get_cfs_quota(struct task_group *tg) 7705 long tg_get_cfs_quota(struct task_group *tg)
7706 { 7706 {
7707 u64 quota_us; 7707 u64 quota_us;
7708 7708
7709 if (tg->cfs_bandwidth.quota == RUNTIME_INF) 7709 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
7710 return -1; 7710 return -1;
7711 7711
7712 quota_us = tg->cfs_bandwidth.quota; 7712 quota_us = tg->cfs_bandwidth.quota;
7713 do_div(quota_us, NSEC_PER_USEC); 7713 do_div(quota_us, NSEC_PER_USEC);
7714 7714
7715 return quota_us; 7715 return quota_us;
7716 } 7716 }
7717 7717
7718 int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) 7718 int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
7719 { 7719 {
7720 u64 quota, period; 7720 u64 quota, period;
7721 7721
7722 period = (u64)cfs_period_us * NSEC_PER_USEC; 7722 period = (u64)cfs_period_us * NSEC_PER_USEC;
7723 quota = tg->cfs_bandwidth.quota; 7723 quota = tg->cfs_bandwidth.quota;
7724 7724
7725 return tg_set_cfs_bandwidth(tg, period, quota); 7725 return tg_set_cfs_bandwidth(tg, period, quota);
7726 } 7726 }
7727 7727
7728 long tg_get_cfs_period(struct task_group *tg) 7728 long tg_get_cfs_period(struct task_group *tg)
7729 { 7729 {
7730 u64 cfs_period_us; 7730 u64 cfs_period_us;
7731 7731
7732 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); 7732 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
7733 do_div(cfs_period_us, NSEC_PER_USEC); 7733 do_div(cfs_period_us, NSEC_PER_USEC);
7734 7734
7735 return cfs_period_us; 7735 return cfs_period_us;
7736 } 7736 }
7737 7737
7738 static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) 7738 static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
7739 { 7739 {
7740 return tg_get_cfs_quota(cgroup_tg(cgrp)); 7740 return tg_get_cfs_quota(cgroup_tg(cgrp));
7741 } 7741 }
7742 7742
7743 static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, 7743 static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
7744 s64 cfs_quota_us) 7744 s64 cfs_quota_us)
7745 { 7745 {
7746 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); 7746 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
7747 } 7747 }
7748 7748
7749 static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) 7749 static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
7750 { 7750 {
7751 return tg_get_cfs_period(cgroup_tg(cgrp)); 7751 return tg_get_cfs_period(cgroup_tg(cgrp));
7752 } 7752 }
7753 7753
7754 static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, 7754 static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
7755 u64 cfs_period_us) 7755 u64 cfs_period_us)
7756 { 7756 {
7757 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); 7757 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
7758 } 7758 }
7759 7759
7760 struct cfs_schedulable_data { 7760 struct cfs_schedulable_data {
7761 struct task_group *tg; 7761 struct task_group *tg;
7762 u64 period, quota; 7762 u64 period, quota;
7763 }; 7763 };
7764 7764
7765 /* 7765 /*
7766 * normalize group quota/period to be quota/max_period 7766 * normalize group quota/period to be quota/max_period
7767 * note: units are usecs 7767 * note: units are usecs
7768 */ 7768 */
7769 static u64 normalize_cfs_quota(struct task_group *tg, 7769 static u64 normalize_cfs_quota(struct task_group *tg,
7770 struct cfs_schedulable_data *d) 7770 struct cfs_schedulable_data *d)
7771 { 7771 {
7772 u64 quota, period; 7772 u64 quota, period;
7773 7773
7774 if (tg == d->tg) { 7774 if (tg == d->tg) {
7775 period = d->period; 7775 period = d->period;
7776 quota = d->quota; 7776 quota = d->quota;
7777 } else { 7777 } else {
7778 period = tg_get_cfs_period(tg); 7778 period = tg_get_cfs_period(tg);
7779 quota = tg_get_cfs_quota(tg); 7779 quota = tg_get_cfs_quota(tg);
7780 } 7780 }
7781 7781
7782 /* note: these should typically be equivalent */ 7782 /* note: these should typically be equivalent */
7783 if (quota == RUNTIME_INF || quota == -1) 7783 if (quota == RUNTIME_INF || quota == -1)
7784 return RUNTIME_INF; 7784 return RUNTIME_INF;
7785 7785
7786 return to_ratio(period, quota); 7786 return to_ratio(period, quota);
7787 } 7787 }
7788 7788
7789 static int tg_cfs_schedulable_down(struct task_group *tg, void *data) 7789 static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
7790 { 7790 {
7791 struct cfs_schedulable_data *d = data; 7791 struct cfs_schedulable_data *d = data;
7792 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7792 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7793 s64 quota = 0, parent_quota = -1; 7793 s64 quota = 0, parent_quota = -1;
7794 7794
7795 if (!tg->parent) { 7795 if (!tg->parent) {
7796 quota = RUNTIME_INF; 7796 quota = RUNTIME_INF;
7797 } else { 7797 } else {
7798 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; 7798 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
7799 7799
7800 quota = normalize_cfs_quota(tg, d); 7800 quota = normalize_cfs_quota(tg, d);
7801 parent_quota = parent_b->hierarchal_quota; 7801 parent_quota = parent_b->hierarchal_quota;
7802 7802
7803 /* 7803 /*
7804 * ensure max(child_quota) <= parent_quota, inherit when no 7804 * ensure max(child_quota) <= parent_quota, inherit when no
7805 * limit is set 7805 * limit is set
7806 */ 7806 */
7807 if (quota == RUNTIME_INF) 7807 if (quota == RUNTIME_INF)
7808 quota = parent_quota; 7808 quota = parent_quota;
7809 else if (parent_quota != RUNTIME_INF && quota > parent_quota) 7809 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
7810 return -EINVAL; 7810 return -EINVAL;
7811 } 7811 }
7812 cfs_b->hierarchal_quota = quota; 7812 cfs_b->hierarchal_quota = quota;
7813 7813
7814 return 0; 7814 return 0;
7815 } 7815 }
7816 7816
7817 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) 7817 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7818 { 7818 {
7819 int ret; 7819 int ret;
7820 struct cfs_schedulable_data data = { 7820 struct cfs_schedulable_data data = {
7821 .tg = tg, 7821 .tg = tg,
7822 .period = period, 7822 .period = period,
7823 .quota = quota, 7823 .quota = quota,
7824 }; 7824 };
7825 7825
7826 if (quota != RUNTIME_INF) { 7826 if (quota != RUNTIME_INF) {
7827 do_div(data.period, NSEC_PER_USEC); 7827 do_div(data.period, NSEC_PER_USEC);
7828 do_div(data.quota, NSEC_PER_USEC); 7828 do_div(data.quota, NSEC_PER_USEC);
7829 } 7829 }
7830 7830
7831 rcu_read_lock(); 7831 rcu_read_lock();
7832 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); 7832 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
7833 rcu_read_unlock(); 7833 rcu_read_unlock();
7834 7834
7835 return ret; 7835 return ret;
7836 } 7836 }
7837 7837
7838 static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, 7838 static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
7839 struct cgroup_map_cb *cb) 7839 struct cgroup_map_cb *cb)
7840 { 7840 {
7841 struct task_group *tg = cgroup_tg(cgrp); 7841 struct task_group *tg = cgroup_tg(cgrp);
7842 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7842 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7843 7843
7844 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7844 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
7845 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); 7845 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
7846 cb->fill(cb, "throttled_time", cfs_b->throttled_time); 7846 cb->fill(cb, "throttled_time", cfs_b->throttled_time);
7847 7847
7848 return 0; 7848 return 0;
7849 } 7849 }
7850 #endif /* CONFIG_CFS_BANDWIDTH */ 7850 #endif /* CONFIG_CFS_BANDWIDTH */
7851 #endif /* CONFIG_FAIR_GROUP_SCHED */ 7851 #endif /* CONFIG_FAIR_GROUP_SCHED */
7852 7852
7853 #ifdef CONFIG_RT_GROUP_SCHED 7853 #ifdef CONFIG_RT_GROUP_SCHED
7854 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 7854 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
7855 s64 val) 7855 s64 val)
7856 { 7856 {
7857 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 7857 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
7858 } 7858 }
7859 7859
7860 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 7860 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
7861 { 7861 {
7862 return sched_group_rt_runtime(cgroup_tg(cgrp)); 7862 return sched_group_rt_runtime(cgroup_tg(cgrp));
7863 } 7863 }
7864 7864
7865 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 7865 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7866 u64 rt_period_us) 7866 u64 rt_period_us)
7867 { 7867 {
7868 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 7868 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
7869 } 7869 }
7870 7870
7871 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 7871 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
7872 { 7872 {
7873 return sched_group_rt_period(cgroup_tg(cgrp)); 7873 return sched_group_rt_period(cgroup_tg(cgrp));
7874 } 7874 }
7875 #endif /* CONFIG_RT_GROUP_SCHED */ 7875 #endif /* CONFIG_RT_GROUP_SCHED */
7876 7876
7877 static struct cftype cpu_files[] = { 7877 static struct cftype cpu_files[] = {
7878 #ifdef CONFIG_FAIR_GROUP_SCHED 7878 #ifdef CONFIG_FAIR_GROUP_SCHED
7879 { 7879 {
7880 .name = "shares", 7880 .name = "shares",
7881 .read_u64 = cpu_shares_read_u64, 7881 .read_u64 = cpu_shares_read_u64,
7882 .write_u64 = cpu_shares_write_u64, 7882 .write_u64 = cpu_shares_write_u64,
7883 }, 7883 },
7884 #endif 7884 #endif
7885 #ifdef CONFIG_CFS_BANDWIDTH 7885 #ifdef CONFIG_CFS_BANDWIDTH
7886 { 7886 {
7887 .name = "cfs_quota_us", 7887 .name = "cfs_quota_us",
7888 .read_s64 = cpu_cfs_quota_read_s64, 7888 .read_s64 = cpu_cfs_quota_read_s64,
7889 .write_s64 = cpu_cfs_quota_write_s64, 7889 .write_s64 = cpu_cfs_quota_write_s64,
7890 }, 7890 },
7891 { 7891 {
7892 .name = "cfs_period_us", 7892 .name = "cfs_period_us",
7893 .read_u64 = cpu_cfs_period_read_u64, 7893 .read_u64 = cpu_cfs_period_read_u64,
7894 .write_u64 = cpu_cfs_period_write_u64, 7894 .write_u64 = cpu_cfs_period_write_u64,
7895 }, 7895 },
7896 { 7896 {
7897 .name = "stat", 7897 .name = "stat",
7898 .read_map = cpu_stats_show, 7898 .read_map = cpu_stats_show,
7899 }, 7899 },
7900 #endif 7900 #endif
7901 #ifdef CONFIG_RT_GROUP_SCHED 7901 #ifdef CONFIG_RT_GROUP_SCHED
7902 { 7902 {
7903 .name = "rt_runtime_us", 7903 .name = "rt_runtime_us",
7904 .read_s64 = cpu_rt_runtime_read, 7904 .read_s64 = cpu_rt_runtime_read,
7905 .write_s64 = cpu_rt_runtime_write, 7905 .write_s64 = cpu_rt_runtime_write,
7906 }, 7906 },
7907 { 7907 {
7908 .name = "rt_period_us", 7908 .name = "rt_period_us",
7909 .read_u64 = cpu_rt_period_read_uint, 7909 .read_u64 = cpu_rt_period_read_uint,
7910 .write_u64 = cpu_rt_period_write_uint, 7910 .write_u64 = cpu_rt_period_write_uint,
7911 }, 7911 },
7912 #endif 7912 #endif
7913 }; 7913 };
7914 7914
7915 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 7915 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
7916 { 7916 {
7917 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); 7917 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
7918 } 7918 }
7919 7919
7920 struct cgroup_subsys cpu_cgroup_subsys = { 7920 struct cgroup_subsys cpu_cgroup_subsys = {
7921 .name = "cpu", 7921 .name = "cpu",
7922 .create = cpu_cgroup_create, 7922 .create = cpu_cgroup_create,
7923 .destroy = cpu_cgroup_destroy, 7923 .destroy = cpu_cgroup_destroy,
7924 .can_attach = cpu_cgroup_can_attach, 7924 .can_attach = cpu_cgroup_can_attach,
7925 .attach = cpu_cgroup_attach, 7925 .attach = cpu_cgroup_attach,
7926 .exit = cpu_cgroup_exit, 7926 .exit = cpu_cgroup_exit,
7927 .populate = cpu_cgroup_populate, 7927 .populate = cpu_cgroup_populate,
7928 .subsys_id = cpu_cgroup_subsys_id, 7928 .subsys_id = cpu_cgroup_subsys_id,
7929 .early_init = 1, 7929 .early_init = 1,
7930 }; 7930 };
7931 7931
7932 #endif /* CONFIG_CGROUP_SCHED */ 7932 #endif /* CONFIG_CGROUP_SCHED */
7933 7933
7934 #ifdef CONFIG_CGROUP_CPUACCT 7934 #ifdef CONFIG_CGROUP_CPUACCT
7935 7935
7936 /* 7936 /*
7937 * CPU accounting code for task groups. 7937 * CPU accounting code for task groups.
7938 * 7938 *
7939 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh 7939 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
7940 * (balbir@in.ibm.com). 7940 * (balbir@in.ibm.com).
7941 */ 7941 */
7942 7942
7943 /* create a new cpu accounting group */ 7943 /* create a new cpu accounting group */
7944 static struct cgroup_subsys_state *cpuacct_create( 7944 static struct cgroup_subsys_state *cpuacct_create(
7945 struct cgroup_subsys *ss, struct cgroup *cgrp) 7945 struct cgroup_subsys *ss, struct cgroup *cgrp)
7946 { 7946 {
7947 struct cpuacct *ca; 7947 struct cpuacct *ca;
7948 7948
7949 if (!cgrp->parent) 7949 if (!cgrp->parent)
7950 return &root_cpuacct.css; 7950 return &root_cpuacct.css;
7951 7951
7952 ca = kzalloc(sizeof(*ca), GFP_KERNEL); 7952 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
7953 if (!ca) 7953 if (!ca)
7954 goto out; 7954 goto out;
7955 7955
7956 ca->cpuusage = alloc_percpu(u64); 7956 ca->cpuusage = alloc_percpu(u64);
7957 if (!ca->cpuusage) 7957 if (!ca->cpuusage)
7958 goto out_free_ca; 7958 goto out_free_ca;
7959 7959
7960 ca->cpustat = alloc_percpu(struct kernel_cpustat); 7960 ca->cpustat = alloc_percpu(struct kernel_cpustat);
7961 if (!ca->cpustat) 7961 if (!ca->cpustat)
7962 goto out_free_cpuusage; 7962 goto out_free_cpuusage;
7963 7963
7964 return &ca->css; 7964 return &ca->css;
7965 7965
7966 out_free_cpuusage: 7966 out_free_cpuusage:
7967 free_percpu(ca->cpuusage); 7967 free_percpu(ca->cpuusage);
7968 out_free_ca: 7968 out_free_ca:
7969 kfree(ca); 7969 kfree(ca);
7970 out: 7970 out:
7971 return ERR_PTR(-ENOMEM); 7971 return ERR_PTR(-ENOMEM);
7972 } 7972 }
7973 7973
7974 /* destroy an existing cpu accounting group */ 7974 /* destroy an existing cpu accounting group */
7975 static void 7975 static void
7976 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 7976 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
7977 { 7977 {
7978 struct cpuacct *ca = cgroup_ca(cgrp); 7978 struct cpuacct *ca = cgroup_ca(cgrp);
7979 7979
7980 free_percpu(ca->cpustat); 7980 free_percpu(ca->cpustat);
7981 free_percpu(ca->cpuusage); 7981 free_percpu(ca->cpuusage);
7982 kfree(ca); 7982 kfree(ca);
7983 } 7983 }
7984 7984
7985 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) 7985 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
7986 { 7986 {
7987 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 7987 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
7988 u64 data; 7988 u64 data;
7989 7989
7990 #ifndef CONFIG_64BIT 7990 #ifndef CONFIG_64BIT
7991 /* 7991 /*
7992 * Take rq->lock to make 64-bit read safe on 32-bit platforms. 7992 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
7993 */ 7993 */
7994 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 7994 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
7995 data = *cpuusage; 7995 data = *cpuusage;
7996 raw_spin_unlock_irq(&cpu_rq(cpu)->lock); 7996 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
7997 #else 7997 #else
7998 data = *cpuusage; 7998 data = *cpuusage;
7999 #endif 7999 #endif
8000 8000
8001 return data; 8001 return data;
8002 } 8002 }
8003 8003
8004 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) 8004 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
8005 { 8005 {
8006 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8006 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8007 8007
8008 #ifndef CONFIG_64BIT 8008 #ifndef CONFIG_64BIT
8009 /* 8009 /*
8010 * Take rq->lock to make 64-bit write safe on 32-bit platforms. 8010 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
8011 */ 8011 */
8012 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 8012 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8013 *cpuusage = val; 8013 *cpuusage = val;
8014 raw_spin_unlock_irq(&cpu_rq(cpu)->lock); 8014 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8015 #else 8015 #else
8016 *cpuusage = val; 8016 *cpuusage = val;
8017 #endif 8017 #endif
8018 } 8018 }
8019 8019
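The locking comments in cpuacct_cpuusage_read()/cpuacct_cpuusage_write() above deserve a concrete picture: on a 32-bit machine a 64-bit counter is accessed as two 32-bit halves, so an unlocked reader can observe a value the writer never stored. The standalone userspace illustration below is not kernel code (all names are invented); it simulates the problematic interleaving explicitly.

    #include <stdio.h>
    #include <stdint.h>

    /* A u64 counter as a 32-bit CPU sees it: two separately loaded halves. */
    struct split_u64 {
            uint32_t lo;
            uint32_t hi;
    };

    int main(void)
    {
            struct split_u64 counter = { .lo = 0xffffffffu, .hi = 0 }; /* 0x00000000ffffffff */

            uint32_t lo = counter.lo;     /* reader loads the low half            */

            counter.lo = 0;               /* writer stores 0x0000000100000000 ... */
            counter.hi = 1;               /* ... between the two reader loads     */

            uint32_t hi = counter.hi;     /* reader loads the high half           */

            uint64_t seen = ((uint64_t)hi << 32) | lo;
            printf("torn read: 0x%016llx (neither the old nor the new value)\n",
                   (unsigned long long)seen);   /* prints 0x00000001ffffffff      */
            return 0;
    }

Holding rq->lock around the access, as the two functions above do, rules out exactly this interleaving.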
8020 /* return total cpu usage (in nanoseconds) of a group */ 8020 /* return total cpu usage (in nanoseconds) of a group */
8021 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 8021 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
8022 { 8022 {
8023 struct cpuacct *ca = cgroup_ca(cgrp); 8023 struct cpuacct *ca = cgroup_ca(cgrp);
8024 u64 totalcpuusage = 0; 8024 u64 totalcpuusage = 0;
8025 int i; 8025 int i;
8026 8026
8027 for_each_present_cpu(i) 8027 for_each_present_cpu(i)
8028 totalcpuusage += cpuacct_cpuusage_read(ca, i); 8028 totalcpuusage += cpuacct_cpuusage_read(ca, i);
8029 8029
8030 return totalcpuusage; 8030 return totalcpuusage;
8031 } 8031 }
8032 8032
8033 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, 8033 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
8034 u64 reset) 8034 u64 reset)
8035 { 8035 {
8036 struct cpuacct *ca = cgroup_ca(cgrp); 8036 struct cpuacct *ca = cgroup_ca(cgrp);
8037 int err = 0; 8037 int err = 0;
8038 int i; 8038 int i;
8039 8039
8040 if (reset) { 8040 if (reset) {
8041 err = -EINVAL; 8041 err = -EINVAL;
8042 goto out; 8042 goto out;
8043 } 8043 }
8044 8044
8045 for_each_present_cpu(i) 8045 for_each_present_cpu(i)
8046 cpuacct_cpuusage_write(ca, i, 0); 8046 cpuacct_cpuusage_write(ca, i, 0);
8047 8047
8048 out: 8048 out:
8049 return err; 8049 return err;
8050 } 8050 }
8051 8051
8052 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, 8052 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
8053 struct seq_file *m) 8053 struct seq_file *m)
8054 { 8054 {
8055 struct cpuacct *ca = cgroup_ca(cgroup); 8055 struct cpuacct *ca = cgroup_ca(cgroup);
8056 u64 percpu; 8056 u64 percpu;
8057 int i; 8057 int i;
8058 8058
8059 for_each_present_cpu(i) { 8059 for_each_present_cpu(i) {
8060 percpu = cpuacct_cpuusage_read(ca, i); 8060 percpu = cpuacct_cpuusage_read(ca, i);
8061 seq_printf(m, "%llu ", (unsigned long long) percpu); 8061 seq_printf(m, "%llu ", (unsigned long long) percpu);
8062 } 8062 }
8063 seq_printf(m, "\n"); 8063 seq_printf(m, "\n");
8064 return 0; 8064 return 0;
8065 } 8065 }
8066 8066
8067 static const char *cpuacct_stat_desc[] = { 8067 static const char *cpuacct_stat_desc[] = {
8068 [CPUACCT_STAT_USER] = "user", 8068 [CPUACCT_STAT_USER] = "user",
8069 [CPUACCT_STAT_SYSTEM] = "system", 8069 [CPUACCT_STAT_SYSTEM] = "system",
8070 }; 8070 };
8071 8071
8072 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 8072 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
8073 struct cgroup_map_cb *cb) 8073 struct cgroup_map_cb *cb)
8074 { 8074 {
8075 struct cpuacct *ca = cgroup_ca(cgrp); 8075 struct cpuacct *ca = cgroup_ca(cgrp);
8076 int cpu; 8076 int cpu;
8077 s64 val = 0; 8077 s64 val = 0;
8078 8078
8079 for_each_online_cpu(cpu) { 8079 for_each_online_cpu(cpu) {
8080 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); 8080 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8081 val += kcpustat->cpustat[CPUTIME_USER]; 8081 val += kcpustat->cpustat[CPUTIME_USER];
8082 val += kcpustat->cpustat[CPUTIME_NICE]; 8082 val += kcpustat->cpustat[CPUTIME_NICE];
8083 } 8083 }
8084 val = cputime64_to_clock_t(val); 8084 val = cputime64_to_clock_t(val);
8085 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); 8085 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
8086 8086
8087 val = 0; 8087 val = 0;
8088 for_each_online_cpu(cpu) { 8088 for_each_online_cpu(cpu) {
8089 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); 8089 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8090 val += kcpustat->cpustat[CPUTIME_SYSTEM]; 8090 val += kcpustat->cpustat[CPUTIME_SYSTEM];
8091 val += kcpustat->cpustat[CPUTIME_IRQ]; 8091 val += kcpustat->cpustat[CPUTIME_IRQ];
8092 val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; 8092 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
8093 } 8093 }
8094 8094
8095 val = cputime64_to_clock_t(val); 8095 val = cputime64_to_clock_t(val);
8096 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); 8096 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
8097 8097
8098 return 0; 8098 return 0;
8099 } 8099 }
8100 8100
8101 static struct cftype files[] = { 8101 static struct cftype files[] = {
8102 { 8102 {
8103 .name = "usage", 8103 .name = "usage",
8104 .read_u64 = cpuusage_read, 8104 .read_u64 = cpuusage_read,
8105 .write_u64 = cpuusage_write, 8105 .write_u64 = cpuusage_write,
8106 }, 8106 },
8107 { 8107 {
8108 .name = "usage_percpu", 8108 .name = "usage_percpu",
8109 .read_seq_string = cpuacct_percpu_seq_read, 8109 .read_seq_string = cpuacct_percpu_seq_read,
8110 }, 8110 },
8111 { 8111 {
8112 .name = "stat", 8112 .name = "stat",
8113 .read_map = cpuacct_stats_show, 8113 .read_map = cpuacct_stats_show,
8114 }, 8114 },
8115 }; 8115 };
8116 8116
8117 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 8117 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
8118 { 8118 {
8119 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); 8119 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
8120 } 8120 }
8121 8121
8122 /* 8122 /*
8123 * charge this task's execution time to its accounting group. 8123 * charge this task's execution time to its accounting group.
8124 * 8124 *
8125 * called with rq->lock held. 8125 * called with rq->lock held.
8126 */ 8126 */
8127 void cpuacct_charge(struct task_struct *tsk, u64 cputime) 8127 void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8128 { 8128 {
8129 struct cpuacct *ca; 8129 struct cpuacct *ca;
8130 int cpu; 8130 int cpu;
8131 8131
8132 if (unlikely(!cpuacct_subsys.active)) 8132 if (unlikely(!cpuacct_subsys.active))
8133 return; 8133 return;
8134 8134
8135 cpu = task_cpu(tsk); 8135 cpu = task_cpu(tsk);
8136 8136
8137 rcu_read_lock(); 8137 rcu_read_lock();
8138 8138
8139 ca = task_ca(tsk); 8139 ca = task_ca(tsk);
8140 8140
8141 for (; ca; ca = parent_ca(ca)) { 8141 for (; ca; ca = parent_ca(ca)) {
8142 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8142 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8143 *cpuusage += cputime; 8143 *cpuusage += cputime;
8144 } 8144 }
8145 8145
8146 rcu_read_unlock(); 8146 rcu_read_unlock();
8147 } 8147 }
8148 8148
8149 struct cgroup_subsys cpuacct_subsys = { 8149 struct cgroup_subsys cpuacct_subsys = {
8150 .name = "cpuacct", 8150 .name = "cpuacct",
8151 .create = cpuacct_create, 8151 .create = cpuacct_create,
8152 .destroy = cpuacct_destroy, 8152 .destroy = cpuacct_destroy,
8153 .populate = cpuacct_populate, 8153 .populate = cpuacct_populate,
8154 .subsys_id = cpuacct_subsys_id, 8154 .subsys_id = cpuacct_subsys_id,
8155 }; 8155 };
8156 #endif /* CONFIG_CGROUP_CPUACCT */ 8156 #endif /* CONFIG_CGROUP_CPUACCT */
8157 8157
1 /* 1 /*
2 * linux/kernel/softirq.c 2 * linux/kernel/softirq.c
3 * 3 *
4 * Copyright (C) 1992 Linus Torvalds 4 * Copyright (C) 1992 Linus Torvalds
5 * 5 *
6 * Distribute under GPLv2. 6 * Distribute under GPLv2.
7 * 7 *
8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) 8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
9 * 9 *
10 * Remote softirq infrastructure is by Jens Axboe. 10 * Remote softirq infrastructure is by Jens Axboe.
11 */ 11 */
12 12
13 #include <linux/export.h> 13 #include <linux/export.h>
14 #include <linux/kernel_stat.h> 14 #include <linux/kernel_stat.h>
15 #include <linux/interrupt.h> 15 #include <linux/interrupt.h>
16 #include <linux/init.h> 16 #include <linux/init.h>
17 #include <linux/mm.h> 17 #include <linux/mm.h>
18 #include <linux/notifier.h> 18 #include <linux/notifier.h>
19 #include <linux/percpu.h> 19 #include <linux/percpu.h>
20 #include <linux/cpu.h> 20 #include <linux/cpu.h>
21 #include <linux/freezer.h> 21 #include <linux/freezer.h>
22 #include <linux/kthread.h> 22 #include <linux/kthread.h>
23 #include <linux/rcupdate.h> 23 #include <linux/rcupdate.h>
24 #include <linux/ftrace.h> 24 #include <linux/ftrace.h>
25 #include <linux/smp.h> 25 #include <linux/smp.h>
26 #include <linux/tick.h> 26 #include <linux/tick.h>
27 27
28 #define CREATE_TRACE_POINTS 28 #define CREATE_TRACE_POINTS
29 #include <trace/events/irq.h> 29 #include <trace/events/irq.h>
30 30
31 #include <asm/irq.h> 31 #include <asm/irq.h>
32 /* 32 /*
33 - No shared variables, all the data are CPU local. 33 - No shared variables, all the data are CPU local.
34 - If a softirq needs serialization, let it serialize itself 34 - If a softirq needs serialization, let it serialize itself
35 by its own spinlocks. 35 by its own spinlocks.
36 - Even if softirq is serialized, only local cpu is marked for 36 - Even if softirq is serialized, only local cpu is marked for
37 execution. Hence, we get something sort of weak cpu binding. 37 execution. Hence, we get something sort of weak cpu binding.
38 Though it is still not clear whether it will result in better locality 38 Though it is still not clear whether it will result in better locality
39 or not. 39 or not.
40 40
41 Examples: 41 Examples:
42 - NET RX softirq. It is multithreaded and does not require 42 - NET RX softirq. It is multithreaded and does not require
43 any global serialization. 43 any global serialization.
44 - NET TX softirq. It kicks software netdevice queues, hence 44 - NET TX softirq. It kicks software netdevice queues, hence
45 it is logically serialized per device, but this serialization 45 it is logically serialized per device, but this serialization
46 is invisible to common code. 46 is invisible to common code.
47 - Tasklets: serialized wrt itself. 47 - Tasklets: serialized wrt itself.
48 */ 48 */
49 49
50 #ifndef __ARCH_IRQ_STAT 50 #ifndef __ARCH_IRQ_STAT
51 irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned; 51 irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned;
52 EXPORT_SYMBOL(irq_stat); 52 EXPORT_SYMBOL(irq_stat);
53 #endif 53 #endif
54 54
55 static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; 55 static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
56 56
57 DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 57 DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
58 58
59 char *softirq_to_name[NR_SOFTIRQS] = { 59 char *softirq_to_name[NR_SOFTIRQS] = {
60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", 60 "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
61 "TASKLET", "SCHED", "HRTIMER", "RCU" 61 "TASKLET", "SCHED", "HRTIMER", "RCU"
62 }; 62 };
63 63
64 /* 64 /*
65 * we cannot loop here indefinitely without risking userspace starvation, 65 * we cannot loop here indefinitely without risking userspace starvation,
66 * but we also don't want to introduce a worst case 1/HZ latency 66 * but we also don't want to introduce a worst case 1/HZ latency
67 * for the pending events, so let the scheduler balance 67 * for the pending events, so let the scheduler balance
68 * the softirq load for us. 68 * the softirq load for us.
69 */ 69 */
70 static void wakeup_softirqd(void) 70 static void wakeup_softirqd(void)
71 { 71 {
72 /* Interrupts are disabled: no need to stop preemption */ 72 /* Interrupts are disabled: no need to stop preemption */
73 struct task_struct *tsk = __this_cpu_read(ksoftirqd); 73 struct task_struct *tsk = __this_cpu_read(ksoftirqd);
74 74
75 if (tsk && tsk->state != TASK_RUNNING) 75 if (tsk && tsk->state != TASK_RUNNING)
76 wake_up_process(tsk); 76 wake_up_process(tsk);
77 } 77 }
78 78
79 /* 79 /*
80 * preempt_count and SOFTIRQ_OFFSET usage: 80 * preempt_count and SOFTIRQ_OFFSET usage:
81 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving 81 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
82 * softirq processing. 82 * softirq processing.
83 * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) 83 * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
84 * on local_bh_disable or local_bh_enable. 84 * on local_bh_disable or local_bh_enable.
85 * This lets us distinguish between whether we are currently processing 85 * This lets us distinguish between whether we are currently processing
86 * softirq and whether we just have bh disabled. 86 * softirq and whether we just have bh disabled.
87 */ 87 */
88 88
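To make the offset scheme above concrete: softirq processing bumps preempt_count by SOFTIRQ_OFFSET while local_bh_disable() bumps it by twice that amount, so the low softirq bit is set only while a softirq handler is actually running. The mock below is a userspace illustration with invented names and values; the real constants and predicates live in the preempt/hardirq headers and are not reproduced here.

    #include <stdio.h>

    #define MOCK_SOFTIRQ_OFFSET          0x100
    #define MOCK_SOFTIRQ_DISABLE_OFFSET  (2 * MOCK_SOFTIRQ_OFFSET)
    #define MOCK_SOFTIRQ_MASK            0xff00

    static unsigned int mock_preempt_count;

    static unsigned int mock_softirq_count(void)
    {
            return mock_preempt_count & MOCK_SOFTIRQ_MASK;
    }

    /* Set only while a softirq runs (the count was bumped by the odd OFFSET). */
    static int mock_in_serving_softirq(void)
    {
            return mock_softirq_count() & MOCK_SOFTIRQ_OFFSET;
    }

    int main(void)
    {
            mock_preempt_count += MOCK_SOFTIRQ_DISABLE_OFFSET;  /* local_bh_disable() */
            printf("bh disabled:     serving=%d\n", !!mock_in_serving_softirq());

            mock_preempt_count += MOCK_SOFTIRQ_OFFSET;          /* enter softirq      */
            printf("serving softirq: serving=%d\n", !!mock_in_serving_softirq());
            return 0;
    }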
89 /* 89 /*
90 * This one is for softirq.c-internal use, 90 * This one is for softirq.c-internal use,
91 * where hardirqs are disabled legitimately: 91 * where hardirqs are disabled legitimately:
92 */ 92 */
93 #ifdef CONFIG_TRACE_IRQFLAGS 93 #ifdef CONFIG_TRACE_IRQFLAGS
94 static void __local_bh_disable(unsigned long ip, unsigned int cnt) 94 static void __local_bh_disable(unsigned long ip, unsigned int cnt)
95 { 95 {
96 unsigned long flags; 96 unsigned long flags;
97 97
98 WARN_ON_ONCE(in_irq()); 98 WARN_ON_ONCE(in_irq());
99 99
100 raw_local_irq_save(flags); 100 raw_local_irq_save(flags);
101 /* 101 /*
102 * The preempt tracer hooks into add_preempt_count and will break 102 * The preempt tracer hooks into add_preempt_count and will break
103 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET 103 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
104 * is set and before current->softirq_enabled is cleared. 104 * is set and before current->softirq_enabled is cleared.
105 * We must manually increment preempt_count here and manually 105 * We must manually increment preempt_count here and manually
106 * call the trace_preempt_off later. 106 * call the trace_preempt_off later.
107 */ 107 */
108 preempt_count() += cnt; 108 preempt_count() += cnt;
109 /* 109 /*
110 * Were softirqs turned off above: 110 * Were softirqs turned off above:
111 */ 111 */
112 if (softirq_count() == cnt) 112 if (softirq_count() == cnt)
113 trace_softirqs_off(ip); 113 trace_softirqs_off(ip);
114 raw_local_irq_restore(flags); 114 raw_local_irq_restore(flags);
115 115
116 if (preempt_count() == cnt) 116 if (preempt_count() == cnt)
117 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 117 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
118 } 118 }
119 #else /* !CONFIG_TRACE_IRQFLAGS */ 119 #else /* !CONFIG_TRACE_IRQFLAGS */
120 static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) 120 static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
121 { 121 {
122 add_preempt_count(cnt); 122 add_preempt_count(cnt);
123 barrier(); 123 barrier();
124 } 124 }
125 #endif /* CONFIG_TRACE_IRQFLAGS */ 125 #endif /* CONFIG_TRACE_IRQFLAGS */
126 126
127 void local_bh_disable(void) 127 void local_bh_disable(void)
128 { 128 {
129 __local_bh_disable((unsigned long)__builtin_return_address(0), 129 __local_bh_disable((unsigned long)__builtin_return_address(0),
130 SOFTIRQ_DISABLE_OFFSET); 130 SOFTIRQ_DISABLE_OFFSET);
131 } 131 }
132 132
133 EXPORT_SYMBOL(local_bh_disable); 133 EXPORT_SYMBOL(local_bh_disable);
134 134
135 static void __local_bh_enable(unsigned int cnt) 135 static void __local_bh_enable(unsigned int cnt)
136 { 136 {
137 WARN_ON_ONCE(in_irq()); 137 WARN_ON_ONCE(in_irq());
138 WARN_ON_ONCE(!irqs_disabled()); 138 WARN_ON_ONCE(!irqs_disabled());
139 139
140 if (softirq_count() == cnt) 140 if (softirq_count() == cnt)
141 trace_softirqs_on((unsigned long)__builtin_return_address(0)); 141 trace_softirqs_on((unsigned long)__builtin_return_address(0));
142 sub_preempt_count(cnt); 142 sub_preempt_count(cnt);
143 } 143 }
144 144
145 /* 145 /*
146 * Special-case - softirqs can safely be enabled in 146 * Special-case - softirqs can safely be enabled in
147 * cond_resched_softirq(), or by __do_softirq(), 147 * cond_resched_softirq(), or by __do_softirq(),
148 * without processing still-pending softirqs: 148 * without processing still-pending softirqs:
149 */ 149 */
150 void _local_bh_enable(void) 150 void _local_bh_enable(void)
151 { 151 {
152 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); 152 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
153 } 153 }
154 154
155 EXPORT_SYMBOL(_local_bh_enable); 155 EXPORT_SYMBOL(_local_bh_enable);
156 156
157 static inline void _local_bh_enable_ip(unsigned long ip) 157 static inline void _local_bh_enable_ip(unsigned long ip)
158 { 158 {
159 WARN_ON_ONCE(in_irq() || irqs_disabled()); 159 WARN_ON_ONCE(in_irq() || irqs_disabled());
160 #ifdef CONFIG_TRACE_IRQFLAGS 160 #ifdef CONFIG_TRACE_IRQFLAGS
161 local_irq_disable(); 161 local_irq_disable();
162 #endif 162 #endif
163 /* 163 /*
164 * Are softirqs going to be turned on now: 164 * Are softirqs going to be turned on now:
165 */ 165 */
166 if (softirq_count() == SOFTIRQ_DISABLE_OFFSET) 166 if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
167 trace_softirqs_on(ip); 167 trace_softirqs_on(ip);
168 /* 168 /*
169 * Keep preemption disabled until we are done with 169 * Keep preemption disabled until we are done with
170 * softirq processing: 170 * softirq processing:
171 */ 171 */
172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); 172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
173 173
174 if (unlikely(!in_interrupt() && local_softirq_pending())) 174 if (unlikely(!in_interrupt() && local_softirq_pending()))
175 do_softirq(); 175 do_softirq();
176 176
177 dec_preempt_count(); 177 dec_preempt_count();
178 #ifdef CONFIG_TRACE_IRQFLAGS 178 #ifdef CONFIG_TRACE_IRQFLAGS
179 local_irq_enable(); 179 local_irq_enable();
180 #endif 180 #endif
181 preempt_check_resched(); 181 preempt_check_resched();
182 } 182 }
183 183
184 void local_bh_enable(void) 184 void local_bh_enable(void)
185 { 185 {
186 _local_bh_enable_ip((unsigned long)__builtin_return_address(0)); 186 _local_bh_enable_ip((unsigned long)__builtin_return_address(0));
187 } 187 }
188 EXPORT_SYMBOL(local_bh_enable); 188 EXPORT_SYMBOL(local_bh_enable);
189 189
190 void local_bh_enable_ip(unsigned long ip) 190 void local_bh_enable_ip(unsigned long ip)
191 { 191 {
192 _local_bh_enable_ip(ip); 192 _local_bh_enable_ip(ip);
193 } 193 }
194 EXPORT_SYMBOL(local_bh_enable_ip); 194 EXPORT_SYMBOL(local_bh_enable_ip);
195 195
196 /* 196 /*
197 * We restart softirq processing MAX_SOFTIRQ_RESTART times, 197 * We restart softirq processing MAX_SOFTIRQ_RESTART times,
198 * and we fall back to softirqd after that. 198 * and we fall back to softirqd after that.
199 * 199 *
200 * This number has been established via experimentation. 200 * This number has been established via experimentation.
201 * The two things to balance are latency and fairness - 201 * The two things to balance are latency and fairness -
202 * we want to handle softirqs as soon as possible, but they 202 * we want to handle softirqs as soon as possible, but they
203 * should not be able to lock up the box. 203 * should not be able to lock up the box.
204 */ 204 */
205 #define MAX_SOFTIRQ_RESTART 10 205 #define MAX_SOFTIRQ_RESTART 10
206 206
207 asmlinkage void __do_softirq(void) 207 asmlinkage void __do_softirq(void)
208 { 208 {
209 struct softirq_action *h; 209 struct softirq_action *h;
210 __u32 pending; 210 __u32 pending;
211 int max_restart = MAX_SOFTIRQ_RESTART; 211 int max_restart = MAX_SOFTIRQ_RESTART;
212 int cpu; 212 int cpu;
213 213
214 pending = local_softirq_pending(); 214 pending = local_softirq_pending();
215 account_system_vtime(current); 215 account_system_vtime(current);
216 216
217 __local_bh_disable((unsigned long)__builtin_return_address(0), 217 __local_bh_disable((unsigned long)__builtin_return_address(0),
218 SOFTIRQ_OFFSET); 218 SOFTIRQ_OFFSET);
219 lockdep_softirq_enter(); 219 lockdep_softirq_enter();
220 220
221 cpu = smp_processor_id(); 221 cpu = smp_processor_id();
222 restart: 222 restart:
223 /* Reset the pending bitmask before enabling irqs */ 223 /* Reset the pending bitmask before enabling irqs */
224 set_softirq_pending(0); 224 set_softirq_pending(0);
225 225
226 local_irq_enable(); 226 local_irq_enable();
227 227
228 h = softirq_vec; 228 h = softirq_vec;
229 229
230 do { 230 do {
231 if (pending & 1) { 231 if (pending & 1) {
232 unsigned int vec_nr = h - softirq_vec; 232 unsigned int vec_nr = h - softirq_vec;
233 int prev_count = preempt_count(); 233 int prev_count = preempt_count();
234 234
235 kstat_incr_softirqs_this_cpu(vec_nr); 235 kstat_incr_softirqs_this_cpu(vec_nr);
236 236
237 trace_softirq_entry(vec_nr); 237 trace_softirq_entry(vec_nr);
238 h->action(h); 238 h->action(h);
239 trace_softirq_exit(vec_nr); 239 trace_softirq_exit(vec_nr);
240 if (unlikely(prev_count != preempt_count())) { 240 if (unlikely(prev_count != preempt_count())) {
241 printk(KERN_ERR "huh, entered softirq %u %s %p" 241 printk(KERN_ERR "huh, entered softirq %u %s %p"
242 "with preempt_count %08x," 242 "with preempt_count %08x,"
243 " exited with %08x?\n", vec_nr, 243 " exited with %08x?\n", vec_nr,
244 softirq_to_name[vec_nr], h->action, 244 softirq_to_name[vec_nr], h->action,
245 prev_count, preempt_count()); 245 prev_count, preempt_count());
246 preempt_count() = prev_count; 246 preempt_count() = prev_count;
247 } 247 }
248 248
249 rcu_bh_qs(cpu); 249 rcu_bh_qs(cpu);
250 } 250 }
251 h++; 251 h++;
252 pending >>= 1; 252 pending >>= 1;
253 } while (pending); 253 } while (pending);
254 254
255 local_irq_disable(); 255 local_irq_disable();
256 256
257 pending = local_softirq_pending(); 257 pending = local_softirq_pending();
258 if (pending && --max_restart) 258 if (pending && --max_restart)
259 goto restart; 259 goto restart;
260 260
261 if (pending) 261 if (pending)
262 wakeup_softirqd(); 262 wakeup_softirqd();
263 263
264 lockdep_softirq_exit(); 264 lockdep_softirq_exit();
265 265
266 account_system_vtime(current); 266 account_system_vtime(current);
267 __local_bh_enable(SOFTIRQ_OFFSET); 267 __local_bh_enable(SOFTIRQ_OFFSET);
268 } 268 }
269 269
270 #ifndef __ARCH_HAS_DO_SOFTIRQ 270 #ifndef __ARCH_HAS_DO_SOFTIRQ
271 271
272 asmlinkage void do_softirq(void) 272 asmlinkage void do_softirq(void)
273 { 273 {
274 __u32 pending; 274 __u32 pending;
275 unsigned long flags; 275 unsigned long flags;
276 276
277 if (in_interrupt()) 277 if (in_interrupt())
278 return; 278 return;
279 279
280 local_irq_save(flags); 280 local_irq_save(flags);
281 281
282 pending = local_softirq_pending(); 282 pending = local_softirq_pending();
283 283
284 if (pending) 284 if (pending)
285 __do_softirq(); 285 __do_softirq();
286 286
287 local_irq_restore(flags); 287 local_irq_restore(flags);
288 } 288 }
289 289
290 #endif 290 #endif
291 291
292 /* 292 /*
293 * Enter an interrupt context. 293 * Enter an interrupt context.
294 */ 294 */
295 void irq_enter(void) 295 void irq_enter(void)
296 { 296 {
297 int cpu = smp_processor_id(); 297 int cpu = smp_processor_id();
298 298
299 rcu_irq_enter(); 299 rcu_irq_enter();
300 if (idle_cpu(cpu) && !in_interrupt()) { 300 if (idle_cpu(cpu) && !in_interrupt()) {
301 /* 301 /*
302 * Prevent raise_softirq from needlessly waking up ksoftirqd 302 * Prevent raise_softirq from needlessly waking up ksoftirqd
303 * here, as softirq will be serviced on return from interrupt. 303 * here, as softirq will be serviced on return from interrupt.
304 */ 304 */
305 local_bh_disable(); 305 local_bh_disable();
306 tick_check_idle(cpu); 306 tick_check_idle(cpu);
307 _local_bh_enable(); 307 _local_bh_enable();
308 } 308 }
309 309
310 __irq_enter(); 310 __irq_enter();
311 } 311 }
312 312
313 #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 313 #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
314 static inline void invoke_softirq(void) 314 static inline void invoke_softirq(void)
315 { 315 {
316 if (!force_irqthreads) 316 if (!force_irqthreads)
317 __do_softirq(); 317 __do_softirq();
318 else { 318 else {
319 __local_bh_disable((unsigned long)__builtin_return_address(0), 319 __local_bh_disable((unsigned long)__builtin_return_address(0),
320 SOFTIRQ_OFFSET); 320 SOFTIRQ_OFFSET);
321 wakeup_softirqd(); 321 wakeup_softirqd();
322 __local_bh_enable(SOFTIRQ_OFFSET); 322 __local_bh_enable(SOFTIRQ_OFFSET);
323 } 323 }
324 } 324 }
325 #else 325 #else
326 static inline void invoke_softirq(void) 326 static inline void invoke_softirq(void)
327 { 327 {
328 if (!force_irqthreads) 328 if (!force_irqthreads)
329 do_softirq(); 329 do_softirq();
330 else { 330 else {
331 __local_bh_disable((unsigned long)__builtin_return_address(0), 331 __local_bh_disable((unsigned long)__builtin_return_address(0),
332 SOFTIRQ_OFFSET); 332 SOFTIRQ_OFFSET);
333 wakeup_softirqd(); 333 wakeup_softirqd();
334 __local_bh_enable(SOFTIRQ_OFFSET); 334 __local_bh_enable(SOFTIRQ_OFFSET);
335 } 335 }
336 } 336 }
337 #endif 337 #endif
338 338
339 /* 339 /*
340 * Exit an interrupt context. Process softirqs if needed and possible: 340 * Exit an interrupt context. Process softirqs if needed and possible:
341 */ 341 */
342 void irq_exit(void) 342 void irq_exit(void)
343 { 343 {
344 account_system_vtime(current); 344 account_system_vtime(current);
345 trace_hardirq_exit(); 345 trace_hardirq_exit();
346 sub_preempt_count(IRQ_EXIT_OFFSET); 346 sub_preempt_count(IRQ_EXIT_OFFSET);
347 if (!in_interrupt() && local_softirq_pending()) 347 if (!in_interrupt() && local_softirq_pending())
348 invoke_softirq(); 348 invoke_softirq();
349 349
350 #ifdef CONFIG_NO_HZ 350 #ifdef CONFIG_NO_HZ
351 /* Make sure that timer wheel updates are propagated */ 351 /* Make sure that timer wheel updates are propagated */
352 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) 352 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
353 tick_nohz_irq_exit(); 353 tick_nohz_irq_exit();
354 #endif 354 #endif
355 rcu_irq_exit(); 355 rcu_irq_exit();
356 preempt_enable_no_resched(); 356 sched_preempt_enable_no_resched();
357 } 357 }
358 358
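At kernel line 356 above, irq_exit() now ends with sched_preempt_enable_no_resched() rather than preempt_enable_no_resched(); the same substitution appears further down in run_ksoftirqd(). The standalone mock that follows (hypothetical names, userspace only, not the kernel's actual definition) sketches why dropping the preemption count without an immediate resched check is acceptable when the caller is known to reach a reschedule point right afterwards.

    #include <stdio.h>

    static int mock_preempt_count = 1;   /* preemption currently disabled    */
    static int mock_need_resched  = 1;   /* a reschedule has been requested  */

    /* Ordinary enable: drop the count and honour a pending resched request. */
    static void mock_preempt_enable(void)
    {
            if (--mock_preempt_count == 0 && mock_need_resched)
                    printf("preempt_enable: reschedule happens here\n");
    }

    /*
     * Scheduler-side no-resched enable: drop the count but skip the check,
     * because the caller (interrupt return after irq_exit(), or an explicit
     * cond_resched()/schedule() right afterwards) performs the reschedule.
     */
    static void mock_sched_preempt_enable_no_resched(void)
    {
            --mock_preempt_count;
    }

    int main(void)
    {
            mock_sched_preempt_enable_no_resched();
            printf("no-resched enable: count=%d, resched left to the caller\n",
                   mock_preempt_count);

            mock_preempt_count = 1;      /* reset, then show the ordinary path */
            mock_preempt_enable();
            return 0;
    }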
359 /* 359 /*
360 * This function must run with irqs disabled! 360 * This function must run with irqs disabled!
361 */ 361 */
362 inline void raise_softirq_irqoff(unsigned int nr) 362 inline void raise_softirq_irqoff(unsigned int nr)
363 { 363 {
364 __raise_softirq_irqoff(nr); 364 __raise_softirq_irqoff(nr);
365 365
366 /* 366 /*
367 * If we're in an interrupt or softirq, we're done 367 * If we're in an interrupt or softirq, we're done
368 * (this also catches softirq-disabled code). We will 368 * (this also catches softirq-disabled code). We will
369 * actually run the softirq once we return from 369 * actually run the softirq once we return from
370 * the irq or softirq. 370 * the irq or softirq.
371 * 371 *
372 * Otherwise we wake up ksoftirqd to make sure we 372 * Otherwise we wake up ksoftirqd to make sure we
373 * schedule the softirq soon. 373 * schedule the softirq soon.
374 */ 374 */
375 if (!in_interrupt()) 375 if (!in_interrupt())
376 wakeup_softirqd(); 376 wakeup_softirqd();
377 } 377 }
378 378
379 void raise_softirq(unsigned int nr) 379 void raise_softirq(unsigned int nr)
380 { 380 {
381 unsigned long flags; 381 unsigned long flags;
382 382
383 local_irq_save(flags); 383 local_irq_save(flags);
384 raise_softirq_irqoff(nr); 384 raise_softirq_irqoff(nr);
385 local_irq_restore(flags); 385 local_irq_restore(flags);
386 } 386 }
387 387
388 void open_softirq(int nr, void (*action)(struct softirq_action *)) 388 void open_softirq(int nr, void (*action)(struct softirq_action *))
389 { 389 {
390 softirq_vec[nr].action = action; 390 softirq_vec[nr].action = action;
391 } 391 }
392 392
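As a usage illustration of the raise_softirq()/open_softirq() pair above: a softirq vector is registered once with its action and later marked pending from interrupt context. This sketch is not part of this file; MY_SOFTIRQ and the handler names are hypothetical, since real vectors come only from the fixed enum listed near the top of this file.

    /* Hypothetical wiring, for illustration only; MY_SOFTIRQ does not exist. */
    #include <linux/init.h>
    #include <linux/interrupt.h>

    static void my_softirq_action(struct softirq_action *a)
    {
            /* Runs in softirq context on the CPU that raised the vector. */
    }

    static int __init my_softirq_setup(void)
    {
            open_softirq(MY_SOFTIRQ, my_softirq_action);   /* register the action */
            return 0;
    }

    static irqreturn_t my_irq_handler(int irq, void *dev_id)
    {
            /* Mark the vector pending; it runs on irq_exit() or in ksoftirqd. */
            raise_softirq(MY_SOFTIRQ);
            return IRQ_HANDLED;
    }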
393 /* 393 /*
394 * Tasklets 394 * Tasklets
395 */ 395 */
396 struct tasklet_head 396 struct tasklet_head
397 { 397 {
398 struct tasklet_struct *head; 398 struct tasklet_struct *head;
399 struct tasklet_struct **tail; 399 struct tasklet_struct **tail;
400 }; 400 };
401 401
402 static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec); 402 static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
403 static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec); 403 static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
404 404
405 void __tasklet_schedule(struct tasklet_struct *t) 405 void __tasklet_schedule(struct tasklet_struct *t)
406 { 406 {
407 unsigned long flags; 407 unsigned long flags;
408 408
409 local_irq_save(flags); 409 local_irq_save(flags);
410 t->next = NULL; 410 t->next = NULL;
411 *__this_cpu_read(tasklet_vec.tail) = t; 411 *__this_cpu_read(tasklet_vec.tail) = t;
412 __this_cpu_write(tasklet_vec.tail, &(t->next)); 412 __this_cpu_write(tasklet_vec.tail, &(t->next));
413 raise_softirq_irqoff(TASKLET_SOFTIRQ); 413 raise_softirq_irqoff(TASKLET_SOFTIRQ);
414 local_irq_restore(flags); 414 local_irq_restore(flags);
415 } 415 }
416 416
417 EXPORT_SYMBOL(__tasklet_schedule); 417 EXPORT_SYMBOL(__tasklet_schedule);
418 418
419 void __tasklet_hi_schedule(struct tasklet_struct *t) 419 void __tasklet_hi_schedule(struct tasklet_struct *t)
420 { 420 {
421 unsigned long flags; 421 unsigned long flags;
422 422
423 local_irq_save(flags); 423 local_irq_save(flags);
424 t->next = NULL; 424 t->next = NULL;
425 *__this_cpu_read(tasklet_hi_vec.tail) = t; 425 *__this_cpu_read(tasklet_hi_vec.tail) = t;
426 __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); 426 __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
427 raise_softirq_irqoff(HI_SOFTIRQ); 427 raise_softirq_irqoff(HI_SOFTIRQ);
428 local_irq_restore(flags); 428 local_irq_restore(flags);
429 } 429 }
430 430
431 EXPORT_SYMBOL(__tasklet_hi_schedule); 431 EXPORT_SYMBOL(__tasklet_hi_schedule);
432 432
433 void __tasklet_hi_schedule_first(struct tasklet_struct *t) 433 void __tasklet_hi_schedule_first(struct tasklet_struct *t)
434 { 434 {
435 BUG_ON(!irqs_disabled()); 435 BUG_ON(!irqs_disabled());
436 436
437 t->next = __this_cpu_read(tasklet_hi_vec.head); 437 t->next = __this_cpu_read(tasklet_hi_vec.head);
438 __this_cpu_write(tasklet_hi_vec.head, t); 438 __this_cpu_write(tasklet_hi_vec.head, t);
439 __raise_softirq_irqoff(HI_SOFTIRQ); 439 __raise_softirq_irqoff(HI_SOFTIRQ);
440 } 440 }
441 441
442 EXPORT_SYMBOL(__tasklet_hi_schedule_first); 442 EXPORT_SYMBOL(__tasklet_hi_schedule_first);
443 443
444 static void tasklet_action(struct softirq_action *a) 444 static void tasklet_action(struct softirq_action *a)
445 { 445 {
446 struct tasklet_struct *list; 446 struct tasklet_struct *list;
447 447
448 local_irq_disable(); 448 local_irq_disable();
449 list = __this_cpu_read(tasklet_vec.head); 449 list = __this_cpu_read(tasklet_vec.head);
450 __this_cpu_write(tasklet_vec.head, NULL); 450 __this_cpu_write(tasklet_vec.head, NULL);
451 __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head); 451 __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head);
452 local_irq_enable(); 452 local_irq_enable();
453 453
454 while (list) { 454 while (list) {
455 struct tasklet_struct *t = list; 455 struct tasklet_struct *t = list;
456 456
457 list = list->next; 457 list = list->next;
458 458
459 if (tasklet_trylock(t)) { 459 if (tasklet_trylock(t)) {
460 if (!atomic_read(&t->count)) { 460 if (!atomic_read(&t->count)) {
461 if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) 461 if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
462 BUG(); 462 BUG();
463 t->func(t->data); 463 t->func(t->data);
464 tasklet_unlock(t); 464 tasklet_unlock(t);
465 continue; 465 continue;
466 } 466 }
467 tasklet_unlock(t); 467 tasklet_unlock(t);
468 } 468 }
469 469
470 local_irq_disable(); 470 local_irq_disable();
471 t->next = NULL; 471 t->next = NULL;
472 *__this_cpu_read(tasklet_vec.tail) = t; 472 *__this_cpu_read(tasklet_vec.tail) = t;
473 __this_cpu_write(tasklet_vec.tail, &(t->next)); 473 __this_cpu_write(tasklet_vec.tail, &(t->next));
474 __raise_softirq_irqoff(TASKLET_SOFTIRQ); 474 __raise_softirq_irqoff(TASKLET_SOFTIRQ);
475 local_irq_enable(); 475 local_irq_enable();
476 } 476 }
477 } 477 }
478 478
479 static void tasklet_hi_action(struct softirq_action *a) 479 static void tasklet_hi_action(struct softirq_action *a)
480 { 480 {
481 struct tasklet_struct *list; 481 struct tasklet_struct *list;
482 482
483 local_irq_disable(); 483 local_irq_disable();
484 list = __this_cpu_read(tasklet_hi_vec.head); 484 list = __this_cpu_read(tasklet_hi_vec.head);
485 __this_cpu_write(tasklet_hi_vec.head, NULL); 485 __this_cpu_write(tasklet_hi_vec.head, NULL);
486 __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head); 486 __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head);
487 local_irq_enable(); 487 local_irq_enable();
488 488
489 while (list) { 489 while (list) {
490 struct tasklet_struct *t = list; 490 struct tasklet_struct *t = list;
491 491
492 list = list->next; 492 list = list->next;
493 493
494 if (tasklet_trylock(t)) { 494 if (tasklet_trylock(t)) {
495 if (!atomic_read(&t->count)) { 495 if (!atomic_read(&t->count)) {
496 if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) 496 if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
497 BUG(); 497 BUG();
498 t->func(t->data); 498 t->func(t->data);
499 tasklet_unlock(t); 499 tasklet_unlock(t);
500 continue; 500 continue;
501 } 501 }
502 tasklet_unlock(t); 502 tasklet_unlock(t);
503 } 503 }
504 504
505 local_irq_disable(); 505 local_irq_disable();
506 t->next = NULL; 506 t->next = NULL;
507 *__this_cpu_read(tasklet_hi_vec.tail) = t; 507 *__this_cpu_read(tasklet_hi_vec.tail) = t;
508 __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); 508 __this_cpu_write(tasklet_hi_vec.tail, &(t->next));
509 __raise_softirq_irqoff(HI_SOFTIRQ); 509 __raise_softirq_irqoff(HI_SOFTIRQ);
510 local_irq_enable(); 510 local_irq_enable();
511 } 511 }
512 } 512 }
513 513
514 514
515 void tasklet_init(struct tasklet_struct *t, 515 void tasklet_init(struct tasklet_struct *t,
516 void (*func)(unsigned long), unsigned long data) 516 void (*func)(unsigned long), unsigned long data)
517 { 517 {
518 t->next = NULL; 518 t->next = NULL;
519 t->state = 0; 519 t->state = 0;
520 atomic_set(&t->count, 0); 520 atomic_set(&t->count, 0);
521 t->func = func; 521 t->func = func;
522 t->data = data; 522 t->data = data;
523 } 523 }
524 524
525 EXPORT_SYMBOL(tasklet_init); 525 EXPORT_SYMBOL(tasklet_init);
526 526
527 void tasklet_kill(struct tasklet_struct *t) 527 void tasklet_kill(struct tasklet_struct *t)
528 { 528 {
529 if (in_interrupt()) 529 if (in_interrupt())
530 printk("Attempt to kill tasklet from interrupt\n"); 530 printk("Attempt to kill tasklet from interrupt\n");
531 531
532 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { 532 while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
533 do { 533 do {
534 yield(); 534 yield();
535 } while (test_bit(TASKLET_STATE_SCHED, &t->state)); 535 } while (test_bit(TASKLET_STATE_SCHED, &t->state));
536 } 536 }
537 tasklet_unlock_wait(t); 537 tasklet_unlock_wait(t);
538 clear_bit(TASKLET_STATE_SCHED, &t->state); 538 clear_bit(TASKLET_STATE_SCHED, &t->state);
539 } 539 }
540 540
541 EXPORT_SYMBOL(tasklet_kill); 541 EXPORT_SYMBOL(tasklet_kill);
542 542
543 /* 543 /*
544 * tasklet_hrtimer 544 * tasklet_hrtimer
545 */ 545 */
546 546
547 /* 547 /*
548 * The trampoline is called when the hrtimer expires. It schedules a tasklet 548 * The trampoline is called when the hrtimer expires. It schedules a tasklet
549 * to run __tasklet_hrtimer_trampoline() which in turn will call the intended 549 * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
550 * hrtimer callback, but from softirq context. 550 * hrtimer callback, but from softirq context.
551 */ 551 */
552 static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) 552 static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
553 { 553 {
554 struct tasklet_hrtimer *ttimer = 554 struct tasklet_hrtimer *ttimer =
555 container_of(timer, struct tasklet_hrtimer, timer); 555 container_of(timer, struct tasklet_hrtimer, timer);
556 556
557 tasklet_hi_schedule(&ttimer->tasklet); 557 tasklet_hi_schedule(&ttimer->tasklet);
558 return HRTIMER_NORESTART; 558 return HRTIMER_NORESTART;
559 } 559 }
560 560
561 /* 561 /*
562 * Helper function which calls the hrtimer callback from 562 * Helper function which calls the hrtimer callback from
563 * tasklet/softirq context 563 * tasklet/softirq context
564 */ 564 */
565 static void __tasklet_hrtimer_trampoline(unsigned long data) 565 static void __tasklet_hrtimer_trampoline(unsigned long data)
566 { 566 {
567 struct tasklet_hrtimer *ttimer = (void *)data; 567 struct tasklet_hrtimer *ttimer = (void *)data;
568 enum hrtimer_restart restart; 568 enum hrtimer_restart restart;
569 569
570 restart = ttimer->function(&ttimer->timer); 570 restart = ttimer->function(&ttimer->timer);
571 if (restart != HRTIMER_NORESTART) 571 if (restart != HRTIMER_NORESTART)
572 hrtimer_restart(&ttimer->timer); 572 hrtimer_restart(&ttimer->timer);
573 } 573 }
574 574
575 /** 575 /**
576 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks 576 * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
577 * @ttimer: tasklet_hrtimer which is initialized 577 * @ttimer: tasklet_hrtimer which is initialized
578 * @function: hrtimer callback function which gets called from softirq context 578 * @function: hrtimer callback function which gets called from softirq context
579 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) 579 * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
580 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) 580 * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
581 */ 581 */
582 void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, 582 void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
583 enum hrtimer_restart (*function)(struct hrtimer *), 583 enum hrtimer_restart (*function)(struct hrtimer *),
584 clockid_t which_clock, enum hrtimer_mode mode) 584 clockid_t which_clock, enum hrtimer_mode mode)
585 { 585 {
586 hrtimer_init(&ttimer->timer, which_clock, mode); 586 hrtimer_init(&ttimer->timer, which_clock, mode);
587 ttimer->timer.function = __hrtimer_tasklet_trampoline; 587 ttimer->timer.function = __hrtimer_tasklet_trampoline;
588 tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline, 588 tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline,
589 (unsigned long)ttimer); 589 (unsigned long)ttimer);
590 ttimer->function = function; 590 ttimer->function = function;
591 } 591 }
592 EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); 592 EXPORT_SYMBOL_GPL(tasklet_hrtimer_init);
593 593
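A short usage sketch of the tasklet_hrtimer helpers above follows. It is not part of this file: the driver names are invented, and the start call uses tasklet_hrtimer_start(), the companion helper declared alongside tasklet_hrtimer_init() in <linux/interrupt.h>.

    /* Hypothetical driver snippet, for illustration only. */
    #include <linux/interrupt.h>
    #include <linux/hrtimer.h>
    #include <linux/ktime.h>

    static struct tasklet_hrtimer my_poll_timer;

    /* Invoked from softirq context via the trampolines shown above. */
    static enum hrtimer_restart my_poll(struct hrtimer *timer)
    {
            /* ... poll the device, complete finished work ... */
            return HRTIMER_NORESTART;
    }

    static void my_arm_poll_timer(void)
    {
            tasklet_hrtimer_init(&my_poll_timer, my_poll,
                                 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
            tasklet_hrtimer_start(&my_poll_timer, ktime_set(0, 10 * NSEC_PER_MSEC),
                                  HRTIMER_MODE_REL);
    }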
594 /* 594 /*
595 * Remote softirq bits 595 * Remote softirq bits
596 */ 596 */
597 597
598 DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); 598 DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
599 EXPORT_PER_CPU_SYMBOL(softirq_work_list); 599 EXPORT_PER_CPU_SYMBOL(softirq_work_list);
600 600
601 static void __local_trigger(struct call_single_data *cp, int softirq) 601 static void __local_trigger(struct call_single_data *cp, int softirq)
602 { 602 {
603 struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); 603 struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]);
604 604
605 list_add_tail(&cp->list, head); 605 list_add_tail(&cp->list, head);
606 606
607 /* Trigger the softirq only if the list was previously empty. */ 607 /* Trigger the softirq only if the list was previously empty. */
608 if (head->next == &cp->list) 608 if (head->next == &cp->list)
609 raise_softirq_irqoff(softirq); 609 raise_softirq_irqoff(softirq);
610 } 610 }
611 611
612 #ifdef CONFIG_USE_GENERIC_SMP_HELPERS 612 #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
613 static void remote_softirq_receive(void *data) 613 static void remote_softirq_receive(void *data)
614 { 614 {
615 struct call_single_data *cp = data; 615 struct call_single_data *cp = data;
616 unsigned long flags; 616 unsigned long flags;
617 int softirq; 617 int softirq;
618 618
619 softirq = cp->priv; 619 softirq = cp->priv;
620 620
621 local_irq_save(flags); 621 local_irq_save(flags);
622 __local_trigger(cp, softirq); 622 __local_trigger(cp, softirq);
623 local_irq_restore(flags); 623 local_irq_restore(flags);
624 } 624 }
625 625
626 static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) 626 static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
627 { 627 {
628 if (cpu_online(cpu)) { 628 if (cpu_online(cpu)) {
629 cp->func = remote_softirq_receive; 629 cp->func = remote_softirq_receive;
630 cp->info = cp; 630 cp->info = cp;
631 cp->flags = 0; 631 cp->flags = 0;
632 cp->priv = softirq; 632 cp->priv = softirq;
633 633
634 __smp_call_function_single(cpu, cp, 0); 634 __smp_call_function_single(cpu, cp, 0);
635 return 0; 635 return 0;
636 } 636 }
637 return 1; 637 return 1;
638 } 638 }
639 #else /* CONFIG_USE_GENERIC_SMP_HELPERS */ 639 #else /* CONFIG_USE_GENERIC_SMP_HELPERS */
640 static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) 640 static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
641 { 641 {
642 return 1; 642 return 1;
643 } 643 }
644 #endif 644 #endif
645 645
646 /** 646 /**
647 * __send_remote_softirq - try to schedule softirq work on a remote cpu 647 * __send_remote_softirq - try to schedule softirq work on a remote cpu
648 * @cp: private SMP call function data area 648 * @cp: private SMP call function data area
649 * @cpu: the remote cpu 649 * @cpu: the remote cpu
650 * @this_cpu: the currently executing cpu 650 * @this_cpu: the currently executing cpu
651 * @softirq: the softirq for the work 651 * @softirq: the softirq for the work
652 * 652 *
653 * Attempt to schedule softirq work on a remote cpu. If this cannot be 653 * Attempt to schedule softirq work on a remote cpu. If this cannot be
654 * done, the work is instead queued up on the local cpu. 654 * done, the work is instead queued up on the local cpu.
655 * 655 *
656 * Interrupts must be disabled. 656 * Interrupts must be disabled.
657 */ 657 */
658 void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) 658 void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq)
659 { 659 {
660 if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) 660 if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq))
661 __local_trigger(cp, softirq); 661 __local_trigger(cp, softirq);
662 } 662 }
663 EXPORT_SYMBOL(__send_remote_softirq); 663 EXPORT_SYMBOL(__send_remote_softirq);
664 664
665 /** 665 /**
666 * send_remote_softirq - try to schedule softirq work on a remote cpu 666 * send_remote_softirq - try to schedule softirq work on a remote cpu
667 * @cp: private SMP call function data area 667 * @cp: private SMP call function data area
668 * @cpu: the remote cpu 668 * @cpu: the remote cpu
669 * @softirq: the softirq for the work 669 * @softirq: the softirq for the work
670 * 670 *
671 * Like __send_remote_softirq except that disabling interrupts and 671 * Like __send_remote_softirq except that disabling interrupts and
672 * computing the current cpu is done for the caller. 672 * computing the current cpu is done for the caller.
673 */ 673 */
674 void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) 674 void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
675 { 675 {
676 unsigned long flags; 676 unsigned long flags;
677 int this_cpu; 677 int this_cpu;
678 678
679 local_irq_save(flags); 679 local_irq_save(flags);
680 this_cpu = smp_processor_id(); 680 this_cpu = smp_processor_id();
681 __send_remote_softirq(cp, cpu, this_cpu, softirq); 681 __send_remote_softirq(cp, cpu, this_cpu, softirq);
682 local_irq_restore(flags); 682 local_irq_restore(flags);
683 } 683 }
684 EXPORT_SYMBOL(send_remote_softirq); 684 EXPORT_SYMBOL(send_remote_softirq);
685 685
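For the remote softirq API documented above, a minimal caller-side sketch is shown below. It is an assumption-laden illustration, not taken from an in-tree user: the structure and function names are invented, the call_single_data must stay valid until the target CPU has run the softirq, and the chosen vector's handler is expected to drain its per-CPU softirq_work_list[] and container_of() back to the enclosing object.

    /* Hypothetical illustration, not part of this file. */
    #include <linux/interrupt.h>
    #include <linux/smp.h>

    struct my_remote_work {
            struct call_single_data csd;  /* csd.list is queued by __local_trigger() */
            int payload;
    };

    static void my_queue_work_on(int cpu, int softirq_nr, struct my_remote_work *w)
    {
            /* Queues on @cpu if it is online, otherwise falls back to this CPU. */
            send_remote_softirq(&w->csd, cpu, softirq_nr);
    }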
686 static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, 686 static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self,
687 unsigned long action, void *hcpu) 687 unsigned long action, void *hcpu)
688 { 688 {
689 /* 689 /*
690 * If a CPU goes away, splice its entries to the current CPU 690 * If a CPU goes away, splice its entries to the current CPU
691 * and trigger a run of the softirq 691 * and trigger a run of the softirq
692 */ 692 */
693 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 693 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
694 int cpu = (unsigned long) hcpu; 694 int cpu = (unsigned long) hcpu;
695 int i; 695 int i;
696 696
697 local_irq_disable(); 697 local_irq_disable();
698 for (i = 0; i < NR_SOFTIRQS; i++) { 698 for (i = 0; i < NR_SOFTIRQS; i++) {
699 struct list_head *head = &per_cpu(softirq_work_list[i], cpu); 699 struct list_head *head = &per_cpu(softirq_work_list[i], cpu);
700 struct list_head *local_head; 700 struct list_head *local_head;
701 701
702 if (list_empty(head)) 702 if (list_empty(head))
703 continue; 703 continue;
704 704
705 local_head = &__get_cpu_var(softirq_work_list[i]); 705 local_head = &__get_cpu_var(softirq_work_list[i]);
706 list_splice_init(head, local_head); 706 list_splice_init(head, local_head);
707 raise_softirq_irqoff(i); 707 raise_softirq_irqoff(i);
708 } 708 }
709 local_irq_enable(); 709 local_irq_enable();
710 } 710 }
711 711
712 return NOTIFY_OK; 712 return NOTIFY_OK;
713 } 713 }
714 714
715 static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { 715 static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = {
716 .notifier_call = remote_softirq_cpu_notify, 716 .notifier_call = remote_softirq_cpu_notify,
717 }; 717 };
718 718
719 void __init softirq_init(void) 719 void __init softirq_init(void)
720 { 720 {
721 int cpu; 721 int cpu;
722 722
723 for_each_possible_cpu(cpu) { 723 for_each_possible_cpu(cpu) {
724 int i; 724 int i;
725 725
726 per_cpu(tasklet_vec, cpu).tail = 726 per_cpu(tasklet_vec, cpu).tail =
727 &per_cpu(tasklet_vec, cpu).head; 727 &per_cpu(tasklet_vec, cpu).head;
728 per_cpu(tasklet_hi_vec, cpu).tail = 728 per_cpu(tasklet_hi_vec, cpu).tail =
729 &per_cpu(tasklet_hi_vec, cpu).head; 729 &per_cpu(tasklet_hi_vec, cpu).head;
730 for (i = 0; i < NR_SOFTIRQS; i++) 730 for (i = 0; i < NR_SOFTIRQS; i++)
731 INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu)); 731 INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));
732 } 732 }
733 733
734 register_hotcpu_notifier(&remote_softirq_cpu_notifier); 734 register_hotcpu_notifier(&remote_softirq_cpu_notifier);
735 735
736 open_softirq(TASKLET_SOFTIRQ, tasklet_action); 736 open_softirq(TASKLET_SOFTIRQ, tasklet_action);
737 open_softirq(HI_SOFTIRQ, tasklet_hi_action); 737 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
738 } 738 }
739 739
740 static int run_ksoftirqd(void * __bind_cpu) 740 static int run_ksoftirqd(void * __bind_cpu)
741 { 741 {
742 set_current_state(TASK_INTERRUPTIBLE); 742 set_current_state(TASK_INTERRUPTIBLE);
743 743
744 while (!kthread_should_stop()) { 744 while (!kthread_should_stop()) {
745 preempt_disable(); 745 preempt_disable();
746 if (!local_softirq_pending()) { 746 if (!local_softirq_pending()) {
747 schedule_preempt_disabled(); 747 schedule_preempt_disabled();
748 } 748 }
749 749
750 __set_current_state(TASK_RUNNING); 750 __set_current_state(TASK_RUNNING);
751 751
752 while (local_softirq_pending()) { 752 while (local_softirq_pending()) {
753 /* Preempt disable stops cpu going offline. 753 /* Preempt disable stops cpu going offline.
754 If already offline, we'll be on wrong CPU: 754 If already offline, we'll be on wrong CPU:
755 don't process */ 755 don't process */
756 if (cpu_is_offline((long)__bind_cpu)) 756 if (cpu_is_offline((long)__bind_cpu))
757 goto wait_to_die; 757 goto wait_to_die;
758 local_irq_disable(); 758 local_irq_disable();
759 if (local_softirq_pending()) 759 if (local_softirq_pending())
760 __do_softirq(); 760 __do_softirq();
761 local_irq_enable(); 761 local_irq_enable();
762 preempt_enable_no_resched(); 762 sched_preempt_enable_no_resched();
763 cond_resched(); 763 cond_resched();
764 preempt_disable(); 764 preempt_disable();
765 rcu_note_context_switch((long)__bind_cpu); 765 rcu_note_context_switch((long)__bind_cpu);
766 } 766 }
767 preempt_enable(); 767 preempt_enable();
768 set_current_state(TASK_INTERRUPTIBLE); 768 set_current_state(TASK_INTERRUPTIBLE);
769 } 769 }
770 __set_current_state(TASK_RUNNING); 770 __set_current_state(TASK_RUNNING);
771 return 0; 771 return 0;
772 772
773 wait_to_die: 773 wait_to_die:
774 preempt_enable(); 774 preempt_enable();
775 /* Wait for kthread_stop */ 775 /* Wait for kthread_stop */
776 set_current_state(TASK_INTERRUPTIBLE); 776 set_current_state(TASK_INTERRUPTIBLE);
777 while (!kthread_should_stop()) { 777 while (!kthread_should_stop()) {
778 schedule(); 778 schedule();
779 set_current_state(TASK_INTERRUPTIBLE); 779 set_current_state(TASK_INTERRUPTIBLE);
780 } 780 }
781 __set_current_state(TASK_RUNNING); 781 __set_current_state(TASK_RUNNING);
782 return 0; 782 return 0;
783 } 783 }
784 784
785 #ifdef CONFIG_HOTPLUG_CPU 785 #ifdef CONFIG_HOTPLUG_CPU
786 /* 786 /*
787 * tasklet_kill_immediate is called to remove a tasklet which can already be 787 * tasklet_kill_immediate is called to remove a tasklet which can already be
788 * scheduled for execution on @cpu. 788 * scheduled for execution on @cpu.
789 * 789 *
790 * Unlike tasklet_kill, this function removes the tasklet 790 * Unlike tasklet_kill, this function removes the tasklet
791 * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state. 791 * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state.
792 * 792 *
793 * When this function is called, @cpu must be in the CPU_DEAD state. 793 * When this function is called, @cpu must be in the CPU_DEAD state.
794 */ 794 */
795 void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) 795 void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu)
796 { 796 {
797 struct tasklet_struct **i; 797 struct tasklet_struct **i;
798 798
799 BUG_ON(cpu_online(cpu)); 799 BUG_ON(cpu_online(cpu));
800 BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state)); 800 BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state));
801 801
802 if (!test_bit(TASKLET_STATE_SCHED, &t->state)) 802 if (!test_bit(TASKLET_STATE_SCHED, &t->state))
803 return; 803 return;
804 804
805 /* CPU is dead, so no lock needed. */ 805 /* CPU is dead, so no lock needed. */
806 for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) { 806 for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) {
807 if (*i == t) { 807 if (*i == t) {
808 *i = t->next; 808 *i = t->next;
809 /* If this was the tail element, move the tail ptr */ 809 /* If this was the tail element, move the tail ptr */
810 if (*i == NULL) 810 if (*i == NULL)
811 per_cpu(tasklet_vec, cpu).tail = i; 811 per_cpu(tasklet_vec, cpu).tail = i;
812 return; 812 return;
813 } 813 }
814 } 814 }
815 BUG(); 815 BUG();
816 } 816 }
817 817
818 static void takeover_tasklets(unsigned int cpu) 818 static void takeover_tasklets(unsigned int cpu)
819 { 819 {
820 /* CPU is dead, so no lock needed. */ 820 /* CPU is dead, so no lock needed. */
821 local_irq_disable(); 821 local_irq_disable();
822 822
823 /* Find end, append list for that CPU. */ 823 /* Find end, append list for that CPU. */
824 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { 824 if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) {
825 *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head; 825 *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head;
826 this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail); 826 this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail);
827 per_cpu(tasklet_vec, cpu).head = NULL; 827 per_cpu(tasklet_vec, cpu).head = NULL;
828 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; 828 per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head;
829 } 829 }
830 raise_softirq_irqoff(TASKLET_SOFTIRQ); 830 raise_softirq_irqoff(TASKLET_SOFTIRQ);
831 831
832 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { 832 if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) {
833 *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head; 833 *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head;
834 __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail); 834 __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail);
835 per_cpu(tasklet_hi_vec, cpu).head = NULL; 835 per_cpu(tasklet_hi_vec, cpu).head = NULL;
836 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; 836 per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head;
837 } 837 }
838 raise_softirq_irqoff(HI_SOFTIRQ); 838 raise_softirq_irqoff(HI_SOFTIRQ);
839 839
840 local_irq_enable(); 840 local_irq_enable();
841 } 841 }
842 #endif /* CONFIG_HOTPLUG_CPU */ 842 #endif /* CONFIG_HOTPLUG_CPU */
843 843
844 static int __cpuinit cpu_callback(struct notifier_block *nfb, 844 static int __cpuinit cpu_callback(struct notifier_block *nfb,
845 unsigned long action, 845 unsigned long action,
846 void *hcpu) 846 void *hcpu)
847 { 847 {
848 int hotcpu = (unsigned long)hcpu; 848 int hotcpu = (unsigned long)hcpu;
849 struct task_struct *p; 849 struct task_struct *p;
850 850
851 switch (action) { 851 switch (action) {
852 case CPU_UP_PREPARE: 852 case CPU_UP_PREPARE:
853 case CPU_UP_PREPARE_FROZEN: 853 case CPU_UP_PREPARE_FROZEN:
854 p = kthread_create_on_node(run_ksoftirqd, 854 p = kthread_create_on_node(run_ksoftirqd,
855 hcpu, 855 hcpu,
856 cpu_to_node(hotcpu), 856 cpu_to_node(hotcpu),
857 "ksoftirqd/%d", hotcpu); 857 "ksoftirqd/%d", hotcpu);
858 if (IS_ERR(p)) { 858 if (IS_ERR(p)) {
859 printk("ksoftirqd for %i failed\n", hotcpu); 859 printk("ksoftirqd for %i failed\n", hotcpu);
860 return notifier_from_errno(PTR_ERR(p)); 860 return notifier_from_errno(PTR_ERR(p));
861 } 861 }
862 kthread_bind(p, hotcpu); 862 kthread_bind(p, hotcpu);
863 per_cpu(ksoftirqd, hotcpu) = p; 863 per_cpu(ksoftirqd, hotcpu) = p;
864 break; 864 break;
865 case CPU_ONLINE: 865 case CPU_ONLINE:
866 case CPU_ONLINE_FROZEN: 866 case CPU_ONLINE_FROZEN:
867 wake_up_process(per_cpu(ksoftirqd, hotcpu)); 867 wake_up_process(per_cpu(ksoftirqd, hotcpu));
868 break; 868 break;
869 #ifdef CONFIG_HOTPLUG_CPU 869 #ifdef CONFIG_HOTPLUG_CPU
870 case CPU_UP_CANCELED: 870 case CPU_UP_CANCELED:
871 case CPU_UP_CANCELED_FROZEN: 871 case CPU_UP_CANCELED_FROZEN:
872 if (!per_cpu(ksoftirqd, hotcpu)) 872 if (!per_cpu(ksoftirqd, hotcpu))
873 break; 873 break;
874 /* Unbind so it can run. Fall thru. */ 874 /* Unbind so it can run. Fall thru. */
875 kthread_bind(per_cpu(ksoftirqd, hotcpu), 875 kthread_bind(per_cpu(ksoftirqd, hotcpu),
876 cpumask_any(cpu_online_mask)); 876 cpumask_any(cpu_online_mask));
877 case CPU_DEAD: 877 case CPU_DEAD:
878 case CPU_DEAD_FROZEN: { 878 case CPU_DEAD_FROZEN: {
879 static const struct sched_param param = { 879 static const struct sched_param param = {
880 .sched_priority = MAX_RT_PRIO-1 880 .sched_priority = MAX_RT_PRIO-1
881 }; 881 };
882 882
883 p = per_cpu(ksoftirqd, hotcpu); 883 p = per_cpu(ksoftirqd, hotcpu);
884 per_cpu(ksoftirqd, hotcpu) = NULL; 884 per_cpu(ksoftirqd, hotcpu) = NULL;
885 sched_setscheduler_nocheck(p, SCHED_FIFO, &param); 885 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
886 kthread_stop(p); 886 kthread_stop(p);
887 takeover_tasklets(hotcpu); 887 takeover_tasklets(hotcpu);
888 break; 888 break;
889 } 889 }
890 #endif /* CONFIG_HOTPLUG_CPU */ 890 #endif /* CONFIG_HOTPLUG_CPU */
891 } 891 }
892 return NOTIFY_OK; 892 return NOTIFY_OK;
893 } 893 }
894 894
895 static struct notifier_block __cpuinitdata cpu_nfb = { 895 static struct notifier_block __cpuinitdata cpu_nfb = {
896 .notifier_call = cpu_callback 896 .notifier_call = cpu_callback
897 }; 897 };
898 898
899 static __init int spawn_ksoftirqd(void) 899 static __init int spawn_ksoftirqd(void)
900 { 900 {
901 void *cpu = (void *)(long)smp_processor_id(); 901 void *cpu = (void *)(long)smp_processor_id();
902 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 902 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
903 903
904 BUG_ON(err != NOTIFY_OK); 904 BUG_ON(err != NOTIFY_OK);
905 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 905 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
906 register_cpu_notifier(&cpu_nfb); 906 register_cpu_notifier(&cpu_nfb);
907 return 0; 907 return 0;
908 } 908 }
909 early_initcall(spawn_ksoftirqd); 909 early_initcall(spawn_ksoftirqd);
910 910
911 /* 911 /*
912 * [ These __weak aliases are kept in a separate compilation unit, so that 912 * [ These __weak aliases are kept in a separate compilation unit, so that
913 * GCC does not inline them incorrectly. ] 913 * GCC does not inline them incorrectly. ]
914 */ 914 */
915 915
916 int __init __weak early_irq_init(void) 916 int __init __weak early_irq_init(void)
917 { 917 {
918 return 0; 918 return 0;
919 } 919 }
920 920
921 #ifdef CONFIG_GENERIC_HARDIRQS 921 #ifdef CONFIG_GENERIC_HARDIRQS
922 int __init __weak arch_probe_nr_irqs(void) 922 int __init __weak arch_probe_nr_irqs(void)
923 { 923 {
924 return NR_IRQS_LEGACY; 924 return NR_IRQS_LEGACY;
925 } 925 }
926 926
927 int __init __weak arch_early_irq_init(void) 927 int __init __weak arch_early_irq_init(void)
928 { 928 {
929 return 0; 929 return 0;
930 } 930 }
931 #endif 931 #endif
932 932