Commit 32439700fe1c0fc3c2d3f2aedd3ad6707c88b8ba

Authored by Linus Torvalds

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Ingo Molnar:
 "Various fixlets, mostly related to the (root-only) SCHED_DEADLINE
  policy, but also a hotplug bug fix and a fix for a NR_CPUS related
  overallocation bug causing a suspend/resume regression"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched: Fix hotplug vs. set_cpus_allowed_ptr()
  sched/cpupri: Replace NR_CPUS arrays
  sched/deadline: Replace NR_CPUS arrays
  sched/deadline: Restrict user params max value to 2^63 ns
  sched/deadline: Change sched_getparam() behaviour vs SCHED_DEADLINE
  sched: Disallow sched_attr::sched_policy < 0
  sched: Make sched_setattr() correctly return -EFBIG
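
The SCHED_DEADLINE entries above concern validation of the user-supplied
sched_attr parameters. As a point of reference, here is a minimal userspace
sketch (not part of this commit) that sets SCHED_DEADLINE parameters via
sched_setattr(2); the syscall-number fallback and the sched_attr layout are
assumptions based on the uapi headers of this kernel series, so treat it as
illustrative only:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE	6
#endif

#ifndef SYS_sched_setattr
#define SYS_sched_setattr 314	/* x86_64 value; an assumption, check your arch */
#endif

/* Layout mirrors the kernel's struct sched_attr of this era. */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;		/* SCHED_NORMAL, SCHED_BATCH */
	uint32_t sched_priority;	/* SCHED_FIFO, SCHED_RR */
	uint64_t sched_runtime;		/* SCHED_DEADLINE, all in ns */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy   = SCHED_DEADLINE;
	attr.sched_runtime  =  10 * 1000 * 1000;	/* 10 ms  */
	attr.sched_deadline =  30 * 1000 * 1000;	/* 30 ms  */
	attr.sched_period   = 100 * 1000 * 1000;	/* 100 ms */

	/* Per the shortlog, this merge tightens validation of these fields:
	 * a negative sched_policy is rejected and the runtime/deadline/period
	 * values are limited to 2^63 ns. */
	if (syscall(SYS_sched_setattr, 0, &attr, 0))
		perror("sched_setattr");
	return 0;
}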

Showing 6 changed files (inline diff view). The excerpt below covers kernel/cpu.c and the beginning of kernel/sched/core.c.

/* CPU control.
 * (C) 2001, 2002, 2003, 2004 Rusty Russell
 *
 * This code is licenced under the GPL.
 */
#include <linux/proc_fs.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/sched.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/bug.h>
#include <linux/kthread.h>
#include <linux/stop_machine.h>
#include <linux/mutex.h>
#include <linux/gfp.h>
#include <linux/suspend.h>
#include <linux/lockdep.h>

#include "smpboot.h"

#ifdef CONFIG_SMP
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);

/*
 * The following two APIs (cpu_maps_update_begin/done) must be used when
 * attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
 * The APIs cpu_notifier_register_begin/done() must be used to protect CPU
 * hotplug callback (un)registration performed using __register_cpu_notifier()
 * or __unregister_cpu_notifier().
 */
void cpu_maps_update_begin(void)
{
	mutex_lock(&cpu_add_remove_lock);
}
EXPORT_SYMBOL(cpu_notifier_register_begin);

void cpu_maps_update_done(void)
{
	mutex_unlock(&cpu_add_remove_lock);
}
EXPORT_SYMBOL(cpu_notifier_register_done);

static RAW_NOTIFIER_HEAD(cpu_chain);

/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
 * Should always be manipulated under cpu_add_remove_lock
 */
static int cpu_hotplug_disabled;

#ifdef CONFIG_HOTPLUG_CPU

static struct {
	struct task_struct *active_writer;
	struct mutex lock; /* Synchronizes accesses to refcount, */
	/*
	 * Also blocks the new readers during
	 * an ongoing cpu hotplug operation.
	 */
	int refcount;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct lockdep_map dep_map;
#endif
} cpu_hotplug = {
	.active_writer = NULL,
	.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
	.refcount = 0,
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	.dep_map = {.name = "cpu_hotplug.lock" },
#endif
};

/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
#define cpuhp_lock_acquire()      lock_map_acquire(&cpu_hotplug.dep_map)
#define cpuhp_lock_release()      lock_map_release(&cpu_hotplug.dep_map)

void get_online_cpus(void)
{
	might_sleep();
	if (cpu_hotplug.active_writer == current)
		return;
	cpuhp_lock_acquire_read();
	mutex_lock(&cpu_hotplug.lock);
	cpu_hotplug.refcount++;
	mutex_unlock(&cpu_hotplug.lock);

}
EXPORT_SYMBOL_GPL(get_online_cpus);

void put_online_cpus(void)
{
	if (cpu_hotplug.active_writer == current)
		return;
	mutex_lock(&cpu_hotplug.lock);

	if (WARN_ON(!cpu_hotplug.refcount))
		cpu_hotplug.refcount++; /* try to fix things up */

	if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
		wake_up_process(cpu_hotplug.active_writer);
	mutex_unlock(&cpu_hotplug.lock);
	cpuhp_lock_release();

}
EXPORT_SYMBOL_GPL(put_online_cpus);

/*
 * This ensures that the hotplug operation can begin only when the
 * refcount goes to zero.
 *
 * Note that during a cpu-hotplug operation, the new readers, if any,
 * will be blocked by the cpu_hotplug.lock
 *
 * Since cpu_hotplug_begin() is always called after invoking
 * cpu_maps_update_begin(), we can be sure that only one writer is active.
 *
 * Note that theoretically, there is a possibility of a livelock:
 * - Refcount goes to zero, last reader wakes up the sleeping
 *   writer.
 * - Last reader unlocks the cpu_hotplug.lock.
 * - A new reader arrives at this moment, bumps up the refcount.
 * - The writer acquires the cpu_hotplug.lock finds the refcount
 *   non zero and goes to sleep again.
 *
 * However, this is very difficult to achieve in practice since
 * get_online_cpus() not an api which is called all that often.
 *
 */
void cpu_hotplug_begin(void)
{
	cpu_hotplug.active_writer = current;

	cpuhp_lock_acquire();
	for (;;) {
		mutex_lock(&cpu_hotplug.lock);
		if (likely(!cpu_hotplug.refcount))
			break;
		__set_current_state(TASK_UNINTERRUPTIBLE);
		mutex_unlock(&cpu_hotplug.lock);
		schedule();
	}
}

void cpu_hotplug_done(void)
{
	cpu_hotplug.active_writer = NULL;
	mutex_unlock(&cpu_hotplug.lock);
	cpuhp_lock_release();
}

/*
 * Wait for currently running CPU hotplug operations to complete (if any) and
 * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
 * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the
 * hotplug path before performing hotplug operations. So acquiring that lock
 * guarantees mutual exclusion from any currently running hotplug operations.
 */
void cpu_hotplug_disable(void)
{
	cpu_maps_update_begin();
	cpu_hotplug_disabled = 1;
	cpu_maps_update_done();
}

void cpu_hotplug_enable(void)
{
	cpu_maps_update_begin();
	cpu_hotplug_disabled = 0;
	cpu_maps_update_done();
}

#endif	/* CONFIG_HOTPLUG_CPU */

/* Need to know about CPUs going up/down? */
int __ref register_cpu_notifier(struct notifier_block *nb)
{
	int ret;
	cpu_maps_update_begin();
	ret = raw_notifier_chain_register(&cpu_chain, nb);
	cpu_maps_update_done();
	return ret;
}

int __ref __register_cpu_notifier(struct notifier_block *nb)
{
	return raw_notifier_chain_register(&cpu_chain, nb);
}

static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
			int *nr_calls)
{
	int ret;

	ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call,
					nr_calls);

	return notifier_to_errno(ret);
}

static int cpu_notify(unsigned long val, void *v)
{
	return __cpu_notify(val, v, -1, NULL);
}

#ifdef CONFIG_HOTPLUG_CPU

static void cpu_notify_nofail(unsigned long val, void *v)
{
	BUG_ON(cpu_notify(val, v));
}
EXPORT_SYMBOL(register_cpu_notifier);
EXPORT_SYMBOL(__register_cpu_notifier);

void __ref unregister_cpu_notifier(struct notifier_block *nb)
{
	cpu_maps_update_begin();
	raw_notifier_chain_unregister(&cpu_chain, nb);
	cpu_maps_update_done();
}
EXPORT_SYMBOL(unregister_cpu_notifier);

void __ref __unregister_cpu_notifier(struct notifier_block *nb)
{
	raw_notifier_chain_unregister(&cpu_chain, nb);
}
EXPORT_SYMBOL(__unregister_cpu_notifier);

/**
 * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
 * @cpu: a CPU id
 *
 * This function walks all processes, finds a valid mm struct for each one and
 * then clears a corresponding bit in mm's cpumask. While this all sounds
 * trivial, there are various non-obvious corner cases, which this function
 * tries to solve in a safe manner.
 *
 * Also note that the function uses a somewhat relaxed locking scheme, so it may
 * be called only for an already offlined CPU.
 */
void clear_tasks_mm_cpumask(int cpu)
{
	struct task_struct *p;

	/*
	 * This function is called after the cpu is taken down and marked
	 * offline, so its not like new tasks will ever get this cpu set in
	 * their mm mask. -- Peter Zijlstra
	 * Thus, we may use rcu_read_lock() here, instead of grabbing
	 * full-fledged tasklist_lock.
	 */
	WARN_ON(cpu_online(cpu));
	rcu_read_lock();
	for_each_process(p) {
		struct task_struct *t;

		/*
		 * Main thread might exit, but other threads may still have
		 * a valid mm. Find one.
		 */
		t = find_lock_task_mm(p);
		if (!t)
			continue;
		cpumask_clear_cpu(cpu, mm_cpumask(t->mm));
		task_unlock(t);
	}
	rcu_read_unlock();
}

static inline void check_for_tasks(int cpu)
{
	struct task_struct *p;
	cputime_t utime, stime;

	write_lock_irq(&tasklist_lock);
	for_each_process(p) {
		task_cputime(p, &utime, &stime);
		if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
		    (utime || stime))
			printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
				"(state = %ld, flags = %x)\n",
				p->comm, task_pid_nr(p), cpu,
				p->state, p->flags);
	}
	write_unlock_irq(&tasklist_lock);
}

struct take_cpu_down_param {
	unsigned long mod;
	void *hcpu;
};

/* Take this CPU down. */
static int __ref take_cpu_down(void *_param)
{
	struct take_cpu_down_param *param = _param;
	int err;

	/* Ensure this CPU doesn't handle any more interrupts. */
	err = __cpu_disable();
	if (err < 0)
		return err;

	cpu_notify(CPU_DYING | param->mod, param->hcpu);
	/* Park the stopper thread */
	kthread_park(current);
	return 0;
}

/* Requires cpu_add_remove_lock to be held */
static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
{
	int err, nr_calls = 0;
	void *hcpu = (void *)(long)cpu;
	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
	struct take_cpu_down_param tcd_param = {
		.mod = mod,
		.hcpu = hcpu,
	};

	if (num_online_cpus() == 1)
		return -EBUSY;

	if (!cpu_online(cpu))
		return -EINVAL;

	cpu_hotplug_begin();

	err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
	if (err) {
		nr_calls--;
		__cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
		printk("%s: attempt to take down CPU %u failed\n",
				__func__, cpu);
		goto out_release;
	}

	/*
	 * By now we've cleared cpu_active_mask, wait for all preempt-disabled
	 * and RCU users of this state to go away such that all new such users
	 * will observe it.
	 *
	 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
	 * not imply sync_sched(), so explicitly call both.
	 *
	 * Do sync before park smpboot threads to take care the rcu boost case.
	 */
#ifdef CONFIG_PREEMPT
	synchronize_sched();
#endif
	synchronize_rcu();

	smpboot_park_threads(cpu);

	/*
	 * So now all preempt/rcu users must observe !cpu_active().
	 */

	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
	if (err) {
		/* CPU didn't die: tell everyone. Can't complain. */
		smpboot_unpark_threads(cpu);
		cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
		goto out_release;
	}
	BUG_ON(cpu_online(cpu));

	/*
	 * The migration_call() CPU_DYING callback will have removed all
	 * runnable tasks from the cpu, there's only the idle task left now
	 * that the migration thread is done doing the stop_machine thing.
	 *
	 * Wait for the stop thread to go away.
	 */
	while (!idle_cpu(cpu))
		cpu_relax();

	/* This actually kills the CPU. */
	__cpu_die(cpu);

	/* CPU is completely dead: tell everyone. Too late to complain. */
	cpu_notify_nofail(CPU_DEAD | mod, hcpu);

	check_for_tasks(cpu);

out_release:
	cpu_hotplug_done();
	if (!err)
		cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
	return err;
}

int __ref cpu_down(unsigned int cpu)
{
	int err;

	cpu_maps_update_begin();

	if (cpu_hotplug_disabled) {
		err = -EBUSY;
		goto out;
	}

	err = _cpu_down(cpu, 0);

out:
	cpu_maps_update_done();
	return err;
}
EXPORT_SYMBOL(cpu_down);
#endif /*CONFIG_HOTPLUG_CPU*/

/* Requires cpu_add_remove_lock to be held */
static int _cpu_up(unsigned int cpu, int tasks_frozen)
{
	int ret, nr_calls = 0;
	void *hcpu = (void *)(long)cpu;
	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
	struct task_struct *idle;

	cpu_hotplug_begin();

	if (cpu_online(cpu) || !cpu_present(cpu)) {
		ret = -EINVAL;
		goto out;
	}

	idle = idle_thread_get(cpu);
	if (IS_ERR(idle)) {
		ret = PTR_ERR(idle);
		goto out;
	}

	ret = smpboot_create_threads(cpu);
	if (ret)
		goto out;

	ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
	if (ret) {
		nr_calls--;
		printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n",
				__func__, cpu);
		goto out_notify;
	}

	/* Arch-specific enabling code. */
	ret = __cpu_up(cpu, idle);
	if (ret != 0)
		goto out_notify;
	BUG_ON(!cpu_online(cpu));

	/* Wake the per cpu threads */
	smpboot_unpark_threads(cpu);

	/* Now call notifier in preparation. */
	cpu_notify(CPU_ONLINE | mod, hcpu);

out_notify:
	if (ret != 0)
		__cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
out:
	cpu_hotplug_done();

	return ret;
}

int cpu_up(unsigned int cpu)
{
	int err = 0;

	if (!cpu_possible(cpu)) {
		printk(KERN_ERR "can't online cpu %d because it is not "
			"configured as may-hotadd at boot time\n", cpu);
#if defined(CONFIG_IA64)
		printk(KERN_ERR "please check additional_cpus= boot "
				"parameter\n");
#endif
		return -EINVAL;
	}

	err = try_online_node(cpu_to_node(cpu));
	if (err)
		return err;

	cpu_maps_update_begin();

	if (cpu_hotplug_disabled) {
		err = -EBUSY;
		goto out;
	}

	err = _cpu_up(cpu, 0);

out:
	cpu_maps_update_done();
	return err;
}
EXPORT_SYMBOL_GPL(cpu_up);

#ifdef CONFIG_PM_SLEEP_SMP
static cpumask_var_t frozen_cpus;

int disable_nonboot_cpus(void)
{
	int cpu, first_cpu, error = 0;

	cpu_maps_update_begin();
	first_cpu = cpumask_first(cpu_online_mask);
	/*
	 * We take down all of the non-boot CPUs in one shot to avoid races
	 * with the userspace trying to use the CPU hotplug at the same time
	 */
	cpumask_clear(frozen_cpus);

	printk("Disabling non-boot CPUs ...\n");
	for_each_online_cpu(cpu) {
		if (cpu == first_cpu)
			continue;
		error = _cpu_down(cpu, 1);
		if (!error)
			cpumask_set_cpu(cpu, frozen_cpus);
		else {
			printk(KERN_ERR "Error taking CPU%d down: %d\n",
				cpu, error);
			break;
		}
	}

	if (!error) {
		BUG_ON(num_online_cpus() > 1);
		/* Make sure the CPUs won't be enabled by someone else */
		cpu_hotplug_disabled = 1;
	} else {
		printk(KERN_ERR "Non-boot CPUs are not disabled\n");
	}
	cpu_maps_update_done();
	return error;
}

void __weak arch_enable_nonboot_cpus_begin(void)
{
}

void __weak arch_enable_nonboot_cpus_end(void)
{
}

void __ref enable_nonboot_cpus(void)
{
	int cpu, error;

	/* Allow everyone to use the CPU hotplug again */
	cpu_maps_update_begin();
	cpu_hotplug_disabled = 0;
	if (cpumask_empty(frozen_cpus))
		goto out;

	printk(KERN_INFO "Enabling non-boot CPUs ...\n");

	arch_enable_nonboot_cpus_begin();

	for_each_cpu(cpu, frozen_cpus) {
		error = _cpu_up(cpu, 1);
		if (!error) {
			printk(KERN_INFO "CPU%d is up\n", cpu);
			continue;
		}
		printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
	}

	arch_enable_nonboot_cpus_end();

	cpumask_clear(frozen_cpus);
out:
	cpu_maps_update_done();
}

static int __init alloc_frozen_cpus(void)
{
	if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
		return -ENOMEM;
	return 0;
}
core_initcall(alloc_frozen_cpus);

/*
 * When callbacks for CPU hotplug notifications are being executed, we must
 * ensure that the state of the system with respect to the tasks being frozen
 * or not, as reported by the notification, remains unchanged *throughout the
 * duration* of the execution of the callbacks.
 * Hence we need to prevent the freezer from racing with regular CPU hotplug.
 *
 * This synchronization is implemented by mutually excluding regular CPU
 * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/
 * Hibernate notifications.
 */
static int
cpu_hotplug_pm_callback(struct notifier_block *nb,
			unsigned long action, void *ptr)
{
	switch (action) {

	case PM_SUSPEND_PREPARE:
	case PM_HIBERNATION_PREPARE:
		cpu_hotplug_disable();
		break;

	case PM_POST_SUSPEND:
	case PM_POST_HIBERNATION:
		cpu_hotplug_enable();
		break;

	default:
		return NOTIFY_DONE;
	}

	return NOTIFY_OK;
}


static int __init cpu_hotplug_pm_sync_init(void)
{
	/*
	 * cpu_hotplug_pm_callback has higher priority than x86
	 * bsp_pm_callback which depends on cpu_hotplug_pm_callback
	 * to disable cpu hotplug to avoid cpu hotplug race.
	 */
	pm_notifier(cpu_hotplug_pm_callback, 0);
	return 0;
}
core_initcall(cpu_hotplug_pm_sync_init);

#endif /* CONFIG_PM_SLEEP_SMP */

/**
 * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
 * @cpu: cpu that just started
 *
 * This function calls the cpu_chain notifiers with CPU_STARTING.
 * It must be called by the arch code on the new cpu, before the new cpu
 * enables interrupts and before the "boot" cpu returns from __cpu_up().
 */
void notify_cpu_starting(unsigned int cpu)
{
	unsigned long val = CPU_STARTING;

#ifdef CONFIG_PM_SLEEP_SMP
	if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
		val = CPU_STARTING_FROZEN;
#endif /* CONFIG_PM_SLEEP_SMP */
	cpu_notify(val, (void *)(long)cpu);
}

#endif /* CONFIG_SMP */

/*
 * cpu_bit_bitmap[] is a special, "compressed" data structure that
 * represents all NR_CPUS bits binary values of 1<<nr.
 *
 * It is used by cpumask_of() to get a constant address to a CPU
 * mask value that has a single bit set only.
 */

/* cpu_bit_bitmap[0] is empty - so we can back into it */
#define MASK_DECLARE_1(x)	[x+1][0] = (1UL << (x))
#define MASK_DECLARE_2(x)	MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
#define MASK_DECLARE_4(x)	MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
#define MASK_DECLARE_8(x)	MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)

const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {

	MASK_DECLARE_8(0),	MASK_DECLARE_8(8),
	MASK_DECLARE_8(16),	MASK_DECLARE_8(24),
#if BITS_PER_LONG > 32
	MASK_DECLARE_8(32),	MASK_DECLARE_8(40),
	MASK_DECLARE_8(48),	MASK_DECLARE_8(56),
#endif
};
EXPORT_SYMBOL_GPL(cpu_bit_bitmap);

const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
EXPORT_SYMBOL(cpu_all_bits);

#ifdef CONFIG_INIT_ALL_POSSIBLE
static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly
	= CPU_BITS_ALL;
#else
static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly;
#endif
const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits);
EXPORT_SYMBOL(cpu_possible_mask);

static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly;
const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits);
EXPORT_SYMBOL(cpu_online_mask);

static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly;
const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits);
EXPORT_SYMBOL(cpu_present_mask);

static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
EXPORT_SYMBOL(cpu_active_mask);

void set_cpu_possible(unsigned int cpu, bool possible)
{
	if (possible)
		cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits));
	else
		cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits));
}

void set_cpu_present(unsigned int cpu, bool present)
{
	if (present)
		cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits));
	else
		cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits));
}

void set_cpu_online(unsigned int cpu, bool online)
{
-	if (online)
+	if (online) {
		cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
+		cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
-	else
+	} else {
		cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
+	}
}

void set_cpu_active(unsigned int cpu, bool active)
{
	if (active)
		cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
	else
		cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits));
}

void init_cpu_present(const struct cpumask *src)
{
	cpumask_copy(to_cpumask(cpu_present_bits), src);
}

void init_cpu_possible(const struct cpumask *src)
{
	cpumask_copy(to_cpumask(cpu_possible_bits), src);
}

void init_cpu_online(const struct cpumask *src)
{
	cpumask_copy(to_cpumask(cpu_online_bits), src);
}
/*
 *  kernel/sched/core.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 *
 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *		make semaphores SMP safe
 *  1998-11-19	Implemented schedule_timeout() and related stuff
 *		by Andrea Arcangeli
 *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
 *		hybrid priority-list and round-robin design with
 *		an array-switch method of distributing timeslices
 *		and per-CPU runqueues.  Cleanups and useful suggestions
 *		by Davide Libenzi, preemptible kernel bits by Robert Love.
 *  2003-09-03	Interactivity tuning by Con Kolivas.
 *  2004-04-02	Scheduler domains code by Nick Piggin
 *  2007-04-15  Work begun on replacing all interactivity tuning with a
 *              fair scheduling design by Con Kolivas.
 *  2007-05-05  Load balancing (smp-nice) and other improvements
 *              by Peter Williams
 *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
 *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
 *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
 *              Thomas Gleixner, Mike Kravetz
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/init_task.h>
#include <linux/binfmts.h>
#include <linux/context_tracking.h>
#include <linux/compiler.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include <asm/mutex.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif

#include "sched.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
	unsigned long delta;
	ktime_t soft, hard, now;

	for (;;) {
		if (hrtimer_active(period_timer))
			break;

		now = hrtimer_cb_get_time(period_timer);
		hrtimer_forward(period_timer, now, period);

		soft = hrtimer_get_softexpires(period_timer);
		hard = hrtimer_get_expires(period_timer);
		delta = ktime_to_ns(ktime_sub(hard, soft));
		__hrtimer_start_range_ns(period_timer, soft, delta,
					 HRTIMER_MODE_ABS_PINNED, 0);
	}
}

DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static void update_rq_clock_task(struct rq *rq, s64 delta);

void update_rq_clock(struct rq *rq)
{
	s64 delta;

	if (rq->skip_clock_update > 0)
		return;

	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
	rq->clock += delta;
	update_rq_clock_task(rq, delta);
}

/*
 * Debugging: various feature bits
 */

#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "features.h"
	0;

#undef SCHED_FEAT

#ifdef CONFIG_SCHED_DEBUG
#define SCHED_FEAT(name, enabled)	\
	#name ,

static const char * const sched_feat_names[] = {
#include "features.h"
};

#undef SCHED_FEAT

static int sched_feat_show(struct seq_file *m, void *v)
{
	int i;

	for (i = 0; i < __SCHED_FEAT_NR; i++) {
		if (!(sysctl_sched_features & (1UL << i)))
			seq_puts(m, "NO_");
		seq_printf(m, "%s ", sched_feat_names[i]);
	}
	seq_puts(m, "\n");

	return 0;
}

#ifdef HAVE_JUMP_LABEL

#define jump_label_key__true  STATIC_KEY_INIT_TRUE
#define jump_label_key__false STATIC_KEY_INIT_FALSE

#define SCHED_FEAT(name, enabled)	\
	jump_label_key__##enabled ,

struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
#include "features.h"
};

#undef SCHED_FEAT

static void sched_feat_disable(int i)
{
	if (static_key_enabled(&sched_feat_keys[i]))
		static_key_slow_dec(&sched_feat_keys[i]);
}

static void sched_feat_enable(int i)
{
	if (!static_key_enabled(&sched_feat_keys[i]))
		static_key_slow_inc(&sched_feat_keys[i]);
}
#else
static void sched_feat_disable(int i) { };
static void sched_feat_enable(int i) { };
#endif /* HAVE_JUMP_LABEL */

static int sched_feat_set(char *cmp)
{
	int i;
	int neg = 0;

	if (strncmp(cmp, "NO_", 3) == 0) {
		neg = 1;
		cmp += 3;
	}

	for (i = 0; i < __SCHED_FEAT_NR; i++) {
		if (strcmp(cmp, sched_feat_names[i]) == 0) {
			if (neg) {
				sysctl_sched_features &= ~(1UL << i);
				sched_feat_disable(i);
			} else {
				sysctl_sched_features |= (1UL << i);
				sched_feat_enable(i);
			}
			break;
		}
	}

	return i;
}

static ssize_t
sched_feat_write(struct file *filp, const char __user *ubuf,
		size_t cnt, loff_t *ppos)
{
	char buf[64];
	char *cmp;
	int i;

	if (cnt > 63)
		cnt = 63;

	if (copy_from_user(&buf, ubuf, cnt))
		return -EFAULT;

	buf[cnt] = 0;
	cmp = strstrip(buf);

	i = sched_feat_set(cmp);
	if (i == __SCHED_FEAT_NR)
		return -EINVAL;

	*ppos += cnt;

	return cnt;
}

static int sched_feat_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, sched_feat_show, NULL);
}

static const struct file_operations sched_feat_fops = {
	.open		= sched_feat_open,
	.write		= sched_feat_write,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static __init int sched_init_debug(void)
{
	debugfs_create_file("sched_features", 0644, NULL, NULL,
			&sched_feat_fops);

	return 0;
}
late_initcall(sched_init_debug);
#endif /* CONFIG_SCHED_DEBUG */

272 /* 272 /*
273 * Number of tasks to iterate in a single balance run. 273 * Number of tasks to iterate in a single balance run.
274 * Limited because this is done with IRQs disabled. 274 * Limited because this is done with IRQs disabled.
275 */ 275 */
276 const_debug unsigned int sysctl_sched_nr_migrate = 32; 276 const_debug unsigned int sysctl_sched_nr_migrate = 32;
277 277
278 /* 278 /*
279 * period over which we average the RT time consumption, measured 279 * period over which we average the RT time consumption, measured
280 * in ms. 280 * in ms.
281 * 281 *
282 * default: 1s 282 * default: 1s
283 */ 283 */
284 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; 284 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
285 285
286 /* 286 /*
287 * period over which we measure -rt task cpu usage in us. 287 * period over which we measure -rt task cpu usage in us.
288 * default: 1s 288 * default: 1s
289 */ 289 */
290 unsigned int sysctl_sched_rt_period = 1000000; 290 unsigned int sysctl_sched_rt_period = 1000000;
291 291
292 __read_mostly int scheduler_running; 292 __read_mostly int scheduler_running;
293 293
294 /* 294 /*
295 * part of the period that we allow rt tasks to run in us. 295 * part of the period that we allow rt tasks to run in us.
296 * default: 0.95s 296 * default: 0.95s
297 */ 297 */
298 int sysctl_sched_rt_runtime = 950000; 298 int sysctl_sched_rt_runtime = 950000;
299 299
300 /* 300 /*
301 * __task_rq_lock - lock the rq @p resides on. 301 * __task_rq_lock - lock the rq @p resides on.
302 */ 302 */
303 static inline struct rq *__task_rq_lock(struct task_struct *p) 303 static inline struct rq *__task_rq_lock(struct task_struct *p)
304 __acquires(rq->lock) 304 __acquires(rq->lock)
305 { 305 {
306 struct rq *rq; 306 struct rq *rq;
307 307
308 lockdep_assert_held(&p->pi_lock); 308 lockdep_assert_held(&p->pi_lock);
309 309
310 for (;;) { 310 for (;;) {
311 rq = task_rq(p); 311 rq = task_rq(p);
312 raw_spin_lock(&rq->lock); 312 raw_spin_lock(&rq->lock);
313 if (likely(rq == task_rq(p))) 313 if (likely(rq == task_rq(p)))
314 return rq; 314 return rq;
315 raw_spin_unlock(&rq->lock); 315 raw_spin_unlock(&rq->lock);
316 } 316 }
317 } 317 }
318 318
319 /* 319 /*
320 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. 320 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
321 */ 321 */
322 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 322 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
323 __acquires(p->pi_lock) 323 __acquires(p->pi_lock)
324 __acquires(rq->lock) 324 __acquires(rq->lock)
325 { 325 {
326 struct rq *rq; 326 struct rq *rq;
327 327
328 for (;;) { 328 for (;;) {
329 raw_spin_lock_irqsave(&p->pi_lock, *flags); 329 raw_spin_lock_irqsave(&p->pi_lock, *flags);
330 rq = task_rq(p); 330 rq = task_rq(p);
331 raw_spin_lock(&rq->lock); 331 raw_spin_lock(&rq->lock);
332 if (likely(rq == task_rq(p))) 332 if (likely(rq == task_rq(p)))
333 return rq; 333 return rq;
334 raw_spin_unlock(&rq->lock); 334 raw_spin_unlock(&rq->lock);
335 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 335 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
336 } 336 }
337 } 337 }
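The two lock helpers above share a lock-and-revalidate idiom: the runqueue pointer read without the lock may change before the lock is acquired, so it is re-read under the lock and the acquisition retried if the task moved. A minimal sketch of the same idiom with hypothetical names (not kernel API):

/* Hypothetical sketch of the lock-and-revalidate retry idiom. */
struct home { spinlock_t lock; };
struct item { struct home *home; };

static struct home *lock_item_home(struct item *it)
{
	struct home *h;

	for (;;) {
		h = it->home;			/* unlocked snapshot */
		spin_lock(&h->lock);
		if (likely(h == it->home))
			return h;		/* still the right lock */
		spin_unlock(&h->lock);		/* it moved: retry */
	}
}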
338 338
339 static void __task_rq_unlock(struct rq *rq) 339 static void __task_rq_unlock(struct rq *rq)
340 __releases(rq->lock) 340 __releases(rq->lock)
341 { 341 {
342 raw_spin_unlock(&rq->lock); 342 raw_spin_unlock(&rq->lock);
343 } 343 }
344 344
345 static inline void 345 static inline void
346 task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) 346 task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
347 __releases(rq->lock) 347 __releases(rq->lock)
348 __releases(p->pi_lock) 348 __releases(p->pi_lock)
349 { 349 {
350 raw_spin_unlock(&rq->lock); 350 raw_spin_unlock(&rq->lock);
351 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 351 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
352 } 352 }
353 353
354 /* 354 /*
355 * this_rq_lock - lock this runqueue and disable interrupts. 355 * this_rq_lock - lock this runqueue and disable interrupts.
356 */ 356 */
357 static struct rq *this_rq_lock(void) 357 static struct rq *this_rq_lock(void)
358 __acquires(rq->lock) 358 __acquires(rq->lock)
359 { 359 {
360 struct rq *rq; 360 struct rq *rq;
361 361
362 local_irq_disable(); 362 local_irq_disable();
363 rq = this_rq(); 363 rq = this_rq();
364 raw_spin_lock(&rq->lock); 364 raw_spin_lock(&rq->lock);
365 365
366 return rq; 366 return rq;
367 } 367 }
368 368
369 #ifdef CONFIG_SCHED_HRTICK 369 #ifdef CONFIG_SCHED_HRTICK
370 /* 370 /*
371 * Use HR-timers to deliver accurate preemption points. 371 * Use HR-timers to deliver accurate preemption points.
372 */ 372 */
373 373
374 static void hrtick_clear(struct rq *rq) 374 static void hrtick_clear(struct rq *rq)
375 { 375 {
376 if (hrtimer_active(&rq->hrtick_timer)) 376 if (hrtimer_active(&rq->hrtick_timer))
377 hrtimer_cancel(&rq->hrtick_timer); 377 hrtimer_cancel(&rq->hrtick_timer);
378 } 378 }
379 379
380 /* 380 /*
381 * High-resolution timer tick. 381 * High-resolution timer tick.
382 * Runs from hardirq context with interrupts disabled. 382 * Runs from hardirq context with interrupts disabled.
383 */ 383 */
384 static enum hrtimer_restart hrtick(struct hrtimer *timer) 384 static enum hrtimer_restart hrtick(struct hrtimer *timer)
385 { 385 {
386 struct rq *rq = container_of(timer, struct rq, hrtick_timer); 386 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
387 387
388 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 388 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
389 389
390 raw_spin_lock(&rq->lock); 390 raw_spin_lock(&rq->lock);
391 update_rq_clock(rq); 391 update_rq_clock(rq);
392 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 392 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
393 raw_spin_unlock(&rq->lock); 393 raw_spin_unlock(&rq->lock);
394 394
395 return HRTIMER_NORESTART; 395 return HRTIMER_NORESTART;
396 } 396 }
397 397
398 #ifdef CONFIG_SMP 398 #ifdef CONFIG_SMP
399 399
400 static int __hrtick_restart(struct rq *rq) 400 static int __hrtick_restart(struct rq *rq)
401 { 401 {
402 struct hrtimer *timer = &rq->hrtick_timer; 402 struct hrtimer *timer = &rq->hrtick_timer;
403 ktime_t time = hrtimer_get_softexpires(timer); 403 ktime_t time = hrtimer_get_softexpires(timer);
404 404
405 return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0); 405 return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
406 } 406 }
407 407
408 /* 408 /*
409 * called from hardirq (IPI) context 409 * called from hardirq (IPI) context
410 */ 410 */
411 static void __hrtick_start(void *arg) 411 static void __hrtick_start(void *arg)
412 { 412 {
413 struct rq *rq = arg; 413 struct rq *rq = arg;
414 414
415 raw_spin_lock(&rq->lock); 415 raw_spin_lock(&rq->lock);
416 __hrtick_restart(rq); 416 __hrtick_restart(rq);
417 rq->hrtick_csd_pending = 0; 417 rq->hrtick_csd_pending = 0;
418 raw_spin_unlock(&rq->lock); 418 raw_spin_unlock(&rq->lock);
419 } 419 }
420 420
421 /* 421 /*
422 * Called to set the hrtick timer state. 422 * Called to set the hrtick timer state.
423 * 423 *
424 * called with rq->lock held and irqs disabled 424 * called with rq->lock held and irqs disabled
425 */ 425 */
426 void hrtick_start(struct rq *rq, u64 delay) 426 void hrtick_start(struct rq *rq, u64 delay)
427 { 427 {
428 struct hrtimer *timer = &rq->hrtick_timer; 428 struct hrtimer *timer = &rq->hrtick_timer;
429 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 429 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
430 430
431 hrtimer_set_expires(timer, time); 431 hrtimer_set_expires(timer, time);
432 432
433 if (rq == this_rq()) { 433 if (rq == this_rq()) {
434 __hrtick_restart(rq); 434 __hrtick_restart(rq);
435 } else if (!rq->hrtick_csd_pending) { 435 } else if (!rq->hrtick_csd_pending) {
436 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); 436 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
437 rq->hrtick_csd_pending = 1; 437 rq->hrtick_csd_pending = 1;
438 } 438 }
439 } 439 }
440 440
441 static int 441 static int
442 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) 442 hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
443 { 443 {
444 int cpu = (int)(long)hcpu; 444 int cpu = (int)(long)hcpu;
445 445
446 switch (action) { 446 switch (action) {
447 case CPU_UP_CANCELED: 447 case CPU_UP_CANCELED:
448 case CPU_UP_CANCELED_FROZEN: 448 case CPU_UP_CANCELED_FROZEN:
449 case CPU_DOWN_PREPARE: 449 case CPU_DOWN_PREPARE:
450 case CPU_DOWN_PREPARE_FROZEN: 450 case CPU_DOWN_PREPARE_FROZEN:
451 case CPU_DEAD: 451 case CPU_DEAD:
452 case CPU_DEAD_FROZEN: 452 case CPU_DEAD_FROZEN:
453 hrtick_clear(cpu_rq(cpu)); 453 hrtick_clear(cpu_rq(cpu));
454 return NOTIFY_OK; 454 return NOTIFY_OK;
455 } 455 }
456 456
457 return NOTIFY_DONE; 457 return NOTIFY_DONE;
458 } 458 }
459 459
460 static __init void init_hrtick(void) 460 static __init void init_hrtick(void)
461 { 461 {
462 hotcpu_notifier(hotplug_hrtick, 0); 462 hotcpu_notifier(hotplug_hrtick, 0);
463 } 463 }
464 #else 464 #else
465 /* 465 /*
466 * Called to set the hrtick timer state. 466 * Called to set the hrtick timer state.
467 * 467 *
468 * called with rq->lock held and irqs disabled 468 * called with rq->lock held and irqs disabled
469 */ 469 */
470 void hrtick_start(struct rq *rq, u64 delay) 470 void hrtick_start(struct rq *rq, u64 delay)
471 { 471 {
472 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 472 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
473 HRTIMER_MODE_REL_PINNED, 0); 473 HRTIMER_MODE_REL_PINNED, 0);
474 } 474 }
475 475
476 static inline void init_hrtick(void) 476 static inline void init_hrtick(void)
477 { 477 {
478 } 478 }
479 #endif /* CONFIG_SMP */ 479 #endif /* CONFIG_SMP */
480 480
481 static void init_rq_hrtick(struct rq *rq) 481 static void init_rq_hrtick(struct rq *rq)
482 { 482 {
483 #ifdef CONFIG_SMP 483 #ifdef CONFIG_SMP
484 rq->hrtick_csd_pending = 0; 484 rq->hrtick_csd_pending = 0;
485 485
486 rq->hrtick_csd.flags = 0; 486 rq->hrtick_csd.flags = 0;
487 rq->hrtick_csd.func = __hrtick_start; 487 rq->hrtick_csd.func = __hrtick_start;
488 rq->hrtick_csd.info = rq; 488 rq->hrtick_csd.info = rq;
489 #endif 489 #endif
490 490
491 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 491 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
492 rq->hrtick_timer.function = hrtick; 492 rq->hrtick_timer.function = hrtick;
493 } 493 }
494 #else /* CONFIG_SCHED_HRTICK */ 494 #else /* CONFIG_SCHED_HRTICK */
495 static inline void hrtick_clear(struct rq *rq) 495 static inline void hrtick_clear(struct rq *rq)
496 { 496 {
497 } 497 }
498 498
499 static inline void init_rq_hrtick(struct rq *rq) 499 static inline void init_rq_hrtick(struct rq *rq)
500 { 500 {
501 } 501 }
502 502
503 static inline void init_hrtick(void) 503 static inline void init_hrtick(void)
504 { 504 {
505 } 505 }
506 #endif /* CONFIG_SCHED_HRTICK */ 506 #endif /* CONFIG_SCHED_HRTICK */
507 507
508 /* 508 /*
509 * resched_task - mark a task 'to be rescheduled now'. 509 * resched_task - mark a task 'to be rescheduled now'.
510 * 510 *
511 * On UP this means the setting of the need_resched flag, on SMP it 511 * On UP this means the setting of the need_resched flag, on SMP it
512 * might also involve a cross-CPU call to trigger the scheduler on 512 * might also involve a cross-CPU call to trigger the scheduler on
513 * the target CPU. 513 * the target CPU.
514 */ 514 */
515 void resched_task(struct task_struct *p) 515 void resched_task(struct task_struct *p)
516 { 516 {
517 int cpu; 517 int cpu;
518 518
519 lockdep_assert_held(&task_rq(p)->lock); 519 lockdep_assert_held(&task_rq(p)->lock);
520 520
521 if (test_tsk_need_resched(p)) 521 if (test_tsk_need_resched(p))
522 return; 522 return;
523 523
524 set_tsk_need_resched(p); 524 set_tsk_need_resched(p);
525 525
526 cpu = task_cpu(p); 526 cpu = task_cpu(p);
527 if (cpu == smp_processor_id()) { 527 if (cpu == smp_processor_id()) {
528 set_preempt_need_resched(); 528 set_preempt_need_resched();
529 return; 529 return;
530 } 530 }
531 531
532 /* NEED_RESCHED must be visible before we test polling */ 532 /* NEED_RESCHED must be visible before we test polling */
533 smp_mb(); 533 smp_mb();
534 if (!tsk_is_polling(p)) 534 if (!tsk_is_polling(p))
535 smp_send_reschedule(cpu); 535 smp_send_reschedule(cpu);
536 } 536 }
537 537
538 void resched_cpu(int cpu) 538 void resched_cpu(int cpu)
539 { 539 {
540 struct rq *rq = cpu_rq(cpu); 540 struct rq *rq = cpu_rq(cpu);
541 unsigned long flags; 541 unsigned long flags;
542 542
543 if (!raw_spin_trylock_irqsave(&rq->lock, flags)) 543 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
544 return; 544 return;
545 resched_task(cpu_curr(cpu)); 545 resched_task(cpu_curr(cpu));
546 raw_spin_unlock_irqrestore(&rq->lock, flags); 546 raw_spin_unlock_irqrestore(&rq->lock, flags);
547 } 547 }
548 548
549 #ifdef CONFIG_SMP 549 #ifdef CONFIG_SMP
550 #ifdef CONFIG_NO_HZ_COMMON 550 #ifdef CONFIG_NO_HZ_COMMON
551 /* 551 /*
552 * In the semi idle case, use the nearest busy cpu for migrating timers 552 * In the semi idle case, use the nearest busy cpu for migrating timers
553 * from an idle cpu. This is good for power-savings. 553 * from an idle cpu. This is good for power-savings.
554 * 554 *
555 * We don't do a similar optimization for a completely idle system, as 555 * We don't do a similar optimization for a completely idle system, as
556 * selecting an idle cpu will add more delays to the timers than intended 556 * selecting an idle cpu will add more delays to the timers than intended
557 * (as that cpu's timer base may not be up to date wrt jiffies etc). 557 * (as that cpu's timer base may not be up to date wrt jiffies etc).
558 */ 558 */
559 int get_nohz_timer_target(int pinned) 559 int get_nohz_timer_target(int pinned)
560 { 560 {
561 int cpu = smp_processor_id(); 561 int cpu = smp_processor_id();
562 int i; 562 int i;
563 struct sched_domain *sd; 563 struct sched_domain *sd;
564 564
565 if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu)) 565 if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
566 return cpu; 566 return cpu;
567 567
568 rcu_read_lock(); 568 rcu_read_lock();
569 for_each_domain(cpu, sd) { 569 for_each_domain(cpu, sd) {
570 for_each_cpu(i, sched_domain_span(sd)) { 570 for_each_cpu(i, sched_domain_span(sd)) {
571 if (!idle_cpu(i)) { 571 if (!idle_cpu(i)) {
572 cpu = i; 572 cpu = i;
573 goto unlock; 573 goto unlock;
574 } 574 }
575 } 575 }
576 } 576 }
577 unlock: 577 unlock:
578 rcu_read_unlock(); 578 rcu_read_unlock();
579 return cpu; 579 return cpu;
580 } 580 }
581 /* 581 /*
582 * When add_timer_on() enqueues a timer into the timer wheel of an 582 * When add_timer_on() enqueues a timer into the timer wheel of an
583 * idle CPU then this timer might expire before the next timer event 583 * idle CPU then this timer might expire before the next timer event
584 * which is scheduled to wake up that CPU. In case of a completely 584 * which is scheduled to wake up that CPU. In case of a completely
585 * idle system the next event might even be infinite time into the 585 * idle system the next event might even be infinite time into the
586 * future. wake_up_idle_cpu() ensures that the CPU is woken up and 586 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
587 * leaves the inner idle loop so the newly added timer is taken into 587 * leaves the inner idle loop so the newly added timer is taken into
588 * account when the CPU goes back to idle and evaluates the timer 588 * account when the CPU goes back to idle and evaluates the timer
589 * wheel for the next timer event. 589 * wheel for the next timer event.
590 */ 590 */
591 static void wake_up_idle_cpu(int cpu) 591 static void wake_up_idle_cpu(int cpu)
592 { 592 {
593 struct rq *rq = cpu_rq(cpu); 593 struct rq *rq = cpu_rq(cpu);
594 594
595 if (cpu == smp_processor_id()) 595 if (cpu == smp_processor_id())
596 return; 596 return;
597 597
598 /* 598 /*
599 * This is safe, as this function is called with the timer 599 * This is safe, as this function is called with the timer
600 * wheel base lock of (cpu) held. When the CPU is on the way 600 * wheel base lock of (cpu) held. When the CPU is on the way
601 * to idle and has not yet set rq->curr to idle then it will 601 * to idle and has not yet set rq->curr to idle then it will
602 * be serialized on the timer wheel base lock and take the new 602 * be serialized on the timer wheel base lock and take the new
603 * timer into account automatically. 603 * timer into account automatically.
604 */ 604 */
605 if (rq->curr != rq->idle) 605 if (rq->curr != rq->idle)
606 return; 606 return;
607 607
608 /* 608 /*
609 * We can set TIF_RESCHED on the idle task of the other CPU 609 * We can set TIF_RESCHED on the idle task of the other CPU
610 * lockless. The worst case is that the other CPU runs the 610 * lockless. The worst case is that the other CPU runs the
611 * idle task through an additional NOOP schedule() 611 * idle task through an additional NOOP schedule()
612 */ 612 */
613 set_tsk_need_resched(rq->idle); 613 set_tsk_need_resched(rq->idle);
614 614
615 /* NEED_RESCHED must be visible before we test polling */ 615 /* NEED_RESCHED must be visible before we test polling */
616 smp_mb(); 616 smp_mb();
617 if (!tsk_is_polling(rq->idle)) 617 if (!tsk_is_polling(rq->idle))
618 smp_send_reschedule(cpu); 618 smp_send_reschedule(cpu);
619 } 619 }
620 620
621 static bool wake_up_full_nohz_cpu(int cpu) 621 static bool wake_up_full_nohz_cpu(int cpu)
622 { 622 {
623 if (tick_nohz_full_cpu(cpu)) { 623 if (tick_nohz_full_cpu(cpu)) {
624 if (cpu != smp_processor_id() || 624 if (cpu != smp_processor_id() ||
625 tick_nohz_tick_stopped()) 625 tick_nohz_tick_stopped())
626 smp_send_reschedule(cpu); 626 smp_send_reschedule(cpu);
627 return true; 627 return true;
628 } 628 }
629 629
630 return false; 630 return false;
631 } 631 }
632 632
633 void wake_up_nohz_cpu(int cpu) 633 void wake_up_nohz_cpu(int cpu)
634 { 634 {
635 if (!wake_up_full_nohz_cpu(cpu)) 635 if (!wake_up_full_nohz_cpu(cpu))
636 wake_up_idle_cpu(cpu); 636 wake_up_idle_cpu(cpu);
637 } 637 }
638 638
639 static inline bool got_nohz_idle_kick(void) 639 static inline bool got_nohz_idle_kick(void)
640 { 640 {
641 int cpu = smp_processor_id(); 641 int cpu = smp_processor_id();
642 642
643 if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) 643 if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
644 return false; 644 return false;
645 645
646 if (idle_cpu(cpu) && !need_resched()) 646 if (idle_cpu(cpu) && !need_resched())
647 return true; 647 return true;
648 648
649 /* 649 /*
650 * We can't run Idle Load Balance on this CPU at this time, so we 650 * We can't run Idle Load Balance on this CPU at this time, so we
651 * cancel it and clear NOHZ_BALANCE_KICK 651 * cancel it and clear NOHZ_BALANCE_KICK
652 */ 652 */
653 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); 653 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
654 return false; 654 return false;
655 } 655 }
656 656
657 #else /* CONFIG_NO_HZ_COMMON */ 657 #else /* CONFIG_NO_HZ_COMMON */
658 658
659 static inline bool got_nohz_idle_kick(void) 659 static inline bool got_nohz_idle_kick(void)
660 { 660 {
661 return false; 661 return false;
662 } 662 }
663 663
664 #endif /* CONFIG_NO_HZ_COMMON */ 664 #endif /* CONFIG_NO_HZ_COMMON */
665 665
666 #ifdef CONFIG_NO_HZ_FULL 666 #ifdef CONFIG_NO_HZ_FULL
667 bool sched_can_stop_tick(void) 667 bool sched_can_stop_tick(void)
668 { 668 {
669 struct rq *rq; 669 struct rq *rq;
670 670
671 rq = this_rq(); 671 rq = this_rq();
672 672
673 /* Make sure rq->nr_running update is visible after the IPI */ 673 /* Make sure rq->nr_running update is visible after the IPI */
674 smp_rmb(); 674 smp_rmb();
675 675
676 /* More than one running task needs preemption */ 676 /* More than one running task needs preemption */
677 if (rq->nr_running > 1) 677 if (rq->nr_running > 1)
678 return false; 678 return false;
679 679
680 return true; 680 return true;
681 } 681 }
682 #endif /* CONFIG_NO_HZ_FULL */ 682 #endif /* CONFIG_NO_HZ_FULL */
683 683
684 void sched_avg_update(struct rq *rq) 684 void sched_avg_update(struct rq *rq)
685 { 685 {
686 s64 period = sched_avg_period(); 686 s64 period = sched_avg_period();
687 687
688 while ((s64)(rq_clock(rq) - rq->age_stamp) > period) { 688 while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
689 /* 689 /*
690 * Inline assembly required to prevent the compiler 690 * Inline assembly required to prevent the compiler
691 * optimising this loop into a divmod call. 691 * optimising this loop into a divmod call.
692 * See __iter_div_u64_rem() for another example of this. 692 * See __iter_div_u64_rem() for another example of this.
693 */ 693 */
694 asm("" : "+rm" (rq->age_stamp)); 694 asm("" : "+rm" (rq->age_stamp));
695 rq->age_stamp += period; 695 rq->age_stamp += period;
696 rq->rt_avg /= 2; 696 rq->rt_avg /= 2;
697 } 697 }
698 } 698 }
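The loop above halves rq->rt_avg once per elapsed sched_avg_period(), so RT time observed N periods ago contributes with weight 2^-N (a geometric decay). A small numeric sketch of that halving, for illustration only:

/* Illustration: repeated halving, as applied to rq->rt_avg above. */
static u64 halve_per_period(u64 value, unsigned int periods_elapsed)
{
	while (periods_elapsed--)
		value /= 2;
	return value;
}
/* halve_per_period(800, 3) == 100: three periods shrink 800 to 100. */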
699 699
700 #endif /* CONFIG_SMP */ 700 #endif /* CONFIG_SMP */
701 701
702 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 702 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
703 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) 703 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
704 /* 704 /*
705 * Iterate task_group tree rooted at *from, calling @down when first entering a 705 * Iterate task_group tree rooted at *from, calling @down when first entering a
706 * node and @up when leaving it for the final time. 706 * node and @up when leaving it for the final time.
707 * 707 *
708 * Caller must hold rcu_lock or sufficient equivalent. 708 * Caller must hold rcu_lock or sufficient equivalent.
709 */ 709 */
710 int walk_tg_tree_from(struct task_group *from, 710 int walk_tg_tree_from(struct task_group *from,
711 tg_visitor down, tg_visitor up, void *data) 711 tg_visitor down, tg_visitor up, void *data)
712 { 712 {
713 struct task_group *parent, *child; 713 struct task_group *parent, *child;
714 int ret; 714 int ret;
715 715
716 parent = from; 716 parent = from;
717 717
718 down: 718 down:
719 ret = (*down)(parent, data); 719 ret = (*down)(parent, data);
720 if (ret) 720 if (ret)
721 goto out; 721 goto out;
722 list_for_each_entry_rcu(child, &parent->children, siblings) { 722 list_for_each_entry_rcu(child, &parent->children, siblings) {
723 parent = child; 723 parent = child;
724 goto down; 724 goto down;
725 725
726 up: 726 up:
727 continue; 727 continue;
728 } 728 }
729 ret = (*up)(parent, data); 729 ret = (*up)(parent, data);
730 if (ret || parent == from) 730 if (ret || parent == from)
731 goto out; 731 goto out;
732 732
733 child = parent; 733 child = parent;
734 parent = parent->parent; 734 parent = parent->parent;
735 if (parent) 735 if (parent)
736 goto up; 736 goto up;
737 out: 737 out:
738 return ret; 738 return ret;
739 } 739 }
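walk_tg_tree_from() is an iterative depth-first walk written with gotos so arbitrarily deep task_group trees do not consume kernel stack. A recursive sketch of the same contract (call @down on entry, @up on final exit, stop early on a non-zero return), shown only for clarity:

/* Recursive sketch of the walk_tg_tree_from() contract (illustration only). */
static int walk_tg_tree_sketch(struct task_group *tg,
			       tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *child;
	int ret;

	ret = (*down)(tg, data);		/* first time we enter @tg */
	if (ret)
		return ret;

	list_for_each_entry_rcu(child, &tg->children, siblings) {
		ret = walk_tg_tree_sketch(child, down, up, data);
		if (ret)
			return ret;
	}

	return (*up)(tg, data);			/* leaving @tg for the last time */
}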
740 740
741 int tg_nop(struct task_group *tg, void *data) 741 int tg_nop(struct task_group *tg, void *data)
742 { 742 {
743 return 0; 743 return 0;
744 } 744 }
745 #endif 745 #endif
746 746
747 static void set_load_weight(struct task_struct *p) 747 static void set_load_weight(struct task_struct *p)
748 { 748 {
749 int prio = p->static_prio - MAX_RT_PRIO; 749 int prio = p->static_prio - MAX_RT_PRIO;
750 struct load_weight *load = &p->se.load; 750 struct load_weight *load = &p->se.load;
751 751
752 /* 752 /*
753 * SCHED_IDLE tasks get minimal weight: 753 * SCHED_IDLE tasks get minimal weight:
754 */ 754 */
755 if (p->policy == SCHED_IDLE) { 755 if (p->policy == SCHED_IDLE) {
756 load->weight = scale_load(WEIGHT_IDLEPRIO); 756 load->weight = scale_load(WEIGHT_IDLEPRIO);
757 load->inv_weight = WMULT_IDLEPRIO; 757 load->inv_weight = WMULT_IDLEPRIO;
758 return; 758 return;
759 } 759 }
760 760
761 load->weight = scale_load(prio_to_weight[prio]); 761 load->weight = scale_load(prio_to_weight[prio]);
762 load->inv_weight = prio_to_wmult[prio]; 762 load->inv_weight = prio_to_wmult[prio];
763 } 763 }
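For scale (values quoted from the prio_to_weight[] table used above; treat them as indicative): a nice-0 task has weight 1024 and each nice level changes the weight by roughly 1.25x, so a nice-0 task sharing a CPU with a nice-5 task (weight 335) gets about 75% of it. A tiny sketch of that share calculation:

/* Sketch: CPU share (in permille) of weight w_a competing against w_b. */
static unsigned int cpu_share_permille(unsigned long w_a, unsigned long w_b)
{
	return (unsigned int)(w_a * 1000 / (w_a + w_b));
}
/* cpu_share_permille(1024, 335) == 753, i.e. ~75% for the nice-0 task. */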
764 764
765 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 765 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
766 { 766 {
767 update_rq_clock(rq); 767 update_rq_clock(rq);
768 sched_info_queued(rq, p); 768 sched_info_queued(rq, p);
769 p->sched_class->enqueue_task(rq, p, flags); 769 p->sched_class->enqueue_task(rq, p, flags);
770 } 770 }
771 771
772 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 772 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
773 { 773 {
774 update_rq_clock(rq); 774 update_rq_clock(rq);
775 sched_info_dequeued(rq, p); 775 sched_info_dequeued(rq, p);
776 p->sched_class->dequeue_task(rq, p, flags); 776 p->sched_class->dequeue_task(rq, p, flags);
777 } 777 }
778 778
779 void activate_task(struct rq *rq, struct task_struct *p, int flags) 779 void activate_task(struct rq *rq, struct task_struct *p, int flags)
780 { 780 {
781 if (task_contributes_to_load(p)) 781 if (task_contributes_to_load(p))
782 rq->nr_uninterruptible--; 782 rq->nr_uninterruptible--;
783 783
784 enqueue_task(rq, p, flags); 784 enqueue_task(rq, p, flags);
785 } 785 }
786 786
787 void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 787 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
788 { 788 {
789 if (task_contributes_to_load(p)) 789 if (task_contributes_to_load(p))
790 rq->nr_uninterruptible++; 790 rq->nr_uninterruptible++;
791 791
792 dequeue_task(rq, p, flags); 792 dequeue_task(rq, p, flags);
793 } 793 }
794 794
795 static void update_rq_clock_task(struct rq *rq, s64 delta) 795 static void update_rq_clock_task(struct rq *rq, s64 delta)
796 { 796 {
797 /* 797 /*
798 * In theory, the compiler should just see 0 here, and optimize out the call 798 * In theory, the compiler should just see 0 here, and optimize out the call
799 * to sched_rt_avg_update. But I don't trust it... 799 * to sched_rt_avg_update. But I don't trust it...
800 */ 800 */
801 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) 801 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
802 s64 steal = 0, irq_delta = 0; 802 s64 steal = 0, irq_delta = 0;
803 #endif 803 #endif
804 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 804 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
805 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; 805 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
806 806
807 /* 807 /*
808 * Since irq_time is only updated on {soft,}irq_exit, we might run into 808 * Since irq_time is only updated on {soft,}irq_exit, we might run into
809 * this case when a previous update_rq_clock() happened inside a 809 * this case when a previous update_rq_clock() happened inside a
810 * {soft,}irq region. 810 * {soft,}irq region.
811 * 811 *
812 * When this happens, we stop ->clock_task and only update the 812 * When this happens, we stop ->clock_task and only update the
813 * prev_irq_time stamp to account for the part that fit, so that a next 813 * prev_irq_time stamp to account for the part that fit, so that a next
814 * update will consume the rest. This ensures ->clock_task is 814 * update will consume the rest. This ensures ->clock_task is
815 * monotonic. 815 * monotonic.
816 * 816 *
817 * It does however cause some slight misattribution of {soft,}irq 817 * It does however cause some slight misattribution of {soft,}irq
818 * time, a more accurate solution would be to update the irq_time using 818 * time, a more accurate solution would be to update the irq_time using
819 * the current rq->clock timestamp, except that would require using 819 * the current rq->clock timestamp, except that would require using
820 * atomic ops. 820 * atomic ops.
821 */ 821 */
822 if (irq_delta > delta) 822 if (irq_delta > delta)
823 irq_delta = delta; 823 irq_delta = delta;
824 824
825 rq->prev_irq_time += irq_delta; 825 rq->prev_irq_time += irq_delta;
826 delta -= irq_delta; 826 delta -= irq_delta;
827 #endif 827 #endif
828 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING 828 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
829 if (static_key_false((&paravirt_steal_rq_enabled))) { 829 if (static_key_false((&paravirt_steal_rq_enabled))) {
830 steal = paravirt_steal_clock(cpu_of(rq)); 830 steal = paravirt_steal_clock(cpu_of(rq));
831 steal -= rq->prev_steal_time_rq; 831 steal -= rq->prev_steal_time_rq;
832 832
833 if (unlikely(steal > delta)) 833 if (unlikely(steal > delta))
834 steal = delta; 834 steal = delta;
835 835
836 rq->prev_steal_time_rq += steal; 836 rq->prev_steal_time_rq += steal;
837 delta -= steal; 837 delta -= steal;
838 } 838 }
839 #endif 839 #endif
840 840
841 rq->clock_task += delta; 841 rq->clock_task += delta;
842 842
843 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) 843 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
844 if ((irq_delta + steal) && sched_feat(NONTASK_POWER)) 844 if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
845 sched_rt_avg_update(rq, irq_delta + steal); 845 sched_rt_avg_update(rq, irq_delta + steal);
846 #endif 846 #endif
847 } 847 }
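A numeric reading of the clamping above, for illustration: if delta is 100us of wall progress but 150us of irq time was accounted since the last update, irq_delta is clamped to 100us, clock_task does not advance this round, and the 50us remainder is consumed by the next update because prev_irq_time only advanced by the clamped amount. A minimal sketch of that step:

/* Sketch of the clamp: accounted irq/steal time never exceeds the wall
 * delta, keeping clock_task monotonic; the excess carries over naturally. */
static s64 clamp_nontask_time(s64 delta, s64 nontask)
{
	if (nontask > delta)
		nontask = delta;	/* e.g. delta=100us, nontask=150us -> 100us */
	return delta - nontask;		/* amount clock_task advances by */
}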
848 848
849 void sched_set_stop_task(int cpu, struct task_struct *stop) 849 void sched_set_stop_task(int cpu, struct task_struct *stop)
850 { 850 {
851 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 851 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
852 struct task_struct *old_stop = cpu_rq(cpu)->stop; 852 struct task_struct *old_stop = cpu_rq(cpu)->stop;
853 853
854 if (stop) { 854 if (stop) {
855 /* 855 /*
856 * Make it appear like a SCHED_FIFO task, it's something 856 * Make it appear like a SCHED_FIFO task, it's something
857 * userspace knows about and won't get confused by. 857 * userspace knows about and won't get confused by.
858 * 858 *
859 * Also, it will make PI more or less work without too 859 * Also, it will make PI more or less work without too
860 * much confusion -- but then, stop work should not 860 * much confusion -- but then, stop work should not
861 * rely on PI working anyway. 861 * rely on PI working anyway.
862 */ 862 */
863 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param); 863 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
864 864
865 stop->sched_class = &stop_sched_class; 865 stop->sched_class = &stop_sched_class;
866 } 866 }
867 867
868 cpu_rq(cpu)->stop = stop; 868 cpu_rq(cpu)->stop = stop;
869 869
870 if (old_stop) { 870 if (old_stop) {
871 /* 871 /*
872 * Reset it back to a normal scheduling class so that 872 * Reset it back to a normal scheduling class so that
873 * it can die in pieces. 873 * it can die in pieces.
874 */ 874 */
875 old_stop->sched_class = &rt_sched_class; 875 old_stop->sched_class = &rt_sched_class;
876 } 876 }
877 } 877 }
878 878
879 /* 879 /*
880 * __normal_prio - return the priority that is based on the static prio 880 * __normal_prio - return the priority that is based on the static prio
881 */ 881 */
882 static inline int __normal_prio(struct task_struct *p) 882 static inline int __normal_prio(struct task_struct *p)
883 { 883 {
884 return p->static_prio; 884 return p->static_prio;
885 } 885 }
886 886
887 /* 887 /*
888 * Calculate the expected normal priority: i.e. priority 888 * Calculate the expected normal priority: i.e. priority
889 * without taking RT-inheritance into account. Might be 889 * without taking RT-inheritance into account. Might be
890 * boosted by interactivity modifiers. Changes upon fork, 890 * boosted by interactivity modifiers. Changes upon fork,
891 * setprio syscalls, and whenever the interactivity 891 * setprio syscalls, and whenever the interactivity
892 * estimator recalculates. 892 * estimator recalculates.
893 */ 893 */
894 static inline int normal_prio(struct task_struct *p) 894 static inline int normal_prio(struct task_struct *p)
895 { 895 {
896 int prio; 896 int prio;
897 897
898 if (task_has_dl_policy(p)) 898 if (task_has_dl_policy(p))
899 prio = MAX_DL_PRIO-1; 899 prio = MAX_DL_PRIO-1;
900 else if (task_has_rt_policy(p)) 900 else if (task_has_rt_policy(p))
901 prio = MAX_RT_PRIO-1 - p->rt_priority; 901 prio = MAX_RT_PRIO-1 - p->rt_priority;
902 else 902 else
903 prio = __normal_prio(p); 903 prio = __normal_prio(p);
904 return prio; 904 return prio;
905 } 905 }
906 906
907 /* 907 /*
908 * Calculate the current priority, i.e. the priority 908 * Calculate the current priority, i.e. the priority
909 * taken into account by the scheduler. This value might 909 * taken into account by the scheduler. This value might
910 * be boosted by RT tasks, or might be boosted by 910 * be boosted by RT tasks, or might be boosted by
911 * interactivity modifiers. Will be RT if the task got 911 * interactivity modifiers. Will be RT if the task got
912 * RT-boosted. If not then it returns p->normal_prio. 912 * RT-boosted. If not then it returns p->normal_prio.
913 */ 913 */
914 static int effective_prio(struct task_struct *p) 914 static int effective_prio(struct task_struct *p)
915 { 915 {
916 p->normal_prio = normal_prio(p); 916 p->normal_prio = normal_prio(p);
917 /* 917 /*
918 * If we are RT tasks or we were boosted to RT priority, 918 * If we are RT tasks or we were boosted to RT priority,
919 * keep the priority unchanged. Otherwise, update priority 919 * keep the priority unchanged. Otherwise, update priority
920 * to the normal priority: 920 * to the normal priority:
921 */ 921 */
922 if (!rt_prio(p->prio)) 922 if (!rt_prio(p->prio))
923 return p->normal_prio; 923 return p->normal_prio;
924 return p->prio; 924 return p->prio;
925 } 925 }
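A worked example of the mapping above, under the conventional constants (assumed here; verify against the headers): MAX_DL_PRIO == 0, MAX_RT_PRIO == 100, and a nice-0 static_prio of 120. Lower numbers always win:

/* Illustration only: the three normal_prio() cases with explicit numbers. */
static int normal_prio_examples(void)
{
	int dl_prio   = 0 - 1;		/* SCHED_DEADLINE: MAX_DL_PRIO - 1     = -1  */
	int fifo_prio = 100 - 1 - 10;	/* SCHED_FIFO, rt_priority 10          = 89  */
	int cfs_prio  = 120;		/* SCHED_NORMAL, nice 0: static_prio   = 120 */

	return dl_prio < fifo_prio && fifo_prio < cfs_prio;	/* always 1 */
}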
926 926
927 /** 927 /**
928 * task_curr - is this task currently executing on a CPU? 928 * task_curr - is this task currently executing on a CPU?
929 * @p: the task in question. 929 * @p: the task in question.
930 * 930 *
931 * Return: 1 if the task is currently executing. 0 otherwise. 931 * Return: 1 if the task is currently executing. 0 otherwise.
932 */ 932 */
933 inline int task_curr(const struct task_struct *p) 933 inline int task_curr(const struct task_struct *p)
934 { 934 {
935 return cpu_curr(task_cpu(p)) == p; 935 return cpu_curr(task_cpu(p)) == p;
936 } 936 }
937 937
938 static inline void check_class_changed(struct rq *rq, struct task_struct *p, 938 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
939 const struct sched_class *prev_class, 939 const struct sched_class *prev_class,
940 int oldprio) 940 int oldprio)
941 { 941 {
942 if (prev_class != p->sched_class) { 942 if (prev_class != p->sched_class) {
943 if (prev_class->switched_from) 943 if (prev_class->switched_from)
944 prev_class->switched_from(rq, p); 944 prev_class->switched_from(rq, p);
945 p->sched_class->switched_to(rq, p); 945 p->sched_class->switched_to(rq, p);
946 } else if (oldprio != p->prio || dl_task(p)) 946 } else if (oldprio != p->prio || dl_task(p))
947 p->sched_class->prio_changed(rq, p, oldprio); 947 p->sched_class->prio_changed(rq, p, oldprio);
948 } 948 }
949 949
950 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 950 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
951 { 951 {
952 const struct sched_class *class; 952 const struct sched_class *class;
953 953
954 if (p->sched_class == rq->curr->sched_class) { 954 if (p->sched_class == rq->curr->sched_class) {
955 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 955 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
956 } else { 956 } else {
957 for_each_class(class) { 957 for_each_class(class) {
958 if (class == rq->curr->sched_class) 958 if (class == rq->curr->sched_class)
959 break; 959 break;
960 if (class == p->sched_class) { 960 if (class == p->sched_class) {
961 resched_task(rq->curr); 961 resched_task(rq->curr);
962 break; 962 break;
963 } 963 }
964 } 964 }
965 } 965 }
966 966
967 /* 967 /*
968 * A queue event has occurred, and we're going to schedule. In 968 * A queue event has occurred, and we're going to schedule. In
969 * this case, we can save a useless back-to-back clock update. 969 * this case, we can save a useless back-to-back clock update.
970 */ 970 */
971 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) 971 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
972 rq->skip_clock_update = 1; 972 rq->skip_clock_update = 1;
973 } 973 }
974 974
975 #ifdef CONFIG_SMP 975 #ifdef CONFIG_SMP
976 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 976 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
977 { 977 {
978 #ifdef CONFIG_SCHED_DEBUG 978 #ifdef CONFIG_SCHED_DEBUG
979 /* 979 /*
980 * We should never call set_task_cpu() on a blocked task, 980 * We should never call set_task_cpu() on a blocked task,
981 * ttwu() will sort out the placement. 981 * ttwu() will sort out the placement.
982 */ 982 */
983 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 983 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
984 !(task_preempt_count(p) & PREEMPT_ACTIVE)); 984 !(task_preempt_count(p) & PREEMPT_ACTIVE));
985 985
986 #ifdef CONFIG_LOCKDEP 986 #ifdef CONFIG_LOCKDEP
987 /* 987 /*
988 * The caller should hold either p->pi_lock or rq->lock, when changing 988 * The caller should hold either p->pi_lock or rq->lock, when changing
989 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. 989 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
990 * 990 *
991 * sched_move_task() holds both and thus holding either pins the cgroup, 991 * sched_move_task() holds both and thus holding either pins the cgroup,
992 * see task_group(). 992 * see task_group().
993 * 993 *
994 * Furthermore, all task_rq users should acquire both locks, see 994 * Furthermore, all task_rq users should acquire both locks, see
995 * task_rq_lock(). 995 * task_rq_lock().
996 */ 996 */
997 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || 997 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
998 lockdep_is_held(&task_rq(p)->lock))); 998 lockdep_is_held(&task_rq(p)->lock)));
999 #endif 999 #endif
1000 #endif 1000 #endif
1001 1001
1002 trace_sched_migrate_task(p, new_cpu); 1002 trace_sched_migrate_task(p, new_cpu);
1003 1003
1004 if (task_cpu(p) != new_cpu) { 1004 if (task_cpu(p) != new_cpu) {
1005 if (p->sched_class->migrate_task_rq) 1005 if (p->sched_class->migrate_task_rq)
1006 p->sched_class->migrate_task_rq(p, new_cpu); 1006 p->sched_class->migrate_task_rq(p, new_cpu);
1007 p->se.nr_migrations++; 1007 p->se.nr_migrations++;
1008 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); 1008 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
1009 } 1009 }
1010 1010
1011 __set_task_cpu(p, new_cpu); 1011 __set_task_cpu(p, new_cpu);
1012 } 1012 }
1013 1013
1014 static void __migrate_swap_task(struct task_struct *p, int cpu) 1014 static void __migrate_swap_task(struct task_struct *p, int cpu)
1015 { 1015 {
1016 if (p->on_rq) { 1016 if (p->on_rq) {
1017 struct rq *src_rq, *dst_rq; 1017 struct rq *src_rq, *dst_rq;
1018 1018
1019 src_rq = task_rq(p); 1019 src_rq = task_rq(p);
1020 dst_rq = cpu_rq(cpu); 1020 dst_rq = cpu_rq(cpu);
1021 1021
1022 deactivate_task(src_rq, p, 0); 1022 deactivate_task(src_rq, p, 0);
1023 set_task_cpu(p, cpu); 1023 set_task_cpu(p, cpu);
1024 activate_task(dst_rq, p, 0); 1024 activate_task(dst_rq, p, 0);
1025 check_preempt_curr(dst_rq, p, 0); 1025 check_preempt_curr(dst_rq, p, 0);
1026 } else { 1026 } else {
1027 /* 1027 /*
1028 * Task isn't running anymore; make it appear like we migrated 1028 * Task isn't running anymore; make it appear like we migrated
1029 * it before it went to sleep. This means on wakeup we make the 1029 * it before it went to sleep. This means on wakeup we make the
1030 * previous cpu our target instead of where it really is. 1030 * previous cpu our target instead of where it really is.
1031 */ 1031 */
1032 p->wake_cpu = cpu; 1032 p->wake_cpu = cpu;
1033 } 1033 }
1034 } 1034 }
1035 1035
1036 struct migration_swap_arg { 1036 struct migration_swap_arg {
1037 struct task_struct *src_task, *dst_task; 1037 struct task_struct *src_task, *dst_task;
1038 int src_cpu, dst_cpu; 1038 int src_cpu, dst_cpu;
1039 }; 1039 };
1040 1040
1041 static int migrate_swap_stop(void *data) 1041 static int migrate_swap_stop(void *data)
1042 { 1042 {
1043 struct migration_swap_arg *arg = data; 1043 struct migration_swap_arg *arg = data;
1044 struct rq *src_rq, *dst_rq; 1044 struct rq *src_rq, *dst_rq;
1045 int ret = -EAGAIN; 1045 int ret = -EAGAIN;
1046 1046
1047 src_rq = cpu_rq(arg->src_cpu); 1047 src_rq = cpu_rq(arg->src_cpu);
1048 dst_rq = cpu_rq(arg->dst_cpu); 1048 dst_rq = cpu_rq(arg->dst_cpu);
1049 1049
1050 double_raw_lock(&arg->src_task->pi_lock, 1050 double_raw_lock(&arg->src_task->pi_lock,
1051 &arg->dst_task->pi_lock); 1051 &arg->dst_task->pi_lock);
1052 double_rq_lock(src_rq, dst_rq); 1052 double_rq_lock(src_rq, dst_rq);
1053 if (task_cpu(arg->dst_task) != arg->dst_cpu) 1053 if (task_cpu(arg->dst_task) != arg->dst_cpu)
1054 goto unlock; 1054 goto unlock;
1055 1055
1056 if (task_cpu(arg->src_task) != arg->src_cpu) 1056 if (task_cpu(arg->src_task) != arg->src_cpu)
1057 goto unlock; 1057 goto unlock;
1058 1058
1059 if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task))) 1059 if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
1060 goto unlock; 1060 goto unlock;
1061 1061
1062 if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task))) 1062 if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
1063 goto unlock; 1063 goto unlock;
1064 1064
1065 __migrate_swap_task(arg->src_task, arg->dst_cpu); 1065 __migrate_swap_task(arg->src_task, arg->dst_cpu);
1066 __migrate_swap_task(arg->dst_task, arg->src_cpu); 1066 __migrate_swap_task(arg->dst_task, arg->src_cpu);
1067 1067
1068 ret = 0; 1068 ret = 0;
1069 1069
1070 unlock: 1070 unlock:
1071 double_rq_unlock(src_rq, dst_rq); 1071 double_rq_unlock(src_rq, dst_rq);
1072 raw_spin_unlock(&arg->dst_task->pi_lock); 1072 raw_spin_unlock(&arg->dst_task->pi_lock);
1073 raw_spin_unlock(&arg->src_task->pi_lock); 1073 raw_spin_unlock(&arg->src_task->pi_lock);
1074 1074
1075 return ret; 1075 return ret;
1076 } 1076 }
1077 1077
1078 /* 1078 /*
1079 * Cross migrate two tasks 1079 * Cross migrate two tasks
1080 */ 1080 */
1081 int migrate_swap(struct task_struct *cur, struct task_struct *p) 1081 int migrate_swap(struct task_struct *cur, struct task_struct *p)
1082 { 1082 {
1083 struct migration_swap_arg arg; 1083 struct migration_swap_arg arg;
1084 int ret = -EINVAL; 1084 int ret = -EINVAL;
1085 1085
1086 arg = (struct migration_swap_arg){ 1086 arg = (struct migration_swap_arg){
1087 .src_task = cur, 1087 .src_task = cur,
1088 .src_cpu = task_cpu(cur), 1088 .src_cpu = task_cpu(cur),
1089 .dst_task = p, 1089 .dst_task = p,
1090 .dst_cpu = task_cpu(p), 1090 .dst_cpu = task_cpu(p),
1091 }; 1091 };
1092 1092
1093 if (arg.src_cpu == arg.dst_cpu) 1093 if (arg.src_cpu == arg.dst_cpu)
1094 goto out; 1094 goto out;
1095 1095
1096 /* 1096 /*
1097 * These three tests are all lockless; this is OK since all of them 1097 * These three tests are all lockless; this is OK since all of them
1098 * will be re-checked with proper locks held further down the line. 1098 * will be re-checked with proper locks held further down the line.
1099 */ 1099 */
1100 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) 1100 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1101 goto out; 1101 goto out;
1102 1102
1103 if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task))) 1103 if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
1104 goto out; 1104 goto out;
1105 1105
1106 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) 1106 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
1107 goto out; 1107 goto out;
1108 1108
1109 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); 1109 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
1110 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); 1110 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1111 1111
1112 out: 1112 out:
1113 return ret; 1113 return ret;
1114 } 1114 }
1115 1115
1116 struct migration_arg { 1116 struct migration_arg {
1117 struct task_struct *task; 1117 struct task_struct *task;
1118 int dest_cpu; 1118 int dest_cpu;
1119 }; 1119 };
1120 1120
1121 static int migration_cpu_stop(void *data); 1121 static int migration_cpu_stop(void *data);
1122 1122
1123 /* 1123 /*
1124 * wait_task_inactive - wait for a thread to unschedule. 1124 * wait_task_inactive - wait for a thread to unschedule.
1125 * 1125 *
1126 * If @match_state is nonzero, it's the @p->state value just checked and 1126 * If @match_state is nonzero, it's the @p->state value just checked and
1127 * not expected to change. If it changes, i.e. @p might have woken up, 1127 * not expected to change. If it changes, i.e. @p might have woken up,
1128 * then return zero. When we succeed in waiting for @p to be off its CPU, 1128 * then return zero. When we succeed in waiting for @p to be off its CPU,
1129 * we return a positive number (its total switch count). If a second call 1129 * we return a positive number (its total switch count). If a second call
1130 * a short while later returns the same number, the caller can be sure that 1130 * a short while later returns the same number, the caller can be sure that
1131 * @p has remained unscheduled the whole time. 1131 * @p has remained unscheduled the whole time.
1132 * 1132 *
1133 * The caller must ensure that the task *will* unschedule sometime soon, 1133 * The caller must ensure that the task *will* unschedule sometime soon,
1134 * else this function might spin for a *long* time. This function can't 1134 * else this function might spin for a *long* time. This function can't
1135 * be called with interrupts off, or it may introduce deadlock with 1135 * be called with interrupts off, or it may introduce deadlock with
1136 * smp_call_function() if an IPI is sent by the same process we are 1136 * smp_call_function() if an IPI is sent by the same process we are
1137 * waiting to become inactive. 1137 * waiting to become inactive.
1138 */ 1138 */
1139 unsigned long wait_task_inactive(struct task_struct *p, long match_state) 1139 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1140 { 1140 {
1141 unsigned long flags; 1141 unsigned long flags;
1142 int running, on_rq; 1142 int running, on_rq;
1143 unsigned long ncsw; 1143 unsigned long ncsw;
1144 struct rq *rq; 1144 struct rq *rq;
1145 1145
1146 for (;;) { 1146 for (;;) {
1147 /* 1147 /*
1148 * We do the initial early heuristics without holding 1148 * We do the initial early heuristics without holding
1149 * any task-queue locks at all. We'll only try to get 1149 * any task-queue locks at all. We'll only try to get
1150 * the runqueue lock when things look like they will 1150 * the runqueue lock when things look like they will
1151 * work out! 1151 * work out!
1152 */ 1152 */
1153 rq = task_rq(p); 1153 rq = task_rq(p);
1154 1154
1155 /* 1155 /*
1156 * If the task is actively running on another CPU 1156 * If the task is actively running on another CPU
1157 * still, just relax and busy-wait without holding 1157 * still, just relax and busy-wait without holding
1158 * any locks. 1158 * any locks.
1159 * 1159 *
1160 * NOTE! Since we don't hold any locks, it's not 1160 * NOTE! Since we don't hold any locks, it's not
1161 * even sure that "rq" stays as the right runqueue! 1161 * even sure that "rq" stays as the right runqueue!
1162 * But we don't care, since "task_running()" will 1162 * But we don't care, since "task_running()" will
1163 * return false if the runqueue has changed and p 1163 * return false if the runqueue has changed and p
1164 * is actually now running somewhere else! 1164 * is actually now running somewhere else!
1165 */ 1165 */
1166 while (task_running(rq, p)) { 1166 while (task_running(rq, p)) {
1167 if (match_state && unlikely(p->state != match_state)) 1167 if (match_state && unlikely(p->state != match_state))
1168 return 0; 1168 return 0;
1169 cpu_relax(); 1169 cpu_relax();
1170 } 1170 }
1171 1171
1172 /* 1172 /*
1173 * Ok, time to look more closely! We need the rq 1173 * Ok, time to look more closely! We need the rq
1174 * lock now, to be *sure*. If we're wrong, we'll 1174 * lock now, to be *sure*. If we're wrong, we'll
1175 * just go back and repeat. 1175 * just go back and repeat.
1176 */ 1176 */
1177 rq = task_rq_lock(p, &flags); 1177 rq = task_rq_lock(p, &flags);
1178 trace_sched_wait_task(p); 1178 trace_sched_wait_task(p);
1179 running = task_running(rq, p); 1179 running = task_running(rq, p);
1180 on_rq = p->on_rq; 1180 on_rq = p->on_rq;
1181 ncsw = 0; 1181 ncsw = 0;
1182 if (!match_state || p->state == match_state) 1182 if (!match_state || p->state == match_state)
1183 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 1183 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
1184 task_rq_unlock(rq, p, &flags); 1184 task_rq_unlock(rq, p, &flags);
1185 1185
1186 /* 1186 /*
1187 * If it changed from the expected state, bail out now. 1187 * If it changed from the expected state, bail out now.
1188 */ 1188 */
1189 if (unlikely(!ncsw)) 1189 if (unlikely(!ncsw))
1190 break; 1190 break;
1191 1191
1192 /* 1192 /*
1193 * Was it really running after all now that we 1193 * Was it really running after all now that we
1194 * checked with the proper locks actually held? 1194 * checked with the proper locks actually held?
1195 * 1195 *
1196 * Oops. Go back and try again.. 1196 * Oops. Go back and try again..
1197 */ 1197 */
1198 if (unlikely(running)) { 1198 if (unlikely(running)) {
1199 cpu_relax(); 1199 cpu_relax();
1200 continue; 1200 continue;
1201 } 1201 }
1202 1202
1203 /* 1203 /*
1204 * It's not enough that it's not actively running, 1204 * It's not enough that it's not actively running,
1205 * it must be off the runqueue _entirely_, and not 1205 * it must be off the runqueue _entirely_, and not
1206 * preempted! 1206 * preempted!
1207 * 1207 *
1208 * So if it was still runnable (but just not actively 1208 * So if it was still runnable (but just not actively
1209 * running right now), it's preempted, and we should 1209 * running right now), it's preempted, and we should
1210 * yield - it could be a while. 1210 * yield - it could be a while.
1211 */ 1211 */
1212 if (unlikely(on_rq)) { 1212 if (unlikely(on_rq)) {
1213 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); 1213 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1214 1214
1215 set_current_state(TASK_UNINTERRUPTIBLE); 1215 set_current_state(TASK_UNINTERRUPTIBLE);
1216 schedule_hrtimeout(&to, HRTIMER_MODE_REL); 1216 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1217 continue; 1217 continue;
1218 } 1218 }
1219 1219
1220 /* 1220 /*
1221 * Ahh, all good. It wasn't running, and it wasn't 1221 * Ahh, all good. It wasn't running, and it wasn't
1222 * runnable, which means that it will never become 1222 * runnable, which means that it will never become
1223 * running in the future either. We're all done! 1223 * running in the future either. We're all done!
1224 */ 1224 */
1225 break; 1225 break;
1226 } 1226 }
1227 1227
1228 return ncsw; 1228 return ncsw;
1229 } 1229 }
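The p->nvcsw | LONG_MIN above (the "sets MSB" comment) makes the success return non-zero even when the context-switch count is still 0, while two tagged values from successive calls still compare equal iff the underlying counts do. A minimal sketch of that property:

/* Sketch: tag a counter with the sign bit so 0 is reserved for failure. */
static unsigned long tag_switch_count(unsigned long nvcsw)
{
	return nvcsw | LONG_MIN;	/* MSB set -> never reads as 0 */
}
/* tag_switch_count(0) != 0, and for realistic counts (MSB clear)
 * tag_switch_count(a) == tag_switch_count(b) iff a == b. */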
1230 1230
1231 /*** 1231 /***
1232 * kick_process - kick a running thread to enter/exit the kernel 1232 * kick_process - kick a running thread to enter/exit the kernel
1233 * @p: the to-be-kicked thread 1233 * @p: the to-be-kicked thread
1234 * 1234 *
1235 * Cause a process which is running on another CPU to enter 1235 * Cause a process which is running on another CPU to enter
1236 * kernel-mode, without any delay. (to get signals handled.) 1236 * kernel-mode, without any delay. (to get signals handled.)
1237 * 1237 *
1238 * NOTE: this function doesn't have to take the runqueue lock, 1238 * NOTE: this function doesn't have to take the runqueue lock,
1239 * because all it wants to ensure is that the remote task enters 1239 * because all it wants to ensure is that the remote task enters
1240 * the kernel. If the IPI races and the task has been migrated 1240 * the kernel. If the IPI races and the task has been migrated
1241 * to another CPU then no harm is done and the purpose has been 1241 * to another CPU then no harm is done and the purpose has been
1242 * achieved as well. 1242 * achieved as well.
1243 */ 1243 */
1244 void kick_process(struct task_struct *p) 1244 void kick_process(struct task_struct *p)
1245 { 1245 {
1246 int cpu; 1246 int cpu;
1247 1247
1248 preempt_disable(); 1248 preempt_disable();
1249 cpu = task_cpu(p); 1249 cpu = task_cpu(p);
1250 if ((cpu != smp_processor_id()) && task_curr(p)) 1250 if ((cpu != smp_processor_id()) && task_curr(p))
1251 smp_send_reschedule(cpu); 1251 smp_send_reschedule(cpu);
1252 preempt_enable(); 1252 preempt_enable();
1253 } 1253 }
1254 EXPORT_SYMBOL_GPL(kick_process); 1254 EXPORT_SYMBOL_GPL(kick_process);
1255 #endif /* CONFIG_SMP */ 1255 #endif /* CONFIG_SMP */
1256 1256
1257 #ifdef CONFIG_SMP 1257 #ifdef CONFIG_SMP
1258 /* 1258 /*
1259 * ->cpus_allowed is protected by both rq->lock and p->pi_lock 1259 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
1260 */ 1260 */
1261 static int select_fallback_rq(int cpu, struct task_struct *p) 1261 static int select_fallback_rq(int cpu, struct task_struct *p)
1262 { 1262 {
1263 int nid = cpu_to_node(cpu); 1263 int nid = cpu_to_node(cpu);
1264 const struct cpumask *nodemask = NULL; 1264 const struct cpumask *nodemask = NULL;
1265 enum { cpuset, possible, fail } state = cpuset; 1265 enum { cpuset, possible, fail } state = cpuset;
1266 int dest_cpu; 1266 int dest_cpu;
1267 1267
1268 /* 1268 /*
1269 * If the node that the cpu is on has been offlined, cpu_to_node() 1269 * If the node that the cpu is on has been offlined, cpu_to_node()
1270 * will return -1. There is no cpu on the node, and we should 1270 * will return -1. There is no cpu on the node, and we should
1271 * select a cpu on another node. 1271 * select a cpu on another node.
1272 */ 1272 */
1273 if (nid != -1) { 1273 if (nid != -1) {
1274 nodemask = cpumask_of_node(nid); 1274 nodemask = cpumask_of_node(nid);
1275 1275
1276 /* Look for allowed, online CPU in same node. */ 1276 /* Look for allowed, online CPU in same node. */
1277 for_each_cpu(dest_cpu, nodemask) { 1277 for_each_cpu(dest_cpu, nodemask) {
1278 if (!cpu_online(dest_cpu)) 1278 if (!cpu_online(dest_cpu))
1279 continue; 1279 continue;
1280 if (!cpu_active(dest_cpu)) 1280 if (!cpu_active(dest_cpu))
1281 continue; 1281 continue;
1282 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 1282 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1283 return dest_cpu; 1283 return dest_cpu;
1284 } 1284 }
1285 } 1285 }
1286 1286
1287 for (;;) { 1287 for (;;) {
1288 /* Any allowed, online CPU? */ 1288 /* Any allowed, online CPU? */
1289 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { 1289 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
1290 if (!cpu_online(dest_cpu)) 1290 if (!cpu_online(dest_cpu))
1291 continue; 1291 continue;
1292 if (!cpu_active(dest_cpu)) 1292 if (!cpu_active(dest_cpu))
1293 continue; 1293 continue;
1294 goto out; 1294 goto out;
1295 } 1295 }
1296 1296
1297 switch (state) { 1297 switch (state) {
1298 case cpuset: 1298 case cpuset:
1299 /* No more Mr. Nice Guy. */ 1299 /* No more Mr. Nice Guy. */
1300 cpuset_cpus_allowed_fallback(p); 1300 cpuset_cpus_allowed_fallback(p);
1301 state = possible; 1301 state = possible;
1302 break; 1302 break;
1303 1303
1304 case possible: 1304 case possible:
1305 do_set_cpus_allowed(p, cpu_possible_mask); 1305 do_set_cpus_allowed(p, cpu_possible_mask);
1306 state = fail; 1306 state = fail;
1307 break; 1307 break;
1308 1308
1309 case fail: 1309 case fail:
1310 BUG(); 1310 BUG();
1311 break; 1311 break;
1312 } 1312 }
1313 } 1313 }
1314 1314
1315 out: 1315 out:
1316 if (state != cpuset) { 1316 if (state != cpuset) {
1317 /* 1317 /*
1318 * Don't tell them about moving exiting tasks or 1318 * Don't tell them about moving exiting tasks or
1319 * kernel threads (both mm NULL), since they never 1319 * kernel threads (both mm NULL), since they never
1320 * leave the kernel. 1320 * leave the kernel.
1321 */ 1321 */
1322 if (p->mm && printk_ratelimit()) { 1322 if (p->mm && printk_ratelimit()) {
1323 printk_sched("process %d (%s) no longer affine to cpu%d\n", 1323 printk_sched("process %d (%s) no longer affine to cpu%d\n",
1324 task_pid_nr(p), p->comm, cpu); 1324 task_pid_nr(p), p->comm, cpu);
1325 } 1325 }
1326 } 1326 }
1327 1327
1328 return dest_cpu; 1328 return dest_cpu;
1329 } 1329 }
1330 1330
1331 /* 1331 /*
1332 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 1332 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
1333 */ 1333 */
1334 static inline 1334 static inline
1335 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 1335 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1336 { 1336 {
1337 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 1337 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1338 1338
1339 /* 1339 /*
1340 * In order not to call set_task_cpu() on a blocking task we need 1340 * In order not to call set_task_cpu() on a blocking task we need
1341 * to rely on ttwu() to place the task on a valid ->cpus_allowed 1341 * to rely on ttwu() to place the task on a valid ->cpus_allowed
1342 * cpu. 1342 * cpu.
1343 * 1343 *
1344 * Since this is common to all placement strategies, this lives here. 1344 * Since this is common to all placement strategies, this lives here.
1345 * 1345 *
1346 * [ this allows ->select_task_rq() to simply return task_cpu(p) and 1346 * [ this allows ->select_task_rq() to simply return task_cpu(p) and
1347 * not worry about this generic constraint ] 1347 * not worry about this generic constraint ]
1348 */ 1348 */
1349 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) || 1349 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
1350 !cpu_online(cpu))) 1350 !cpu_online(cpu)))
1351 cpu = select_fallback_rq(task_cpu(p), p); 1351 cpu = select_fallback_rq(task_cpu(p), p);
1352 1352
1353 return cpu; 1353 return cpu;
1354 } 1354 }
1355 1355
1356 static void update_avg(u64 *avg, u64 sample) 1356 static void update_avg(u64 *avg, u64 sample)
1357 { 1357 {
1358 s64 diff = sample - *avg; 1358 s64 diff = sample - *avg;
1359 *avg += diff >> 3; 1359 *avg += diff >> 3;
1360 } 1360 }
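update_avg() above is a fixed-point exponential moving average with weight 1/8 (the >> 3). A quick standalone check of that arithmetic, reusing the same two lines:

#include <stdint.h>
#include <stdio.h>

/*
 * Same arithmetic as update_avg() above: avg += (sample - avg) >> 3,
 * an exponential moving average that moves 1/8 of the way toward each
 * new sample (an arithmetic right shift on negative diffs is assumed,
 * exactly as the kernel assumes).
 */
static void update_avg(uint64_t *avg, uint64_t sample)
{
        int64_t diff = sample - *avg;

        *avg += diff >> 3;
}

int main(void)
{
        uint64_t samples[] = { 800, 800, 800, 80, 80 };
        uint64_t avg = 0;

        for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                update_avg(&avg, samples[i]);
                printf("sample=%3llu avg=%llu\n",
                       (unsigned long long)samples[i],
                       (unsigned long long)avg);
        }
        return 0;
}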
1361 #endif 1361 #endif
1362 1362
1363 static void 1363 static void
1364 ttwu_stat(struct task_struct *p, int cpu, int wake_flags) 1364 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1365 { 1365 {
1366 #ifdef CONFIG_SCHEDSTATS 1366 #ifdef CONFIG_SCHEDSTATS
1367 struct rq *rq = this_rq(); 1367 struct rq *rq = this_rq();
1368 1368
1369 #ifdef CONFIG_SMP 1369 #ifdef CONFIG_SMP
1370 int this_cpu = smp_processor_id(); 1370 int this_cpu = smp_processor_id();
1371 1371
1372 if (cpu == this_cpu) { 1372 if (cpu == this_cpu) {
1373 schedstat_inc(rq, ttwu_local); 1373 schedstat_inc(rq, ttwu_local);
1374 schedstat_inc(p, se.statistics.nr_wakeups_local); 1374 schedstat_inc(p, se.statistics.nr_wakeups_local);
1375 } else { 1375 } else {
1376 struct sched_domain *sd; 1376 struct sched_domain *sd;
1377 1377
1378 schedstat_inc(p, se.statistics.nr_wakeups_remote); 1378 schedstat_inc(p, se.statistics.nr_wakeups_remote);
1379 rcu_read_lock(); 1379 rcu_read_lock();
1380 for_each_domain(this_cpu, sd) { 1380 for_each_domain(this_cpu, sd) {
1381 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 1381 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1382 schedstat_inc(sd, ttwu_wake_remote); 1382 schedstat_inc(sd, ttwu_wake_remote);
1383 break; 1383 break;
1384 } 1384 }
1385 } 1385 }
1386 rcu_read_unlock(); 1386 rcu_read_unlock();
1387 } 1387 }
1388 1388
1389 if (wake_flags & WF_MIGRATED) 1389 if (wake_flags & WF_MIGRATED)
1390 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 1390 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
1391 1391
1392 #endif /* CONFIG_SMP */ 1392 #endif /* CONFIG_SMP */
1393 1393
1394 schedstat_inc(rq, ttwu_count); 1394 schedstat_inc(rq, ttwu_count);
1395 schedstat_inc(p, se.statistics.nr_wakeups); 1395 schedstat_inc(p, se.statistics.nr_wakeups);
1396 1396
1397 if (wake_flags & WF_SYNC) 1397 if (wake_flags & WF_SYNC)
1398 schedstat_inc(p, se.statistics.nr_wakeups_sync); 1398 schedstat_inc(p, se.statistics.nr_wakeups_sync);
1399 1399
1400 #endif /* CONFIG_SCHEDSTATS */ 1400 #endif /* CONFIG_SCHEDSTATS */
1401 } 1401 }
1402 1402
1403 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 1403 static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1404 { 1404 {
1405 activate_task(rq, p, en_flags); 1405 activate_task(rq, p, en_flags);
1406 p->on_rq = 1; 1406 p->on_rq = 1;
1407 1407
1408 /* if a worker is waking up, notify workqueue */ 1408 /* if a worker is waking up, notify workqueue */
1409 if (p->flags & PF_WQ_WORKER) 1409 if (p->flags & PF_WQ_WORKER)
1410 wq_worker_waking_up(p, cpu_of(rq)); 1410 wq_worker_waking_up(p, cpu_of(rq));
1411 } 1411 }
1412 1412
1413 /* 1413 /*
1414 * Mark the task runnable and perform wakeup-preemption. 1414 * Mark the task runnable and perform wakeup-preemption.
1415 */ 1415 */
1416 static void 1416 static void
1417 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 1417 ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1418 { 1418 {
1419 check_preempt_curr(rq, p, wake_flags); 1419 check_preempt_curr(rq, p, wake_flags);
1420 trace_sched_wakeup(p, true); 1420 trace_sched_wakeup(p, true);
1421 1421
1422 p->state = TASK_RUNNING; 1422 p->state = TASK_RUNNING;
1423 #ifdef CONFIG_SMP 1423 #ifdef CONFIG_SMP
1424 if (p->sched_class->task_woken) 1424 if (p->sched_class->task_woken)
1425 p->sched_class->task_woken(rq, p); 1425 p->sched_class->task_woken(rq, p);
1426 1426
1427 if (rq->idle_stamp) { 1427 if (rq->idle_stamp) {
1428 u64 delta = rq_clock(rq) - rq->idle_stamp; 1428 u64 delta = rq_clock(rq) - rq->idle_stamp;
1429 u64 max = 2*rq->max_idle_balance_cost; 1429 u64 max = 2*rq->max_idle_balance_cost;
1430 1430
1431 update_avg(&rq->avg_idle, delta); 1431 update_avg(&rq->avg_idle, delta);
1432 1432
1433 if (rq->avg_idle > max) 1433 if (rq->avg_idle > max)
1434 rq->avg_idle = max; 1434 rq->avg_idle = max;
1435 1435
1436 rq->idle_stamp = 0; 1436 rq->idle_stamp = 0;
1437 } 1437 }
1438 #endif 1438 #endif
1439 } 1439 }
1440 1440
1441 static void 1441 static void
1442 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) 1442 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
1443 { 1443 {
1444 #ifdef CONFIG_SMP 1444 #ifdef CONFIG_SMP
1445 if (p->sched_contributes_to_load) 1445 if (p->sched_contributes_to_load)
1446 rq->nr_uninterruptible--; 1446 rq->nr_uninterruptible--;
1447 #endif 1447 #endif
1448 1448
1449 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); 1449 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
1450 ttwu_do_wakeup(rq, p, wake_flags); 1450 ttwu_do_wakeup(rq, p, wake_flags);
1451 } 1451 }
1452 1452
1453 /* 1453 /*
1454 * Called in case the task @p isn't fully descheduled from its runqueue; 1454 * Called in case the task @p isn't fully descheduled from its runqueue;
1455 * in this case we must do a remote wakeup. It's a 'light' wakeup though, 1455 * in this case we must do a remote wakeup. It's a 'light' wakeup though,
1456 * since all we need to do is flip p->state to TASK_RUNNING; the task 1456 * since all we need to do is flip p->state to TASK_RUNNING; the task
1457 * is still ->on_rq. 1457 * is still ->on_rq.
1458 */ 1458 */
1459 static int ttwu_remote(struct task_struct *p, int wake_flags) 1459 static int ttwu_remote(struct task_struct *p, int wake_flags)
1460 { 1460 {
1461 struct rq *rq; 1461 struct rq *rq;
1462 int ret = 0; 1462 int ret = 0;
1463 1463
1464 rq = __task_rq_lock(p); 1464 rq = __task_rq_lock(p);
1465 if (p->on_rq) { 1465 if (p->on_rq) {
1466 /* check_preempt_curr() may use rq clock */ 1466 /* check_preempt_curr() may use rq clock */
1467 update_rq_clock(rq); 1467 update_rq_clock(rq);
1468 ttwu_do_wakeup(rq, p, wake_flags); 1468 ttwu_do_wakeup(rq, p, wake_flags);
1469 ret = 1; 1469 ret = 1;
1470 } 1470 }
1471 __task_rq_unlock(rq); 1471 __task_rq_unlock(rq);
1472 1472
1473 return ret; 1473 return ret;
1474 } 1474 }
1475 1475
1476 #ifdef CONFIG_SMP 1476 #ifdef CONFIG_SMP
1477 static void sched_ttwu_pending(void) 1477 static void sched_ttwu_pending(void)
1478 { 1478 {
1479 struct rq *rq = this_rq(); 1479 struct rq *rq = this_rq();
1480 struct llist_node *llist = llist_del_all(&rq->wake_list); 1480 struct llist_node *llist = llist_del_all(&rq->wake_list);
1481 struct task_struct *p; 1481 struct task_struct *p;
1482 1482
1483 raw_spin_lock(&rq->lock); 1483 raw_spin_lock(&rq->lock);
1484 1484
1485 while (llist) { 1485 while (llist) {
1486 p = llist_entry(llist, struct task_struct, wake_entry); 1486 p = llist_entry(llist, struct task_struct, wake_entry);
1487 llist = llist_next(llist); 1487 llist = llist_next(llist);
1488 ttwu_do_activate(rq, p, 0); 1488 ttwu_do_activate(rq, p, 0);
1489 } 1489 }
1490 1490
1491 raw_spin_unlock(&rq->lock); 1491 raw_spin_unlock(&rq->lock);
1492 } 1492 }
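rq->wake_list is a lock-free llist: remote CPUs push entries with llist_add() (see ttwu_queue_remote() below) and the IPI handler drains the whole batch with a single llist_del_all(), as above. The following is a rough user-space analogue of that push/drain pattern using C11 atomics; it is a sketch of the idea, not the kernel's llist implementation, and it omits the "was the list empty?" return value that llist_add() uses to decide whether an IPI is needed.

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

/*
 * Rough analogue of the rq->wake_list pattern: producers push with a
 * compare-and-swap (llist_add), the consumer grabs the whole list with one
 * atomic exchange (llist_del_all).  node/push/del_all are made-up names.
 */
struct node {
        struct node *next;
        int val;
};

static _Atomic(struct node *) wake_list;

static void push(struct node *n)                /* ~ llist_add() */
{
        struct node *old = atomic_load(&wake_list);

        do {
                n->next = old;
        } while (!atomic_compare_exchange_weak(&wake_list, &old, n));
}

static struct node *del_all(void)               /* ~ llist_del_all() */
{
        return atomic_exchange(&wake_list, NULL);
}

int main(void)
{
        struct node a = { .val = 1 }, b = { .val = 2 };

        push(&a);
        push(&b);

        /* drain once and walk the batch, most recently pushed entry first */
        for (struct node *n = del_all(); n; n = n->next)
                printf("waking %d\n", n->val);
        return 0;
}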
1493 1493
1494 void scheduler_ipi(void) 1494 void scheduler_ipi(void)
1495 { 1495 {
1496 /* 1496 /*
1497 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting 1497 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
1498 * TIF_NEED_RESCHED remotely (for the first time) will also send 1498 * TIF_NEED_RESCHED remotely (for the first time) will also send
1499 * this IPI. 1499 * this IPI.
1500 */ 1500 */
1501 preempt_fold_need_resched(); 1501 preempt_fold_need_resched();
1502 1502
1503 if (llist_empty(&this_rq()->wake_list) 1503 if (llist_empty(&this_rq()->wake_list)
1504 && !tick_nohz_full_cpu(smp_processor_id()) 1504 && !tick_nohz_full_cpu(smp_processor_id())
1505 && !got_nohz_idle_kick()) 1505 && !got_nohz_idle_kick())
1506 return; 1506 return;
1507 1507
1508 /* 1508 /*
1509 * Not all reschedule IPI handlers call irq_enter/irq_exit, since 1509 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
1510 * traditionally all their work was done from the interrupt return 1510 * traditionally all their work was done from the interrupt return
1511 * path. Now that we actually do some work, we need to make sure 1511 * path. Now that we actually do some work, we need to make sure
1512 * we do call them. 1512 * we do call them.
1513 * 1513 *
1514 * Some archs already do call them, luckily irq_enter/exit nest 1514 * Some archs already do call them, luckily irq_enter/exit nest
1515 * properly. 1515 * properly.
1516 * 1516 *
1517 * Arguably we should visit all archs and update all handlers, 1517 * Arguably we should visit all archs and update all handlers,
1518 * however a fair share of IPIs are still resched-only, so this would 1518 * however a fair share of IPIs are still resched-only, so this would
1519 * somewhat pessimize the simple resched case. 1519 * somewhat pessimize the simple resched case.
1520 */ 1520 */
1521 irq_enter(); 1521 irq_enter();
1522 tick_nohz_full_check(); 1522 tick_nohz_full_check();
1523 sched_ttwu_pending(); 1523 sched_ttwu_pending();
1524 1524
1525 /* 1525 /*
1526 * Check if someone kicked us for doing the nohz idle load balance. 1526 * Check if someone kicked us for doing the nohz idle load balance.
1527 */ 1527 */
1528 if (unlikely(got_nohz_idle_kick())) { 1528 if (unlikely(got_nohz_idle_kick())) {
1529 this_rq()->idle_balance = 1; 1529 this_rq()->idle_balance = 1;
1530 raise_softirq_irqoff(SCHED_SOFTIRQ); 1530 raise_softirq_irqoff(SCHED_SOFTIRQ);
1531 } 1531 }
1532 irq_exit(); 1532 irq_exit();
1533 } 1533 }
1534 1534
1535 static void ttwu_queue_remote(struct task_struct *p, int cpu) 1535 static void ttwu_queue_remote(struct task_struct *p, int cpu)
1536 { 1536 {
1537 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) 1537 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
1538 smp_send_reschedule(cpu); 1538 smp_send_reschedule(cpu);
1539 } 1539 }
1540 1540
1541 bool cpus_share_cache(int this_cpu, int that_cpu) 1541 bool cpus_share_cache(int this_cpu, int that_cpu)
1542 { 1542 {
1543 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 1543 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1544 } 1544 }
1545 #endif /* CONFIG_SMP */ 1545 #endif /* CONFIG_SMP */
1546 1546
1547 static void ttwu_queue(struct task_struct *p, int cpu) 1547 static void ttwu_queue(struct task_struct *p, int cpu)
1548 { 1548 {
1549 struct rq *rq = cpu_rq(cpu); 1549 struct rq *rq = cpu_rq(cpu);
1550 1550
1551 #if defined(CONFIG_SMP) 1551 #if defined(CONFIG_SMP)
1552 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { 1552 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1553 sched_clock_cpu(cpu); /* sync clocks x-cpu */ 1553 sched_clock_cpu(cpu); /* sync clocks x-cpu */
1554 ttwu_queue_remote(p, cpu); 1554 ttwu_queue_remote(p, cpu);
1555 return; 1555 return;
1556 } 1556 }
1557 #endif 1557 #endif
1558 1558
1559 raw_spin_lock(&rq->lock); 1559 raw_spin_lock(&rq->lock);
1560 ttwu_do_activate(rq, p, 0); 1560 ttwu_do_activate(rq, p, 0);
1561 raw_spin_unlock(&rq->lock); 1561 raw_spin_unlock(&rq->lock);
1562 } 1562 }
1563 1563
1564 /** 1564 /**
1565 * try_to_wake_up - wake up a thread 1565 * try_to_wake_up - wake up a thread
1566 * @p: the thread to be awakened 1566 * @p: the thread to be awakened
1567 * @state: the mask of task states that can be woken 1567 * @state: the mask of task states that can be woken
1568 * @wake_flags: wake modifier flags (WF_*) 1568 * @wake_flags: wake modifier flags (WF_*)
1569 * 1569 *
1570 * Put it on the run-queue if it's not already there. The "current" 1570 * Put it on the run-queue if it's not already there. The "current"
1571 * thread is always on the run-queue (except when the actual 1571 * thread is always on the run-queue (except when the actual
1572 * re-schedule is in progress), and as such you're allowed to do 1572 * re-schedule is in progress), and as such you're allowed to do
1573 * the simpler "current->state = TASK_RUNNING" to mark yourself 1573 * the simpler "current->state = TASK_RUNNING" to mark yourself
1574 * runnable without the overhead of this. 1574 * runnable without the overhead of this.
1575 * 1575 *
1576 * Return: %true if @p was woken up, %false if it was already running, 1576 * Return: %true if @p was woken up, %false if it was already running,
1577 * or @state didn't match @p's state. 1577 * or @state didn't match @p's state.
1578 */ 1578 */
1579 static int 1579 static int
1580 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) 1580 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1581 { 1581 {
1582 unsigned long flags; 1582 unsigned long flags;
1583 int cpu, success = 0; 1583 int cpu, success = 0;
1584 1584
1585 /* 1585 /*
1586 * If we are going to wake up a thread waiting for CONDITION we 1586 * If we are going to wake up a thread waiting for CONDITION we
1587 * need to ensure that CONDITION=1 done by the caller can not be 1587 * need to ensure that CONDITION=1 done by the caller can not be
1588 * reordered with p->state check below. This pairs with mb() in 1588 * reordered with p->state check below. This pairs with mb() in
1589 * set_current_state() the waiting thread does. 1589 * set_current_state() the waiting thread does.
1590 */ 1590 */
1591 smp_mb__before_spinlock(); 1591 smp_mb__before_spinlock();
1592 raw_spin_lock_irqsave(&p->pi_lock, flags); 1592 raw_spin_lock_irqsave(&p->pi_lock, flags);
1593 if (!(p->state & state)) 1593 if (!(p->state & state))
1594 goto out; 1594 goto out;
1595 1595
1596 success = 1; /* we're going to change ->state */ 1596 success = 1; /* we're going to change ->state */
1597 cpu = task_cpu(p); 1597 cpu = task_cpu(p);
1598 1598
1599 if (p->on_rq && ttwu_remote(p, wake_flags)) 1599 if (p->on_rq && ttwu_remote(p, wake_flags))
1600 goto stat; 1600 goto stat;
1601 1601
1602 #ifdef CONFIG_SMP 1602 #ifdef CONFIG_SMP
1603 /* 1603 /*
1604 * If the owning (remote) cpu is still in the middle of schedule() with 1604 * If the owning (remote) cpu is still in the middle of schedule() with
1605 * this task as prev, wait until it's done referencing the task. 1605 * this task as prev, wait until it's done referencing the task.
1606 */ 1606 */
1607 while (p->on_cpu) 1607 while (p->on_cpu)
1608 cpu_relax(); 1608 cpu_relax();
1609 /* 1609 /*
1610 * Pairs with the smp_wmb() in finish_lock_switch(). 1610 * Pairs with the smp_wmb() in finish_lock_switch().
1611 */ 1611 */
1612 smp_rmb(); 1612 smp_rmb();
1613 1613
1614 p->sched_contributes_to_load = !!task_contributes_to_load(p); 1614 p->sched_contributes_to_load = !!task_contributes_to_load(p);
1615 p->state = TASK_WAKING; 1615 p->state = TASK_WAKING;
1616 1616
1617 if (p->sched_class->task_waking) 1617 if (p->sched_class->task_waking)
1618 p->sched_class->task_waking(p); 1618 p->sched_class->task_waking(p);
1619 1619
1620 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); 1620 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
1621 if (task_cpu(p) != cpu) { 1621 if (task_cpu(p) != cpu) {
1622 wake_flags |= WF_MIGRATED; 1622 wake_flags |= WF_MIGRATED;
1623 set_task_cpu(p, cpu); 1623 set_task_cpu(p, cpu);
1624 } 1624 }
1625 #endif /* CONFIG_SMP */ 1625 #endif /* CONFIG_SMP */
1626 1626
1627 ttwu_queue(p, cpu); 1627 ttwu_queue(p, cpu);
1628 stat: 1628 stat:
1629 ttwu_stat(p, cpu, wake_flags); 1629 ttwu_stat(p, cpu, wake_flags);
1630 out: 1630 out:
1631 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 1631 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1632 1632
1633 return success; 1633 return success;
1634 } 1634 }
1635 1635
1636 /** 1636 /**
1637 * try_to_wake_up_local - try to wake up a local task with rq lock held 1637 * try_to_wake_up_local - try to wake up a local task with rq lock held
1638 * @p: the thread to be awakened 1638 * @p: the thread to be awakened
1639 * 1639 *
1640 * Put @p on the run-queue if it's not already there. The caller must 1640 * Put @p on the run-queue if it's not already there. The caller must
1641 * ensure that this_rq() is locked, @p is bound to this_rq() and not 1641 * ensure that this_rq() is locked, @p is bound to this_rq() and not
1642 * the current task. 1642 * the current task.
1643 */ 1643 */
1644 static void try_to_wake_up_local(struct task_struct *p) 1644 static void try_to_wake_up_local(struct task_struct *p)
1645 { 1645 {
1646 struct rq *rq = task_rq(p); 1646 struct rq *rq = task_rq(p);
1647 1647
1648 if (WARN_ON_ONCE(rq != this_rq()) || 1648 if (WARN_ON_ONCE(rq != this_rq()) ||
1649 WARN_ON_ONCE(p == current)) 1649 WARN_ON_ONCE(p == current))
1650 return; 1650 return;
1651 1651
1652 lockdep_assert_held(&rq->lock); 1652 lockdep_assert_held(&rq->lock);
1653 1653
1654 if (!raw_spin_trylock(&p->pi_lock)) { 1654 if (!raw_spin_trylock(&p->pi_lock)) {
1655 raw_spin_unlock(&rq->lock); 1655 raw_spin_unlock(&rq->lock);
1656 raw_spin_lock(&p->pi_lock); 1656 raw_spin_lock(&p->pi_lock);
1657 raw_spin_lock(&rq->lock); 1657 raw_spin_lock(&rq->lock);
1658 } 1658 }
1659 1659
1660 if (!(p->state & TASK_NORMAL)) 1660 if (!(p->state & TASK_NORMAL))
1661 goto out; 1661 goto out;
1662 1662
1663 if (!p->on_rq) 1663 if (!p->on_rq)
1664 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 1664 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1665 1665
1666 ttwu_do_wakeup(rq, p, 0); 1666 ttwu_do_wakeup(rq, p, 0);
1667 ttwu_stat(p, smp_processor_id(), 0); 1667 ttwu_stat(p, smp_processor_id(), 0);
1668 out: 1668 out:
1669 raw_spin_unlock(&p->pi_lock); 1669 raw_spin_unlock(&p->pi_lock);
1670 } 1670 }
1671 1671
1672 /** 1672 /**
1673 * wake_up_process - Wake up a specific process 1673 * wake_up_process - Wake up a specific process
1674 * @p: The process to be woken up. 1674 * @p: The process to be woken up.
1675 * 1675 *
1676 * Attempt to wake up the nominated process and move it to the set of runnable 1676 * Attempt to wake up the nominated process and move it to the set of runnable
1677 * processes. 1677 * processes.
1678 * 1678 *
1679 * Return: 1 if the process was woken up, 0 if it was already running. 1679 * Return: 1 if the process was woken up, 0 if it was already running.
1680 * 1680 *
1681 * It may be assumed that this function implies a write memory barrier before 1681 * It may be assumed that this function implies a write memory barrier before
1682 * changing the task state if and only if any tasks are woken up. 1682 * changing the task state if and only if any tasks are woken up.
1683 */ 1683 */
1684 int wake_up_process(struct task_struct *p) 1684 int wake_up_process(struct task_struct *p)
1685 { 1685 {
1686 WARN_ON(task_is_stopped_or_traced(p)); 1686 WARN_ON(task_is_stopped_or_traced(p));
1687 return try_to_wake_up(p, TASK_NORMAL, 0); 1687 return try_to_wake_up(p, TASK_NORMAL, 0);
1688 } 1688 }
1689 EXPORT_SYMBOL(wake_up_process); 1689 EXPORT_SYMBOL(wake_up_process);
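The barrier pairing described inside try_to_wake_up() exists to serve the canonical sleep/wake idiom sketched below. This is kernel-context pseudocode rather than a standalone program; CONDITION and sleeper_task are placeholders.

/*
 * Canonical sleeper/waker pairing served by the barriers discussed in
 * try_to_wake_up() above (kernel context, not standalone; CONDITION and
 * sleeper_task are placeholders).
 */

/* sleeper */
for (;;) {
        set_current_state(TASK_INTERRUPTIBLE);
        if (CONDITION)
                break;
        schedule();
}
__set_current_state(TASK_RUNNING);

/* waker */
CONDITION = 1;                          /* store must be visible before the wakeup */
wake_up_process(sleeper_task);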
1690 1690
1691 int wake_up_state(struct task_struct *p, unsigned int state) 1691 int wake_up_state(struct task_struct *p, unsigned int state)
1692 { 1692 {
1693 return try_to_wake_up(p, state, 0); 1693 return try_to_wake_up(p, state, 0);
1694 } 1694 }
1695 1695
1696 /* 1696 /*
1697 * Perform scheduler related setup for a newly forked process p. 1697 * Perform scheduler related setup for a newly forked process p.
1698 * p is forked by current. 1698 * p is forked by current.
1699 * 1699 *
1700 * __sched_fork() is basic setup used by init_idle() too: 1700 * __sched_fork() is basic setup used by init_idle() too:
1701 */ 1701 */
1702 static void __sched_fork(unsigned long clone_flags, struct task_struct *p) 1702 static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1703 { 1703 {
1704 p->on_rq = 0; 1704 p->on_rq = 0;
1705 1705
1706 p->se.on_rq = 0; 1706 p->se.on_rq = 0;
1707 p->se.exec_start = 0; 1707 p->se.exec_start = 0;
1708 p->se.sum_exec_runtime = 0; 1708 p->se.sum_exec_runtime = 0;
1709 p->se.prev_sum_exec_runtime = 0; 1709 p->se.prev_sum_exec_runtime = 0;
1710 p->se.nr_migrations = 0; 1710 p->se.nr_migrations = 0;
1711 p->se.vruntime = 0; 1711 p->se.vruntime = 0;
1712 INIT_LIST_HEAD(&p->se.group_node); 1712 INIT_LIST_HEAD(&p->se.group_node);
1713 1713
1714 #ifdef CONFIG_SCHEDSTATS 1714 #ifdef CONFIG_SCHEDSTATS
1715 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 1715 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
1716 #endif 1716 #endif
1717 1717
1718 RB_CLEAR_NODE(&p->dl.rb_node); 1718 RB_CLEAR_NODE(&p->dl.rb_node);
1719 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1719 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1720 p->dl.dl_runtime = p->dl.runtime = 0; 1720 p->dl.dl_runtime = p->dl.runtime = 0;
1721 p->dl.dl_deadline = p->dl.deadline = 0; 1721 p->dl.dl_deadline = p->dl.deadline = 0;
1722 p->dl.dl_period = 0; 1722 p->dl.dl_period = 0;
1723 p->dl.flags = 0; 1723 p->dl.flags = 0;
1724 1724
1725 INIT_LIST_HEAD(&p->rt.run_list); 1725 INIT_LIST_HEAD(&p->rt.run_list);
1726 1726
1727 #ifdef CONFIG_PREEMPT_NOTIFIERS 1727 #ifdef CONFIG_PREEMPT_NOTIFIERS
1728 INIT_HLIST_HEAD(&p->preempt_notifiers); 1728 INIT_HLIST_HEAD(&p->preempt_notifiers);
1729 #endif 1729 #endif
1730 1730
1731 #ifdef CONFIG_NUMA_BALANCING 1731 #ifdef CONFIG_NUMA_BALANCING
1732 if (p->mm && atomic_read(&p->mm->mm_users) == 1) { 1732 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1733 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); 1733 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1734 p->mm->numa_scan_seq = 0; 1734 p->mm->numa_scan_seq = 0;
1735 } 1735 }
1736 1736
1737 if (clone_flags & CLONE_VM) 1737 if (clone_flags & CLONE_VM)
1738 p->numa_preferred_nid = current->numa_preferred_nid; 1738 p->numa_preferred_nid = current->numa_preferred_nid;
1739 else 1739 else
1740 p->numa_preferred_nid = -1; 1740 p->numa_preferred_nid = -1;
1741 1741
1742 p->node_stamp = 0ULL; 1742 p->node_stamp = 0ULL;
1743 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1743 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1744 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1744 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1745 p->numa_work.next = &p->numa_work; 1745 p->numa_work.next = &p->numa_work;
1746 p->numa_faults_memory = NULL; 1746 p->numa_faults_memory = NULL;
1747 p->numa_faults_buffer_memory = NULL; 1747 p->numa_faults_buffer_memory = NULL;
1748 p->last_task_numa_placement = 0; 1748 p->last_task_numa_placement = 0;
1749 p->last_sum_exec_runtime = 0; 1749 p->last_sum_exec_runtime = 0;
1750 1750
1751 INIT_LIST_HEAD(&p->numa_entry); 1751 INIT_LIST_HEAD(&p->numa_entry);
1752 p->numa_group = NULL; 1752 p->numa_group = NULL;
1753 #endif /* CONFIG_NUMA_BALANCING */ 1753 #endif /* CONFIG_NUMA_BALANCING */
1754 } 1754 }
1755 1755
1756 #ifdef CONFIG_NUMA_BALANCING 1756 #ifdef CONFIG_NUMA_BALANCING
1757 #ifdef CONFIG_SCHED_DEBUG 1757 #ifdef CONFIG_SCHED_DEBUG
1758 void set_numabalancing_state(bool enabled) 1758 void set_numabalancing_state(bool enabled)
1759 { 1759 {
1760 if (enabled) 1760 if (enabled)
1761 sched_feat_set("NUMA"); 1761 sched_feat_set("NUMA");
1762 else 1762 else
1763 sched_feat_set("NO_NUMA"); 1763 sched_feat_set("NO_NUMA");
1764 } 1764 }
1765 #else 1765 #else
1766 __read_mostly bool numabalancing_enabled; 1766 __read_mostly bool numabalancing_enabled;
1767 1767
1768 void set_numabalancing_state(bool enabled) 1768 void set_numabalancing_state(bool enabled)
1769 { 1769 {
1770 numabalancing_enabled = enabled; 1770 numabalancing_enabled = enabled;
1771 } 1771 }
1772 #endif /* CONFIG_SCHED_DEBUG */ 1772 #endif /* CONFIG_SCHED_DEBUG */
1773 1773
1774 #ifdef CONFIG_PROC_SYSCTL 1774 #ifdef CONFIG_PROC_SYSCTL
1775 int sysctl_numa_balancing(struct ctl_table *table, int write, 1775 int sysctl_numa_balancing(struct ctl_table *table, int write,
1776 void __user *buffer, size_t *lenp, loff_t *ppos) 1776 void __user *buffer, size_t *lenp, loff_t *ppos)
1777 { 1777 {
1778 struct ctl_table t; 1778 struct ctl_table t;
1779 int err; 1779 int err;
1780 int state = numabalancing_enabled; 1780 int state = numabalancing_enabled;
1781 1781
1782 if (write && !capable(CAP_SYS_ADMIN)) 1782 if (write && !capable(CAP_SYS_ADMIN))
1783 return -EPERM; 1783 return -EPERM;
1784 1784
1785 t = *table; 1785 t = *table;
1786 t.data = &state; 1786 t.data = &state;
1787 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); 1787 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
1788 if (err < 0) 1788 if (err < 0)
1789 return err; 1789 return err;
1790 if (write) 1790 if (write)
1791 set_numabalancing_state(state); 1791 set_numabalancing_state(state);
1792 return err; 1792 return err;
1793 } 1793 }
1794 #endif 1794 #endif
1795 #endif 1795 #endif
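The handler above is what sits behind /proc/sys/kernel/numa_balancing. A small sketch that flips it off, assuming the conventional procfs mount and CAP_SYS_ADMIN:

#include <stdio.h>

/*
 * Flip the knob served by sysctl_numa_balancing() above.  Assumes the
 * conventional procfs path and CAP_SYS_ADMIN.
 */
int main(void)
{
        FILE *f = fopen("/proc/sys/kernel/numa_balancing", "w");

        if (!f) {
                perror("numa_balancing");
                return 1;
        }
        fputs("0\n", f);        /* 0 = disable, 1 = enable */
        fclose(f);
        return 0;
}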
1796 1796
1797 /* 1797 /*
1798 * fork()/clone()-time setup: 1798 * fork()/clone()-time setup:
1799 */ 1799 */
1800 int sched_fork(unsigned long clone_flags, struct task_struct *p) 1800 int sched_fork(unsigned long clone_flags, struct task_struct *p)
1801 { 1801 {
1802 unsigned long flags; 1802 unsigned long flags;
1803 int cpu = get_cpu(); 1803 int cpu = get_cpu();
1804 1804
1805 __sched_fork(clone_flags, p); 1805 __sched_fork(clone_flags, p);
1806 /* 1806 /*
1807 * We mark the process as running here. This guarantees that 1807 * We mark the process as running here. This guarantees that
1808 * nobody will actually run it, and a signal or other external 1808 * nobody will actually run it, and a signal or other external
1809 * event cannot wake it up and insert it on the runqueue either. 1809 * event cannot wake it up and insert it on the runqueue either.
1810 */ 1810 */
1811 p->state = TASK_RUNNING; 1811 p->state = TASK_RUNNING;
1812 1812
1813 /* 1813 /*
1814 * Make sure we do not leak PI boosting priority to the child. 1814 * Make sure we do not leak PI boosting priority to the child.
1815 */ 1815 */
1816 p->prio = current->normal_prio; 1816 p->prio = current->normal_prio;
1817 1817
1818 /* 1818 /*
1819 * Revert to default priority/policy on fork if requested. 1819 * Revert to default priority/policy on fork if requested.
1820 */ 1820 */
1821 if (unlikely(p->sched_reset_on_fork)) { 1821 if (unlikely(p->sched_reset_on_fork)) {
1822 if (task_has_dl_policy(p) || task_has_rt_policy(p)) { 1822 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
1823 p->policy = SCHED_NORMAL; 1823 p->policy = SCHED_NORMAL;
1824 p->static_prio = NICE_TO_PRIO(0); 1824 p->static_prio = NICE_TO_PRIO(0);
1825 p->rt_priority = 0; 1825 p->rt_priority = 0;
1826 } else if (PRIO_TO_NICE(p->static_prio) < 0) 1826 } else if (PRIO_TO_NICE(p->static_prio) < 0)
1827 p->static_prio = NICE_TO_PRIO(0); 1827 p->static_prio = NICE_TO_PRIO(0);
1828 1828
1829 p->prio = p->normal_prio = __normal_prio(p); 1829 p->prio = p->normal_prio = __normal_prio(p);
1830 set_load_weight(p); 1830 set_load_weight(p);
1831 1831
1832 /* 1832 /*
1833 * We don't need the reset flag anymore after the fork. It has 1833 * We don't need the reset flag anymore after the fork. It has
1834 * fulfilled its duty: 1834 * fulfilled its duty:
1835 */ 1835 */
1836 p->sched_reset_on_fork = 0; 1836 p->sched_reset_on_fork = 0;
1837 } 1837 }
1838 1838
1839 if (dl_prio(p->prio)) { 1839 if (dl_prio(p->prio)) {
1840 put_cpu(); 1840 put_cpu();
1841 return -EAGAIN; 1841 return -EAGAIN;
1842 } else if (rt_prio(p->prio)) { 1842 } else if (rt_prio(p->prio)) {
1843 p->sched_class = &rt_sched_class; 1843 p->sched_class = &rt_sched_class;
1844 } else { 1844 } else {
1845 p->sched_class = &fair_sched_class; 1845 p->sched_class = &fair_sched_class;
1846 } 1846 }
1847 1847
1848 if (p->sched_class->task_fork) 1848 if (p->sched_class->task_fork)
1849 p->sched_class->task_fork(p); 1849 p->sched_class->task_fork(p);
1850 1850
1851 /* 1851 /*
1852 * The child is not yet in the pid-hash so no cgroup attach races, 1852 * The child is not yet in the pid-hash so no cgroup attach races,
1853 * and the cgroup is pinned to this child because cgroup_fork() 1853 * and the cgroup is pinned to this child because cgroup_fork()
1854 * is run before sched_fork(). 1854 * is run before sched_fork().
1855 * 1855 *
1856 * Silence PROVE_RCU. 1856 * Silence PROVE_RCU.
1857 */ 1857 */
1858 raw_spin_lock_irqsave(&p->pi_lock, flags); 1858 raw_spin_lock_irqsave(&p->pi_lock, flags);
1859 set_task_cpu(p, cpu); 1859 set_task_cpu(p, cpu);
1860 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 1860 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
1861 1861
1862 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1862 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1863 if (likely(sched_info_on())) 1863 if (likely(sched_info_on()))
1864 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1864 memset(&p->sched_info, 0, sizeof(p->sched_info));
1865 #endif 1865 #endif
1866 #if defined(CONFIG_SMP) 1866 #if defined(CONFIG_SMP)
1867 p->on_cpu = 0; 1867 p->on_cpu = 0;
1868 #endif 1868 #endif
1869 init_task_preempt_count(p); 1869 init_task_preempt_count(p);
1870 #ifdef CONFIG_SMP 1870 #ifdef CONFIG_SMP
1871 plist_node_init(&p->pushable_tasks, MAX_PRIO); 1871 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1872 RB_CLEAR_NODE(&p->pushable_dl_tasks); 1872 RB_CLEAR_NODE(&p->pushable_dl_tasks);
1873 #endif 1873 #endif
1874 1874
1875 put_cpu(); 1875 put_cpu();
1876 return 0; 1876 return 0;
1877 } 1877 }
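The sched_reset_on_fork flag consulted above is set from user space by OR-ing SCHED_RESET_ON_FORK into the policy argument of sched_setscheduler(), as in the sketch below. The fallback #define mirrors the UAPI value in case the libc headers don't expose it, and the call needs the appropriate privileges.

#include <sched.h>
#include <stdio.h>

#ifndef SCHED_RESET_ON_FORK
#define SCHED_RESET_ON_FORK     0x40000000      /* UAPI value, fallback define */
#endif

/*
 * Run the caller as SCHED_FIFO but ask the kernel to reset its children to
 * the default policy in sched_fork() (the p->sched_reset_on_fork handling
 * above).  Needs the appropriate privileges.
 */
int main(void)
{
        struct sched_param sp = { .sched_priority = 10 };

        if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp)) {
                perror("sched_setscheduler");
                return 1;
        }
        puts("SCHED_FIFO with reset-on-fork set");
        return 0;
}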
1878 1878
1879 unsigned long to_ratio(u64 period, u64 runtime) 1879 unsigned long to_ratio(u64 period, u64 runtime)
1880 { 1880 {
1881 if (runtime == RUNTIME_INF) 1881 if (runtime == RUNTIME_INF)
1882 return 1ULL << 20; 1882 return 1ULL << 20;
1883 1883
1884 /* 1884 /*
1885 * Doing this here saves a lot of checks in all 1885 * Doing this here saves a lot of checks in all
1886 * the calling paths, and returning zero seems 1886 * the calling paths, and returning zero seems
1887 * safe for them anyway. 1887 * safe for them anyway.
1888 */ 1888 */
1889 if (period == 0) 1889 if (period == 0)
1890 return 0; 1890 return 0;
1891 1891
1892 return div64_u64(runtime << 20, period); 1892 return div64_u64(runtime << 20, period);
1893 } 1893 }
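to_ratio() expresses runtime/period as a fraction scaled by 2^20. A quick numeric check of that fixed-point math (the RUNTIME_INF branch is dropped for brevity):

#include <stdint.h>
#include <stdio.h>

/*
 * Same fixed-point conversion as to_ratio() above, minus the RUNTIME_INF
 * branch: the result is runtime/period scaled by 2^20.
 */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
        if (period == 0)
                return 0;
        return (runtime << 20) / period;
}

int main(void)
{
        /* 10 ms of runtime every 100 ms -> about 10% of one CPU */
        uint64_t r = to_ratio(100000000ULL, 10000000ULL);

        printf("ratio = %llu out of %u\n", (unsigned long long)r, 1U << 20);
        return 0;
}

For these parameters the result is 104857, i.e. just under 10% of 2^20 (1048576).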
1894 1894
1895 #ifdef CONFIG_SMP 1895 #ifdef CONFIG_SMP
1896 inline struct dl_bw *dl_bw_of(int i) 1896 inline struct dl_bw *dl_bw_of(int i)
1897 { 1897 {
1898 return &cpu_rq(i)->rd->dl_bw; 1898 return &cpu_rq(i)->rd->dl_bw;
1899 } 1899 }
1900 1900
1901 static inline int dl_bw_cpus(int i) 1901 static inline int dl_bw_cpus(int i)
1902 { 1902 {
1903 struct root_domain *rd = cpu_rq(i)->rd; 1903 struct root_domain *rd = cpu_rq(i)->rd;
1904 int cpus = 0; 1904 int cpus = 0;
1905 1905
1906 for_each_cpu_and(i, rd->span, cpu_active_mask) 1906 for_each_cpu_and(i, rd->span, cpu_active_mask)
1907 cpus++; 1907 cpus++;
1908 1908
1909 return cpus; 1909 return cpus;
1910 } 1910 }
1911 #else 1911 #else
1912 inline struct dl_bw *dl_bw_of(int i) 1912 inline struct dl_bw *dl_bw_of(int i)
1913 { 1913 {
1914 return &cpu_rq(i)->dl.dl_bw; 1914 return &cpu_rq(i)->dl.dl_bw;
1915 } 1915 }
1916 1916
1917 static inline int dl_bw_cpus(int i) 1917 static inline int dl_bw_cpus(int i)
1918 { 1918 {
1919 return 1; 1919 return 1;
1920 } 1920 }
1921 #endif 1921 #endif
1922 1922
1923 static inline 1923 static inline
1924 void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw) 1924 void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
1925 { 1925 {
1926 dl_b->total_bw -= tsk_bw; 1926 dl_b->total_bw -= tsk_bw;
1927 } 1927 }
1928 1928
1929 static inline 1929 static inline
1930 void __dl_add(struct dl_bw *dl_b, u64 tsk_bw) 1930 void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
1931 { 1931 {
1932 dl_b->total_bw += tsk_bw; 1932 dl_b->total_bw += tsk_bw;
1933 } 1933 }
1934 1934
1935 static inline 1935 static inline
1936 bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) 1936 bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
1937 { 1937 {
1938 return dl_b->bw != -1 && 1938 return dl_b->bw != -1 &&
1939 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; 1939 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
1940 } 1940 }
1941 1941
1942 /* 1942 /*
1943 * We must be sure that accepting a new task (or allowing changing the 1943 * We must be sure that accepting a new task (or allowing changing the
1944 * parameters of an existing one) is consistent with the bandwidth 1944 * parameters of an existing one) is consistent with the bandwidth
1945 * constraints. If so, this function also updates the currently allocated 1945 * constraints. If so, this function also updates the currently allocated
1946 * bandwidth to reflect the new situation. 1946 * bandwidth to reflect the new situation.
1947 * 1947 *
1948 * This function is called while holding p's rq->lock. 1948 * This function is called while holding p's rq->lock.
1949 */ 1949 */
1950 static int dl_overflow(struct task_struct *p, int policy, 1950 static int dl_overflow(struct task_struct *p, int policy,
1951 const struct sched_attr *attr) 1951 const struct sched_attr *attr)
1952 { 1952 {
1953 1953
1954 struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); 1954 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
1955 u64 period = attr->sched_period ?: attr->sched_deadline; 1955 u64 period = attr->sched_period ?: attr->sched_deadline;
1956 u64 runtime = attr->sched_runtime; 1956 u64 runtime = attr->sched_runtime;
1957 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; 1957 u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
1958 int cpus, err = -1; 1958 int cpus, err = -1;
1959 1959
1960 if (new_bw == p->dl.dl_bw) 1960 if (new_bw == p->dl.dl_bw)
1961 return 0; 1961 return 0;
1962 1962
1963 /* 1963 /*
1964 * Whether a task enters, leaves, or stays -deadline but changes 1964 * Whether a task enters, leaves, or stays -deadline but changes
1965 * its parameters, we may need to update the total allocated 1965 * its parameters, we may need to update the total allocated
1966 * bandwidth of the container accordingly. 1966 * bandwidth of the container accordingly.
1967 */ 1967 */
1968 raw_spin_lock(&dl_b->lock); 1968 raw_spin_lock(&dl_b->lock);
1969 cpus = dl_bw_cpus(task_cpu(p)); 1969 cpus = dl_bw_cpus(task_cpu(p));
1970 if (dl_policy(policy) && !task_has_dl_policy(p) && 1970 if (dl_policy(policy) && !task_has_dl_policy(p) &&
1971 !__dl_overflow(dl_b, cpus, 0, new_bw)) { 1971 !__dl_overflow(dl_b, cpus, 0, new_bw)) {
1972 __dl_add(dl_b, new_bw); 1972 __dl_add(dl_b, new_bw);
1973 err = 0; 1973 err = 0;
1974 } else if (dl_policy(policy) && task_has_dl_policy(p) && 1974 } else if (dl_policy(policy) && task_has_dl_policy(p) &&
1975 !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) { 1975 !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
1976 __dl_clear(dl_b, p->dl.dl_bw); 1976 __dl_clear(dl_b, p->dl.dl_bw);
1977 __dl_add(dl_b, new_bw); 1977 __dl_add(dl_b, new_bw);
1978 err = 0; 1978 err = 0;
1979 } else if (!dl_policy(policy) && task_has_dl_policy(p)) { 1979 } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
1980 __dl_clear(dl_b, p->dl.dl_bw); 1980 __dl_clear(dl_b, p->dl.dl_bw);
1981 err = 0; 1981 err = 0;
1982 } 1982 }
1983 raw_spin_unlock(&dl_b->lock); 1983 raw_spin_unlock(&dl_b->lock);
1984 1984
1985 return err; 1985 return err;
1986 } 1986 }
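The bandwidth admitted or rejected here originates from a sched_setattr() call in user space. A hedged sketch of requesting a 10 ms / 100 ms SCHED_DEADLINE reservation follows; sched_setattr() has no glibc wrapper on most systems, so the raw syscall is used, and the local struct mirrors the first 48 bytes of the sched_attr UAPI layout. An over-committed request is refused (the usual error is EBUSY).

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE  6               /* UAPI value, fallback define */
#endif

/*
 * Request 10 ms of runtime every 100 ms.  The local struct mirrors the
 * first 48 bytes of the sched_attr UAPI layout; the reservation is admitted
 * or refused by the dl_overflow() path above.
 */
struct dl_attr {
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        uint64_t sched_runtime;
        uint64_t sched_deadline;
        uint64_t sched_period;
};

int main(void)
{
        struct dl_attr attr = {
                .size           = sizeof(attr),
                .sched_policy   = SCHED_DEADLINE,
                .sched_runtime  =  10 * 1000 * 1000,    /* ns */
                .sched_deadline = 100 * 1000 * 1000,
                .sched_period   = 100 * 1000 * 1000,
        };

        if (syscall(SYS_sched_setattr, 0, &attr, 0)) {
                perror("sched_setattr");
                return 1;
        }
        puts("SCHED_DEADLINE reservation admitted");
        return 0;
}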
1987 1987
1988 extern void init_dl_bw(struct dl_bw *dl_b); 1988 extern void init_dl_bw(struct dl_bw *dl_b);
1989 1989
1990 /* 1990 /*
1991 * wake_up_new_task - wake up a newly created task for the first time. 1991 * wake_up_new_task - wake up a newly created task for the first time.
1992 * 1992 *
1993 * This function will do some initial scheduler statistics housekeeping 1993 * This function will do some initial scheduler statistics housekeeping
1994 * that must be done for every newly created context, then puts the task 1994 * that must be done for every newly created context, then puts the task
1995 * on the runqueue and wakes it. 1995 * on the runqueue and wakes it.
1996 */ 1996 */
1997 void wake_up_new_task(struct task_struct *p) 1997 void wake_up_new_task(struct task_struct *p)
1998 { 1998 {
1999 unsigned long flags; 1999 unsigned long flags;
2000 struct rq *rq; 2000 struct rq *rq;
2001 2001
2002 raw_spin_lock_irqsave(&p->pi_lock, flags); 2002 raw_spin_lock_irqsave(&p->pi_lock, flags);
2003 #ifdef CONFIG_SMP 2003 #ifdef CONFIG_SMP
2004 /* 2004 /*
2005 * Fork balancing, do it here and not earlier because: 2005 * Fork balancing, do it here and not earlier because:
2006 * - cpus_allowed can change in the fork path 2006 * - cpus_allowed can change in the fork path
2007 * - any previously selected cpu might disappear through hotplug 2007 * - any previously selected cpu might disappear through hotplug
2008 */ 2008 */
2009 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); 2009 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
2010 #endif 2010 #endif
2011 2011
2012 /* Initialize new task's runnable average */ 2012 /* Initialize new task's runnable average */
2013 init_task_runnable_average(p); 2013 init_task_runnable_average(p);
2014 rq = __task_rq_lock(p); 2014 rq = __task_rq_lock(p);
2015 activate_task(rq, p, 0); 2015 activate_task(rq, p, 0);
2016 p->on_rq = 1; 2016 p->on_rq = 1;
2017 trace_sched_wakeup_new(p, true); 2017 trace_sched_wakeup_new(p, true);
2018 check_preempt_curr(rq, p, WF_FORK); 2018 check_preempt_curr(rq, p, WF_FORK);
2019 #ifdef CONFIG_SMP 2019 #ifdef CONFIG_SMP
2020 if (p->sched_class->task_woken) 2020 if (p->sched_class->task_woken)
2021 p->sched_class->task_woken(rq, p); 2021 p->sched_class->task_woken(rq, p);
2022 #endif 2022 #endif
2023 task_rq_unlock(rq, p, &flags); 2023 task_rq_unlock(rq, p, &flags);
2024 } 2024 }
2025 2025
2026 #ifdef CONFIG_PREEMPT_NOTIFIERS 2026 #ifdef CONFIG_PREEMPT_NOTIFIERS
2027 2027
2028 /** 2028 /**
2029 * preempt_notifier_register - tell me when current is being preempted & rescheduled 2029 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2030 * @notifier: notifier struct to register 2030 * @notifier: notifier struct to register
2031 */ 2031 */
2032 void preempt_notifier_register(struct preempt_notifier *notifier) 2032 void preempt_notifier_register(struct preempt_notifier *notifier)
2033 { 2033 {
2034 hlist_add_head(&notifier->link, &current->preempt_notifiers); 2034 hlist_add_head(&notifier->link, &current->preempt_notifiers);
2035 } 2035 }
2036 EXPORT_SYMBOL_GPL(preempt_notifier_register); 2036 EXPORT_SYMBOL_GPL(preempt_notifier_register);
2037 2037
2038 /** 2038 /**
2039 * preempt_notifier_unregister - no longer interested in preemption notifications 2039 * preempt_notifier_unregister - no longer interested in preemption notifications
2040 * @notifier: notifier struct to unregister 2040 * @notifier: notifier struct to unregister
2041 * 2041 *
2042 * This is safe to call from within a preemption notifier. 2042 * This is safe to call from within a preemption notifier.
2043 */ 2043 */
2044 void preempt_notifier_unregister(struct preempt_notifier *notifier) 2044 void preempt_notifier_unregister(struct preempt_notifier *notifier)
2045 { 2045 {
2046 hlist_del(&notifier->link); 2046 hlist_del(&notifier->link);
2047 } 2047 }
2048 EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 2048 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2049 2049
2050 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2050 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2051 { 2051 {
2052 struct preempt_notifier *notifier; 2052 struct preempt_notifier *notifier;
2053 2053
2054 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) 2054 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2055 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 2055 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2056 } 2056 }
2057 2057
2058 static void 2058 static void
2059 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2059 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2060 struct task_struct *next) 2060 struct task_struct *next)
2061 { 2061 {
2062 struct preempt_notifier *notifier; 2062 struct preempt_notifier *notifier;
2063 2063
2064 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) 2064 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2065 notifier->ops->sched_out(notifier, next); 2065 notifier->ops->sched_out(notifier, next);
2066 } 2066 }
2067 2067
2068 #else /* !CONFIG_PREEMPT_NOTIFIERS */ 2068 #else /* !CONFIG_PREEMPT_NOTIFIERS */
2069 2069
2070 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2070 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2071 { 2071 {
2072 } 2072 }
2073 2073
2074 static void 2074 static void
2075 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2075 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2076 struct task_struct *next) 2076 struct task_struct *next)
2077 { 2077 {
2078 } 2078 }
2079 2079
2080 #endif /* CONFIG_PREEMPT_NOTIFIERS */ 2080 #endif /* CONFIG_PREEMPT_NOTIFIERS */
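For reference, an in-kernel user (KVM is the classic one) hooks these notifiers roughly as in the sketch below; it is kernel-context code, not standalone, and the my_* names are placeholders.

/*
 * Sketch of an in-kernel user of the preempt notifier API above (kernel
 * context, not standalone; the my_* names are placeholders).
 */
static void my_sched_in(struct preempt_notifier *pn, int cpu)
{
        /* the registering task is running again, on @cpu */
}

static void my_sched_out(struct preempt_notifier *pn,
                         struct task_struct *next)
{
        /* the registering task is about to be preempted in favour of @next */
}

static struct preempt_ops my_preempt_ops = {
        .sched_in       = my_sched_in,
        .sched_out      = my_sched_out,
};

static struct preempt_notifier my_notifier;

static void my_attach_to_current(void)
{
        preempt_notifier_init(&my_notifier, &my_preempt_ops);
        preempt_notifier_register(&my_notifier);        /* hooks current */
}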
2081 2081
2082 /** 2082 /**
2083 * prepare_task_switch - prepare to switch tasks 2083 * prepare_task_switch - prepare to switch tasks
2084 * @rq: the runqueue preparing to switch 2084 * @rq: the runqueue preparing to switch
2085 * @prev: the current task that is being switched out 2085 * @prev: the current task that is being switched out
2086 * @next: the task we are going to switch to. 2086 * @next: the task we are going to switch to.
2087 * 2087 *
2088 * This is called with the rq lock held and interrupts off. It must 2088 * This is called with the rq lock held and interrupts off. It must
2089 * be paired with a subsequent finish_task_switch after the context 2089 * be paired with a subsequent finish_task_switch after the context
2090 * switch. 2090 * switch.
2091 * 2091 *
2092 * prepare_task_switch sets up locking and calls architecture specific 2092 * prepare_task_switch sets up locking and calls architecture specific
2093 * hooks. 2093 * hooks.
2094 */ 2094 */
2095 static inline void 2095 static inline void
2096 prepare_task_switch(struct rq *rq, struct task_struct *prev, 2096 prepare_task_switch(struct rq *rq, struct task_struct *prev,
2097 struct task_struct *next) 2097 struct task_struct *next)
2098 { 2098 {
2099 trace_sched_switch(prev, next); 2099 trace_sched_switch(prev, next);
2100 sched_info_switch(rq, prev, next); 2100 sched_info_switch(rq, prev, next);
2101 perf_event_task_sched_out(prev, next); 2101 perf_event_task_sched_out(prev, next);
2102 fire_sched_out_preempt_notifiers(prev, next); 2102 fire_sched_out_preempt_notifiers(prev, next);
2103 prepare_lock_switch(rq, next); 2103 prepare_lock_switch(rq, next);
2104 prepare_arch_switch(next); 2104 prepare_arch_switch(next);
2105 } 2105 }
2106 2106
2107 /** 2107 /**
2108 * finish_task_switch - clean up after a task-switch 2108 * finish_task_switch - clean up after a task-switch
2109 * @rq: runqueue associated with task-switch 2109 * @rq: runqueue associated with task-switch
2110 * @prev: the thread we just switched away from. 2110 * @prev: the thread we just switched away from.
2111 * 2111 *
2112 * finish_task_switch must be called after the context switch, paired 2112 * finish_task_switch must be called after the context switch, paired
2113 * with a prepare_task_switch call before the context switch. 2113 * with a prepare_task_switch call before the context switch.
2114 * finish_task_switch will reconcile locking set up by prepare_task_switch, 2114 * finish_task_switch will reconcile locking set up by prepare_task_switch,
2115 * and do any other architecture-specific cleanup actions. 2115 * and do any other architecture-specific cleanup actions.
2116 * 2116 *
2117 * Note that we may have delayed dropping an mm in context_switch(). If 2117 * Note that we may have delayed dropping an mm in context_switch(). If
2118 * so, we finish that here outside of the runqueue lock. (Doing it 2118 * so, we finish that here outside of the runqueue lock. (Doing it
2119 * with the lock held can cause deadlocks; see schedule() for 2119 * with the lock held can cause deadlocks; see schedule() for
2120 * details.) 2120 * details.)
2121 */ 2121 */
2122 static void finish_task_switch(struct rq *rq, struct task_struct *prev) 2122 static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2123 __releases(rq->lock) 2123 __releases(rq->lock)
2124 { 2124 {
2125 struct mm_struct *mm = rq->prev_mm; 2125 struct mm_struct *mm = rq->prev_mm;
2126 long prev_state; 2126 long prev_state;
2127 2127
2128 rq->prev_mm = NULL; 2128 rq->prev_mm = NULL;
2129 2129
2130 /* 2130 /*
2131 * A task struct has one reference for the use as "current". 2131 * A task struct has one reference for the use as "current".
2132 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 2132 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
2133 * schedule one last time. The schedule call will never return, and 2133 * schedule one last time. The schedule call will never return, and
2134 * the scheduled task must drop that reference. 2134 * the scheduled task must drop that reference.
2135 * The test for TASK_DEAD must occur while the runqueue locks are 2135 * The test for TASK_DEAD must occur while the runqueue locks are
2136 * still held, otherwise prev could be scheduled on another cpu, die 2136 * still held, otherwise prev could be scheduled on another cpu, die
2137 * there before we look at prev->state, and then the reference would 2137 * there before we look at prev->state, and then the reference would
2138 * be dropped twice. 2138 * be dropped twice.
2139 * Manfred Spraul <manfred@colorfullife.com> 2139 * Manfred Spraul <manfred@colorfullife.com>
2140 */ 2140 */
2141 prev_state = prev->state; 2141 prev_state = prev->state;
2142 vtime_task_switch(prev); 2142 vtime_task_switch(prev);
2143 finish_arch_switch(prev); 2143 finish_arch_switch(prev);
2144 perf_event_task_sched_in(prev, current); 2144 perf_event_task_sched_in(prev, current);
2145 finish_lock_switch(rq, prev); 2145 finish_lock_switch(rq, prev);
2146 finish_arch_post_lock_switch(); 2146 finish_arch_post_lock_switch();
2147 2147
2148 fire_sched_in_preempt_notifiers(current); 2148 fire_sched_in_preempt_notifiers(current);
2149 if (mm) 2149 if (mm)
2150 mmdrop(mm); 2150 mmdrop(mm);
2151 if (unlikely(prev_state == TASK_DEAD)) { 2151 if (unlikely(prev_state == TASK_DEAD)) {
2152 if (prev->sched_class->task_dead) 2152 if (prev->sched_class->task_dead)
2153 prev->sched_class->task_dead(prev); 2153 prev->sched_class->task_dead(prev);
2154 2154
2155 /* 2155 /*
2156 * Remove function-return probe instances associated with this 2156 * Remove function-return probe instances associated with this
2157 * task and put them back on the free list. 2157 * task and put them back on the free list.
2158 */ 2158 */
2159 kprobe_flush_task(prev); 2159 kprobe_flush_task(prev);
2160 put_task_struct(prev); 2160 put_task_struct(prev);
2161 } 2161 }
2162 2162
2163 tick_nohz_task_switch(current); 2163 tick_nohz_task_switch(current);
2164 } 2164 }
2165 2165
2166 #ifdef CONFIG_SMP 2166 #ifdef CONFIG_SMP
2167 2167
2168 /* rq->lock is NOT held, but preemption is disabled */ 2168 /* rq->lock is NOT held, but preemption is disabled */
2169 static inline void post_schedule(struct rq *rq) 2169 static inline void post_schedule(struct rq *rq)
2170 { 2170 {
2171 if (rq->post_schedule) { 2171 if (rq->post_schedule) {
2172 unsigned long flags; 2172 unsigned long flags;
2173 2173
2174 raw_spin_lock_irqsave(&rq->lock, flags); 2174 raw_spin_lock_irqsave(&rq->lock, flags);
2175 if (rq->curr->sched_class->post_schedule) 2175 if (rq->curr->sched_class->post_schedule)
2176 rq->curr->sched_class->post_schedule(rq); 2176 rq->curr->sched_class->post_schedule(rq);
2177 raw_spin_unlock_irqrestore(&rq->lock, flags); 2177 raw_spin_unlock_irqrestore(&rq->lock, flags);
2178 2178
2179 rq->post_schedule = 0; 2179 rq->post_schedule = 0;
2180 } 2180 }
2181 } 2181 }
2182 2182
2183 #else 2183 #else
2184 2184
2185 static inline void post_schedule(struct rq *rq) 2185 static inline void post_schedule(struct rq *rq)
2186 { 2186 {
2187 } 2187 }
2188 2188
2189 #endif 2189 #endif
2190 2190
2191 /** 2191 /**
2192 * schedule_tail - first thing a freshly forked thread must call. 2192 * schedule_tail - first thing a freshly forked thread must call.
2193 * @prev: the thread we just switched away from. 2193 * @prev: the thread we just switched away from.
2194 */ 2194 */
2195 asmlinkage __visible void schedule_tail(struct task_struct *prev) 2195 asmlinkage __visible void schedule_tail(struct task_struct *prev)
2196 __releases(rq->lock) 2196 __releases(rq->lock)
2197 { 2197 {
2198 struct rq *rq = this_rq(); 2198 struct rq *rq = this_rq();
2199 2199
2200 finish_task_switch(rq, prev); 2200 finish_task_switch(rq, prev);
2201 2201
2202 /* 2202 /*
2203 * FIXME: do we need to worry about rq being invalidated by the 2203 * FIXME: do we need to worry about rq being invalidated by the
2204 * task_switch? 2204 * task_switch?
2205 */ 2205 */
2206 post_schedule(rq); 2206 post_schedule(rq);
2207 2207
2208 #ifdef __ARCH_WANT_UNLOCKED_CTXSW 2208 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
2209 /* In this case, finish_task_switch does not reenable preemption */ 2209 /* In this case, finish_task_switch does not reenable preemption */
2210 preempt_enable(); 2210 preempt_enable();
2211 #endif 2211 #endif
2212 if (current->set_child_tid) 2212 if (current->set_child_tid)
2213 put_user(task_pid_vnr(current), current->set_child_tid); 2213 put_user(task_pid_vnr(current), current->set_child_tid);
2214 } 2214 }
2215 2215
2216 /* 2216 /*
2217 * context_switch - switch to the new MM and the new 2217 * context_switch - switch to the new MM and the new
2218 * thread's register state. 2218 * thread's register state.
2219 */ 2219 */
2220 static inline void 2220 static inline void
2221 context_switch(struct rq *rq, struct task_struct *prev, 2221 context_switch(struct rq *rq, struct task_struct *prev,
2222 struct task_struct *next) 2222 struct task_struct *next)
2223 { 2223 {
2224 struct mm_struct *mm, *oldmm; 2224 struct mm_struct *mm, *oldmm;
2225 2225
2226 prepare_task_switch(rq, prev, next); 2226 prepare_task_switch(rq, prev, next);
2227 2227
2228 mm = next->mm; 2228 mm = next->mm;
2229 oldmm = prev->active_mm; 2229 oldmm = prev->active_mm;
2230 /* 2230 /*
2231 * For paravirt, this is coupled with an exit in switch_to to 2231 * For paravirt, this is coupled with an exit in switch_to to
2232 * combine the page table reload and the switch backend into 2232 * combine the page table reload and the switch backend into
2233 * one hypercall. 2233 * one hypercall.
2234 */ 2234 */
2235 arch_start_context_switch(prev); 2235 arch_start_context_switch(prev);
2236 2236
2237 if (!mm) { 2237 if (!mm) {
2238 next->active_mm = oldmm; 2238 next->active_mm = oldmm;
2239 atomic_inc(&oldmm->mm_count); 2239 atomic_inc(&oldmm->mm_count);
2240 enter_lazy_tlb(oldmm, next); 2240 enter_lazy_tlb(oldmm, next);
2241 } else 2241 } else
2242 switch_mm(oldmm, mm, next); 2242 switch_mm(oldmm, mm, next);
2243 2243
2244 if (!prev->mm) { 2244 if (!prev->mm) {
2245 prev->active_mm = NULL; 2245 prev->active_mm = NULL;
2246 rq->prev_mm = oldmm; 2246 rq->prev_mm = oldmm;
2247 } 2247 }
2248 /* 2248 /*
2249 * The runqueue lock will be released by the next 2249 * The runqueue lock will be released by the next
2250 * task (which is an invalid locking op but in the case 2250 * task (which is an invalid locking op but in the case
2251 * of the scheduler it's an obvious special-case), so we 2251 * of the scheduler it's an obvious special-case), so we
2252 * do an early lockdep release here: 2252 * do an early lockdep release here:
2253 */ 2253 */
2254 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 2254 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
2255 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2255 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2256 #endif 2256 #endif
2257 2257
2258 context_tracking_task_switch(prev, next); 2258 context_tracking_task_switch(prev, next);
2259 /* Here we just switch the register state and the stack. */ 2259 /* Here we just switch the register state and the stack. */
2260 switch_to(prev, next, prev); 2260 switch_to(prev, next, prev);
2261 2261
2262 barrier(); 2262 barrier();
2263 /* 2263 /*
2264 * this_rq must be evaluated again because prev may have moved 2264 * this_rq must be evaluated again because prev may have moved
2265 * CPUs since it called schedule(), thus the 'rq' on its stack 2265 * CPUs since it called schedule(), thus the 'rq' on its stack
2266 * frame will be invalid. 2266 * frame will be invalid.
2267 */ 2267 */
2268 finish_task_switch(this_rq(), prev); 2268 finish_task_switch(this_rq(), prev);
2269 } 2269 }
2270 2270
2271 /* 2271 /*
2272 * nr_running and nr_context_switches: 2272 * nr_running and nr_context_switches:
2273 * 2273 *
2274 * externally visible scheduler statistics: current number of runnable 2274 * externally visible scheduler statistics: current number of runnable
2275 * threads, total number of context switches performed since bootup. 2275 * threads, total number of context switches performed since bootup.
2276 */ 2276 */
2277 unsigned long nr_running(void) 2277 unsigned long nr_running(void)
2278 { 2278 {
2279 unsigned long i, sum = 0; 2279 unsigned long i, sum = 0;
2280 2280
2281 for_each_online_cpu(i) 2281 for_each_online_cpu(i)
2282 sum += cpu_rq(i)->nr_running; 2282 sum += cpu_rq(i)->nr_running;
2283 2283
2284 return sum; 2284 return sum;
2285 } 2285 }
2286 2286
2287 unsigned long long nr_context_switches(void) 2287 unsigned long long nr_context_switches(void)
2288 { 2288 {
2289 int i; 2289 int i;
2290 unsigned long long sum = 0; 2290 unsigned long long sum = 0;
2291 2291
2292 for_each_possible_cpu(i) 2292 for_each_possible_cpu(i)
2293 sum += cpu_rq(i)->nr_switches; 2293 sum += cpu_rq(i)->nr_switches;
2294 2294
2295 return sum; 2295 return sum;
2296 } 2296 }
2297 2297
2298 unsigned long nr_iowait(void) 2298 unsigned long nr_iowait(void)
2299 { 2299 {
2300 unsigned long i, sum = 0; 2300 unsigned long i, sum = 0;
2301 2301
2302 for_each_possible_cpu(i) 2302 for_each_possible_cpu(i)
2303 sum += atomic_read(&cpu_rq(i)->nr_iowait); 2303 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2304 2304
2305 return sum; 2305 return sum;
2306 } 2306 }
2307 2307
2308 unsigned long nr_iowait_cpu(int cpu) 2308 unsigned long nr_iowait_cpu(int cpu)
2309 { 2309 {
2310 struct rq *this = cpu_rq(cpu); 2310 struct rq *this = cpu_rq(cpu);
2311 return atomic_read(&this->nr_iowait); 2311 return atomic_read(&this->nr_iowait);
2312 } 2312 }
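These counters surface in procfs: the fourth field of /proc/loadavg, for example, reports the runnable count summed by nr_running() over the total number of scheduling entities (see proc(5)). A small reader:

#include <stdio.h>

/*
 * The fourth field of /proc/loadavg is "runnable/total": the runnable
 * count is the sum that nr_running() above computes (see proc(5)).
 */
int main(void)
{
        double l1, l5, l15;
        unsigned long runnable, total;
        FILE *f = fopen("/proc/loadavg", "r");

        if (!f) {
                perror("/proc/loadavg");
                return 1;
        }
        if (fscanf(f, "%lf %lf %lf %lu/%lu",
                   &l1, &l5, &l15, &runnable, &total) != 5) {
                fclose(f);
                return 1;
        }
        fclose(f);
        printf("runnable=%lu total=%lu load1=%.2f\n", runnable, total, l1);
        return 0;
}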
2313 2313
2314 #ifdef CONFIG_SMP 2314 #ifdef CONFIG_SMP
2315 2315
2316 /* 2316 /*
2317 * sched_exec - execve() is a valuable balancing opportunity, because at 2317 * sched_exec - execve() is a valuable balancing opportunity, because at
2318 * this point the task has the smallest effective memory and cache footprint. 2318 * this point the task has the smallest effective memory and cache footprint.
2319 */ 2319 */
2320 void sched_exec(void) 2320 void sched_exec(void)
2321 { 2321 {
2322 struct task_struct *p = current; 2322 struct task_struct *p = current;
2323 unsigned long flags; 2323 unsigned long flags;
2324 int dest_cpu; 2324 int dest_cpu;
2325 2325
2326 raw_spin_lock_irqsave(&p->pi_lock, flags); 2326 raw_spin_lock_irqsave(&p->pi_lock, flags);
2327 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); 2327 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
2328 if (dest_cpu == smp_processor_id()) 2328 if (dest_cpu == smp_processor_id())
2329 goto unlock; 2329 goto unlock;
2330 2330
2331 if (likely(cpu_active(dest_cpu))) { 2331 if (likely(cpu_active(dest_cpu))) {
2332 struct migration_arg arg = { p, dest_cpu }; 2332 struct migration_arg arg = { p, dest_cpu };
2333 2333
2334 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2334 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2335 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); 2335 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
2336 return; 2336 return;
2337 } 2337 }
2338 unlock: 2338 unlock:
2339 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2339 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2340 } 2340 }
2341 2341
2342 #endif 2342 #endif
2343 2343
2344 DEFINE_PER_CPU(struct kernel_stat, kstat); 2344 DEFINE_PER_CPU(struct kernel_stat, kstat);
2345 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); 2345 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
2346 2346
2347 EXPORT_PER_CPU_SYMBOL(kstat); 2347 EXPORT_PER_CPU_SYMBOL(kstat);
2348 EXPORT_PER_CPU_SYMBOL(kernel_cpustat); 2348 EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2349 2349
2350 /* 2350 /*
2351 * Return any ns on the sched_clock that have not yet been accounted in 2351 * Return any ns on the sched_clock that have not yet been accounted in
2352 * @p in case that task is currently running. 2352 * @p in case that task is currently running.
2353 * 2353 *
2354 * Called with task_rq_lock() held on @rq. 2354 * Called with task_rq_lock() held on @rq.
2355 */ 2355 */
2356 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) 2356 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2357 { 2357 {
2358 u64 ns = 0; 2358 u64 ns = 0;
2359 2359
2360 if (task_current(rq, p)) { 2360 if (task_current(rq, p)) {
2361 update_rq_clock(rq); 2361 update_rq_clock(rq);
2362 ns = rq_clock_task(rq) - p->se.exec_start; 2362 ns = rq_clock_task(rq) - p->se.exec_start;
2363 if ((s64)ns < 0) 2363 if ((s64)ns < 0)
2364 ns = 0; 2364 ns = 0;
2365 } 2365 }
2366 2366
2367 return ns; 2367 return ns;
2368 } 2368 }
2369 2369
2370 unsigned long long task_delta_exec(struct task_struct *p) 2370 unsigned long long task_delta_exec(struct task_struct *p)
2371 { 2371 {
2372 unsigned long flags; 2372 unsigned long flags;
2373 struct rq *rq; 2373 struct rq *rq;
2374 u64 ns = 0; 2374 u64 ns = 0;
2375 2375
2376 rq = task_rq_lock(p, &flags); 2376 rq = task_rq_lock(p, &flags);
2377 ns = do_task_delta_exec(p, rq); 2377 ns = do_task_delta_exec(p, rq);
2378 task_rq_unlock(rq, p, &flags); 2378 task_rq_unlock(rq, p, &flags);
2379 2379
2380 return ns; 2380 return ns;
2381 } 2381 }
2382 2382
2383 /* 2383 /*
2384 * Return accounted runtime for the task. 2384 * Return accounted runtime for the task.
2385 * In case the task is currently running, return the runtime plus current's 2385 * In case the task is currently running, return the runtime plus current's
2386 * pending runtime that has not been accounted yet. 2386 * pending runtime that has not been accounted yet.
2387 */ 2387 */
2388 unsigned long long task_sched_runtime(struct task_struct *p) 2388 unsigned long long task_sched_runtime(struct task_struct *p)
2389 { 2389 {
2390 unsigned long flags; 2390 unsigned long flags;
2391 struct rq *rq; 2391 struct rq *rq;
2392 u64 ns = 0; 2392 u64 ns = 0;
2393 2393
2394 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) 2394 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
2395 /* 2395 /*
2396 * 64-bit doesn't need locks to atomically read a 64-bit value. 2396 * 64-bit doesn't need locks to atomically read a 64-bit value.
2397 * So we have an optimization chance when the task's delta_exec is 0. 2397 * So we have an optimization chance when the task's delta_exec is 0.
2398 * Reading ->on_cpu is racy, but this is ok. 2398 * Reading ->on_cpu is racy, but this is ok.
2399 * 2399 *
2400 * If we race with it leaving cpu, we'll take a lock. So we're correct. 2400 * If we race with it leaving cpu, we'll take a lock. So we're correct.
2401 * If we race with it entering cpu, unaccounted time is 0. This is 2401 * If we race with it entering cpu, unaccounted time is 0. This is
2402 * indistinguishable from the read occurring a few cycles earlier. 2402 * indistinguishable from the read occurring a few cycles earlier.
2403 */ 2403 */
2404 if (!p->on_cpu) 2404 if (!p->on_cpu)
2405 return p->se.sum_exec_runtime; 2405 return p->se.sum_exec_runtime;
2406 #endif 2406 #endif
2407 2407
2408 rq = task_rq_lock(p, &flags); 2408 rq = task_rq_lock(p, &flags);
2409 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 2409 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
2410 task_rq_unlock(rq, p, &flags); 2410 task_rq_unlock(rq, p, &flags);
2411 2411
2412 return ns; 2412 return ns;
2413 } 2413 }
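
One way to observe this accounting from userspace is CLOCK_THREAD_CPUTIME_ID, which ends up being served by task_sched_runtime() for the calling thread via the posix-cpu-timers code. A hedged sketch using only the standard clock_gettime() API:

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;
	volatile unsigned long sink = 0;
	unsigned long i;

	for (i = 0; i < 50000000UL; i++)	/* burn a little CPU */
		sink += i;

	/* per-thread on-CPU time, nanosecond resolution */
	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts))
		return 1;

	printf("thread runtime: %ld.%09ld s (sink=%lu)\n",
	       (long)ts.tv_sec, ts.tv_nsec, (unsigned long)sink);
	return 0;
}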
2414 2414
2415 /* 2415 /*
2416 * This function gets called by the timer code, with HZ frequency. 2416 * This function gets called by the timer code, with HZ frequency.
2417 * We call it with interrupts disabled. 2417 * We call it with interrupts disabled.
2418 */ 2418 */
2419 void scheduler_tick(void) 2419 void scheduler_tick(void)
2420 { 2420 {
2421 int cpu = smp_processor_id(); 2421 int cpu = smp_processor_id();
2422 struct rq *rq = cpu_rq(cpu); 2422 struct rq *rq = cpu_rq(cpu);
2423 struct task_struct *curr = rq->curr; 2423 struct task_struct *curr = rq->curr;
2424 2424
2425 sched_clock_tick(); 2425 sched_clock_tick();
2426 2426
2427 raw_spin_lock(&rq->lock); 2427 raw_spin_lock(&rq->lock);
2428 update_rq_clock(rq); 2428 update_rq_clock(rq);
2429 curr->sched_class->task_tick(rq, curr, 0); 2429 curr->sched_class->task_tick(rq, curr, 0);
2430 update_cpu_load_active(rq); 2430 update_cpu_load_active(rq);
2431 raw_spin_unlock(&rq->lock); 2431 raw_spin_unlock(&rq->lock);
2432 2432
2433 perf_event_task_tick(); 2433 perf_event_task_tick();
2434 2434
2435 #ifdef CONFIG_SMP 2435 #ifdef CONFIG_SMP
2436 rq->idle_balance = idle_cpu(cpu); 2436 rq->idle_balance = idle_cpu(cpu);
2437 trigger_load_balance(rq); 2437 trigger_load_balance(rq);
2438 #endif 2438 #endif
2439 rq_last_tick_reset(rq); 2439 rq_last_tick_reset(rq);
2440 } 2440 }
2441 2441
2442 #ifdef CONFIG_NO_HZ_FULL 2442 #ifdef CONFIG_NO_HZ_FULL
2443 /** 2443 /**
2444 * scheduler_tick_max_deferment 2444 * scheduler_tick_max_deferment
2445 * 2445 *
2446 * Keep at least one tick per second when a single 2446 * Keep at least one tick per second when a single
2447 * active task is running because the scheduler doesn't 2447 * active task is running because the scheduler doesn't
2448 * yet completely support a full dynticks environment. 2448 * yet completely support a full dynticks environment.
2449 * 2449 *
2450 * This makes sure that uptime, CFS vruntime, load 2450 * This makes sure that uptime, CFS vruntime, load
2451 * balancing, etc... continue to move forward, even 2451 * balancing, etc... continue to move forward, even
2452 * with a very low granularity. 2452 * with a very low granularity.
2453 * 2453 *
2454 * Return: Maximum deferment in nanoseconds. 2454 * Return: Maximum deferment in nanoseconds.
2455 */ 2455 */
2456 u64 scheduler_tick_max_deferment(void) 2456 u64 scheduler_tick_max_deferment(void)
2457 { 2457 {
2458 struct rq *rq = this_rq(); 2458 struct rq *rq = this_rq();
2459 unsigned long next, now = ACCESS_ONCE(jiffies); 2459 unsigned long next, now = ACCESS_ONCE(jiffies);
2460 2460
2461 next = rq->last_sched_tick + HZ; 2461 next = rq->last_sched_tick + HZ;
2462 2462
2463 if (time_before_eq(next, now)) 2463 if (time_before_eq(next, now))
2464 return 0; 2464 return 0;
2465 2465
2466 return jiffies_to_nsecs(next - now); 2466 return jiffies_to_nsecs(next - now);
2467 } 2467 }
2468 #endif 2468 #endif
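
To make the arithmetic concrete, here is a standalone sketch of the same calculation; HZ, the jiffies values and the wrap-safe comparison are simplified assumptions here, not the kernel helpers:

#include <stdio.h>

#define HZ		1000			/* assumed tick rate */
#define NSEC_PER_JIFFY	(1000000000ULL / HZ)

/* mirrors scheduler_tick_max_deferment(): defer at most until last_tick + HZ */
static unsigned long long max_deferment(unsigned long last_tick, unsigned long now)
{
	unsigned long next = last_tick + HZ;

	if ((long)(next - now) <= 0)		/* next tick already due */
		return 0;

	return (unsigned long long)(next - now) * NSEC_PER_JIFFY;
}

int main(void)
{
	unsigned long now = 500000;

	/* last tick 400 jiffies ago -> the tick may sleep another 600 ms */
	printf("%llu ns\n", max_deferment(now - 400, now));
	/* last tick 1500 jiffies ago -> a tick is overdue, no deferment */
	printf("%llu ns\n", max_deferment(now - 1500, now));
	return 0;
}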
2469 2469
2470 notrace unsigned long get_parent_ip(unsigned long addr) 2470 notrace unsigned long get_parent_ip(unsigned long addr)
2471 { 2471 {
2472 if (in_lock_functions(addr)) { 2472 if (in_lock_functions(addr)) {
2473 addr = CALLER_ADDR2; 2473 addr = CALLER_ADDR2;
2474 if (in_lock_functions(addr)) 2474 if (in_lock_functions(addr))
2475 addr = CALLER_ADDR3; 2475 addr = CALLER_ADDR3;
2476 } 2476 }
2477 return addr; 2477 return addr;
2478 } 2478 }
2479 2479
2480 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2480 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2481 defined(CONFIG_PREEMPT_TRACER)) 2481 defined(CONFIG_PREEMPT_TRACER))
2482 2482
2483 void __kprobes preempt_count_add(int val) 2483 void __kprobes preempt_count_add(int val)
2484 { 2484 {
2485 #ifdef CONFIG_DEBUG_PREEMPT 2485 #ifdef CONFIG_DEBUG_PREEMPT
2486 /* 2486 /*
2487 * Underflow? 2487 * Underflow?
2488 */ 2488 */
2489 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 2489 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
2490 return; 2490 return;
2491 #endif 2491 #endif
2492 __preempt_count_add(val); 2492 __preempt_count_add(val);
2493 #ifdef CONFIG_DEBUG_PREEMPT 2493 #ifdef CONFIG_DEBUG_PREEMPT
2494 /* 2494 /*
2495 * Spinlock count overflowing soon? 2495 * Spinlock count overflowing soon?
2496 */ 2496 */
2497 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 2497 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
2498 PREEMPT_MASK - 10); 2498 PREEMPT_MASK - 10);
2499 #endif 2499 #endif
2500 if (preempt_count() == val) { 2500 if (preempt_count() == val) {
2501 unsigned long ip = get_parent_ip(CALLER_ADDR1); 2501 unsigned long ip = get_parent_ip(CALLER_ADDR1);
2502 #ifdef CONFIG_DEBUG_PREEMPT 2502 #ifdef CONFIG_DEBUG_PREEMPT
2503 current->preempt_disable_ip = ip; 2503 current->preempt_disable_ip = ip;
2504 #endif 2504 #endif
2505 trace_preempt_off(CALLER_ADDR0, ip); 2505 trace_preempt_off(CALLER_ADDR0, ip);
2506 } 2506 }
2507 } 2507 }
2508 EXPORT_SYMBOL(preempt_count_add); 2508 EXPORT_SYMBOL(preempt_count_add);
2509 2509
2510 void __kprobes preempt_count_sub(int val) 2510 void __kprobes preempt_count_sub(int val)
2511 { 2511 {
2512 #ifdef CONFIG_DEBUG_PREEMPT 2512 #ifdef CONFIG_DEBUG_PREEMPT
2513 /* 2513 /*
2514 * Underflow? 2514 * Underflow?
2515 */ 2515 */
2516 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 2516 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
2517 return; 2517 return;
2518 /* 2518 /*
2519 * Is the spinlock portion underflowing? 2519 * Is the spinlock portion underflowing?
2520 */ 2520 */
2521 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 2521 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
2522 !(preempt_count() & PREEMPT_MASK))) 2522 !(preempt_count() & PREEMPT_MASK)))
2523 return; 2523 return;
2524 #endif 2524 #endif
2525 2525
2526 if (preempt_count() == val) 2526 if (preempt_count() == val)
2527 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2527 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2528 __preempt_count_sub(val); 2528 __preempt_count_sub(val);
2529 } 2529 }
2530 EXPORT_SYMBOL(preempt_count_sub); 2530 EXPORT_SYMBOL(preempt_count_sub);
2531 2531
2532 #endif 2532 #endif
2533 2533
2534 /* 2534 /*
2535 * Print scheduling while atomic bug: 2535 * Print scheduling while atomic bug:
2536 */ 2536 */
2537 static noinline void __schedule_bug(struct task_struct *prev) 2537 static noinline void __schedule_bug(struct task_struct *prev)
2538 { 2538 {
2539 if (oops_in_progress) 2539 if (oops_in_progress)
2540 return; 2540 return;
2541 2541
2542 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 2542 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
2543 prev->comm, prev->pid, preempt_count()); 2543 prev->comm, prev->pid, preempt_count());
2544 2544
2545 debug_show_held_locks(prev); 2545 debug_show_held_locks(prev);
2546 print_modules(); 2546 print_modules();
2547 if (irqs_disabled()) 2547 if (irqs_disabled())
2548 print_irqtrace_events(prev); 2548 print_irqtrace_events(prev);
2549 #ifdef CONFIG_DEBUG_PREEMPT 2549 #ifdef CONFIG_DEBUG_PREEMPT
2550 if (in_atomic_preempt_off()) { 2550 if (in_atomic_preempt_off()) {
2551 pr_err("Preemption disabled at:"); 2551 pr_err("Preemption disabled at:");
2552 print_ip_sym(current->preempt_disable_ip); 2552 print_ip_sym(current->preempt_disable_ip);
2553 pr_cont("\n"); 2553 pr_cont("\n");
2554 } 2554 }
2555 #endif 2555 #endif
2556 dump_stack(); 2556 dump_stack();
2557 add_taint(TAINT_WARN, LOCKDEP_STILL_OK); 2557 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
2558 } 2558 }
2559 2559
2560 /* 2560 /*
2561 * Various schedule()-time debugging checks and statistics: 2561 * Various schedule()-time debugging checks and statistics:
2562 */ 2562 */
2563 static inline void schedule_debug(struct task_struct *prev) 2563 static inline void schedule_debug(struct task_struct *prev)
2564 { 2564 {
2565 /* 2565 /*
2566 * Test if we are atomic. Since do_exit() needs to call into 2566 * Test if we are atomic. Since do_exit() needs to call into
2567 * schedule() atomically, we ignore that path. Otherwise whine 2567 * schedule() atomically, we ignore that path. Otherwise whine
2568 * if we are scheduling when we should not. 2568 * if we are scheduling when we should not.
2569 */ 2569 */
2570 if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD)) 2570 if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
2571 __schedule_bug(prev); 2571 __schedule_bug(prev);
2572 rcu_sleep_check(); 2572 rcu_sleep_check();
2573 2573
2574 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 2574 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
2575 2575
2576 schedstat_inc(this_rq(), sched_count); 2576 schedstat_inc(this_rq(), sched_count);
2577 } 2577 }
2578 2578
2579 /* 2579 /*
2580 * Pick up the highest-prio task: 2580 * Pick up the highest-prio task:
2581 */ 2581 */
2582 static inline struct task_struct * 2582 static inline struct task_struct *
2583 pick_next_task(struct rq *rq, struct task_struct *prev) 2583 pick_next_task(struct rq *rq, struct task_struct *prev)
2584 { 2584 {
2585 const struct sched_class *class = &fair_sched_class; 2585 const struct sched_class *class = &fair_sched_class;
2586 struct task_struct *p; 2586 struct task_struct *p;
2587 2587
2588 /* 2588 /*
2589 * Optimization: we know that if all tasks are in 2589 * Optimization: we know that if all tasks are in
2590 * the fair class we can call that function directly: 2590 * the fair class we can call that function directly:
2591 */ 2591 */
2592 if (likely(prev->sched_class == class && 2592 if (likely(prev->sched_class == class &&
2593 rq->nr_running == rq->cfs.h_nr_running)) { 2593 rq->nr_running == rq->cfs.h_nr_running)) {
2594 p = fair_sched_class.pick_next_task(rq, prev); 2594 p = fair_sched_class.pick_next_task(rq, prev);
2595 if (unlikely(p == RETRY_TASK)) 2595 if (unlikely(p == RETRY_TASK))
2596 goto again; 2596 goto again;
2597 2597
2598 /* assumes fair_sched_class->next == idle_sched_class */ 2598 /* assumes fair_sched_class->next == idle_sched_class */
2599 if (unlikely(!p)) 2599 if (unlikely(!p))
2600 p = idle_sched_class.pick_next_task(rq, prev); 2600 p = idle_sched_class.pick_next_task(rq, prev);
2601 2601
2602 return p; 2602 return p;
2603 } 2603 }
2604 2604
2605 again: 2605 again:
2606 for_each_class(class) { 2606 for_each_class(class) {
2607 p = class->pick_next_task(rq, prev); 2607 p = class->pick_next_task(rq, prev);
2608 if (p) { 2608 if (p) {
2609 if (unlikely(p == RETRY_TASK)) 2609 if (unlikely(p == RETRY_TASK))
2610 goto again; 2610 goto again;
2611 return p; 2611 return p;
2612 } 2612 }
2613 } 2613 }
2614 2614
2615 BUG(); /* the idle class will always have a runnable task */ 2615 BUG(); /* the idle class will always have a runnable task */
2616 } 2616 }
2617 2617
2618 /* 2618 /*
2619 * __schedule() is the main scheduler function. 2619 * __schedule() is the main scheduler function.
2620 * 2620 *
2621 * The main means of driving the scheduler and thus entering this function are: 2621 * The main means of driving the scheduler and thus entering this function are:
2622 * 2622 *
2623 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. 2623 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
2624 * 2624 *
2625 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return 2625 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
2626 * paths. For example, see arch/x86/entry_64.S. 2626 * paths. For example, see arch/x86/entry_64.S.
2627 * 2627 *
2628 * To drive preemption between tasks, the scheduler sets the flag in timer 2628 * To drive preemption between tasks, the scheduler sets the flag in timer
2629 * interrupt handler scheduler_tick(). 2629 * interrupt handler scheduler_tick().
2630 * 2630 *
2631 * 3. Wakeups don't really cause entry into schedule(). They add a 2631 * 3. Wakeups don't really cause entry into schedule(). They add a
2632 * task to the run-queue and that's it. 2632 * task to the run-queue and that's it.
2633 * 2633 *
2634 * Now, if the new task added to the run-queue preempts the current 2634 * Now, if the new task added to the run-queue preempts the current
2635 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets 2635 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
2636 * called on the nearest possible occasion: 2636 * called on the nearest possible occasion:
2637 * 2637 *
2638 * - If the kernel is preemptible (CONFIG_PREEMPT=y): 2638 * - If the kernel is preemptible (CONFIG_PREEMPT=y):
2639 * 2639 *
2640 * - in syscall or exception context, at the next outmost 2640 * - in syscall or exception context, at the next outmost
2641 * preempt_enable(). (this might be as soon as the wake_up()'s 2641 * preempt_enable(). (this might be as soon as the wake_up()'s
2642 * spin_unlock()!) 2642 * spin_unlock()!)
2643 * 2643 *
2644 * - in IRQ context, return from interrupt-handler to 2644 * - in IRQ context, return from interrupt-handler to
2645 * preemptible context 2645 * preemptible context
2646 * 2646 *
2647 * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) 2647 * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
2648 * then at the next: 2648 * then at the next:
2649 * 2649 *
2650 * - cond_resched() call 2650 * - cond_resched() call
2651 * - explicit schedule() call 2651 * - explicit schedule() call
2652 * - return from syscall or exception to user-space 2652 * - return from syscall or exception to user-space
2653 * - return from interrupt-handler to user-space 2653 * - return from interrupt-handler to user-space
2654 */ 2654 */
2655 static void __sched __schedule(void) 2655 static void __sched __schedule(void)
2656 { 2656 {
2657 struct task_struct *prev, *next; 2657 struct task_struct *prev, *next;
2658 unsigned long *switch_count; 2658 unsigned long *switch_count;
2659 struct rq *rq; 2659 struct rq *rq;
2660 int cpu; 2660 int cpu;
2661 2661
2662 need_resched: 2662 need_resched:
2663 preempt_disable(); 2663 preempt_disable();
2664 cpu = smp_processor_id(); 2664 cpu = smp_processor_id();
2665 rq = cpu_rq(cpu); 2665 rq = cpu_rq(cpu);
2666 rcu_note_context_switch(cpu); 2666 rcu_note_context_switch(cpu);
2667 prev = rq->curr; 2667 prev = rq->curr;
2668 2668
2669 schedule_debug(prev); 2669 schedule_debug(prev);
2670 2670
2671 if (sched_feat(HRTICK)) 2671 if (sched_feat(HRTICK))
2672 hrtick_clear(rq); 2672 hrtick_clear(rq);
2673 2673
2674 /* 2674 /*
2675 * Make sure that signal_pending_state()->signal_pending() below 2675 * Make sure that signal_pending_state()->signal_pending() below
2676 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) 2676 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
2677 * done by the caller to avoid the race with signal_wake_up(). 2677 * done by the caller to avoid the race with signal_wake_up().
2678 */ 2678 */
2679 smp_mb__before_spinlock(); 2679 smp_mb__before_spinlock();
2680 raw_spin_lock_irq(&rq->lock); 2680 raw_spin_lock_irq(&rq->lock);
2681 2681
2682 switch_count = &prev->nivcsw; 2682 switch_count = &prev->nivcsw;
2683 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 2683 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2684 if (unlikely(signal_pending_state(prev->state, prev))) { 2684 if (unlikely(signal_pending_state(prev->state, prev))) {
2685 prev->state = TASK_RUNNING; 2685 prev->state = TASK_RUNNING;
2686 } else { 2686 } else {
2687 deactivate_task(rq, prev, DEQUEUE_SLEEP); 2687 deactivate_task(rq, prev, DEQUEUE_SLEEP);
2688 prev->on_rq = 0; 2688 prev->on_rq = 0;
2689 2689
2690 /* 2690 /*
2691 * If a worker went to sleep, notify and ask workqueue 2691 * If a worker went to sleep, notify and ask workqueue
2692 * whether it wants to wake up a task to maintain 2692 * whether it wants to wake up a task to maintain
2693 * concurrency. 2693 * concurrency.
2694 */ 2694 */
2695 if (prev->flags & PF_WQ_WORKER) { 2695 if (prev->flags & PF_WQ_WORKER) {
2696 struct task_struct *to_wakeup; 2696 struct task_struct *to_wakeup;
2697 2697
2698 to_wakeup = wq_worker_sleeping(prev, cpu); 2698 to_wakeup = wq_worker_sleeping(prev, cpu);
2699 if (to_wakeup) 2699 if (to_wakeup)
2700 try_to_wake_up_local(to_wakeup); 2700 try_to_wake_up_local(to_wakeup);
2701 } 2701 }
2702 } 2702 }
2703 switch_count = &prev->nvcsw; 2703 switch_count = &prev->nvcsw;
2704 } 2704 }
2705 2705
2706 if (prev->on_rq || rq->skip_clock_update < 0) 2706 if (prev->on_rq || rq->skip_clock_update < 0)
2707 update_rq_clock(rq); 2707 update_rq_clock(rq);
2708 2708
2709 next = pick_next_task(rq, prev); 2709 next = pick_next_task(rq, prev);
2710 clear_tsk_need_resched(prev); 2710 clear_tsk_need_resched(prev);
2711 clear_preempt_need_resched(); 2711 clear_preempt_need_resched();
2712 rq->skip_clock_update = 0; 2712 rq->skip_clock_update = 0;
2713 2713
2714 if (likely(prev != next)) { 2714 if (likely(prev != next)) {
2715 rq->nr_switches++; 2715 rq->nr_switches++;
2716 rq->curr = next; 2716 rq->curr = next;
2717 ++*switch_count; 2717 ++*switch_count;
2718 2718
2719 context_switch(rq, prev, next); /* unlocks the rq */ 2719 context_switch(rq, prev, next); /* unlocks the rq */
2720 /* 2720 /*
2721 * The context switch has flipped the stack from under us 2721 * The context switch has flipped the stack from under us
2722 * and restored the local variables which were saved when 2722 * and restored the local variables which were saved when
2723 * this task called schedule() in the past. prev == current 2723 * this task called schedule() in the past. prev == current
2724 * is still correct, but it can be moved to another cpu/rq. 2724 * is still correct, but it can be moved to another cpu/rq.
2725 */ 2725 */
2726 cpu = smp_processor_id(); 2726 cpu = smp_processor_id();
2727 rq = cpu_rq(cpu); 2727 rq = cpu_rq(cpu);
2728 } else 2728 } else
2729 raw_spin_unlock_irq(&rq->lock); 2729 raw_spin_unlock_irq(&rq->lock);
2730 2730
2731 post_schedule(rq); 2731 post_schedule(rq);
2732 2732
2733 sched_preempt_enable_no_resched(); 2733 sched_preempt_enable_no_resched();
2734 if (need_resched()) 2734 if (need_resched())
2735 goto need_resched; 2735 goto need_resched;
2736 } 2736 }
2737 2737
2738 static inline void sched_submit_work(struct task_struct *tsk) 2738 static inline void sched_submit_work(struct task_struct *tsk)
2739 { 2739 {
2740 if (!tsk->state || tsk_is_pi_blocked(tsk)) 2740 if (!tsk->state || tsk_is_pi_blocked(tsk))
2741 return; 2741 return;
2742 /* 2742 /*
2743 * If we are going to sleep and we have plugged IO queued, 2743 * If we are going to sleep and we have plugged IO queued,
2744 * make sure to submit it to avoid deadlocks. 2744 * make sure to submit it to avoid deadlocks.
2745 */ 2745 */
2746 if (blk_needs_flush_plug(tsk)) 2746 if (blk_needs_flush_plug(tsk))
2747 blk_schedule_flush_plug(tsk); 2747 blk_schedule_flush_plug(tsk);
2748 } 2748 }
2749 2749
2750 asmlinkage __visible void __sched schedule(void) 2750 asmlinkage __visible void __sched schedule(void)
2751 { 2751 {
2752 struct task_struct *tsk = current; 2752 struct task_struct *tsk = current;
2753 2753
2754 sched_submit_work(tsk); 2754 sched_submit_work(tsk);
2755 __schedule(); 2755 __schedule();
2756 } 2756 }
2757 EXPORT_SYMBOL(schedule); 2757 EXPORT_SYMBOL(schedule);
2758 2758
2759 #ifdef CONFIG_CONTEXT_TRACKING 2759 #ifdef CONFIG_CONTEXT_TRACKING
2760 asmlinkage __visible void __sched schedule_user(void) 2760 asmlinkage __visible void __sched schedule_user(void)
2761 { 2761 {
2762 /* 2762 /*
2763 * If we come here after a random call to set_need_resched(), 2763 * If we come here after a random call to set_need_resched(),
2764 * or we have been woken up remotely but the IPI has not yet arrived, 2764 * or we have been woken up remotely but the IPI has not yet arrived,
2765 * we haven't yet exited the RCU idle mode. Do it here manually until 2765 * we haven't yet exited the RCU idle mode. Do it here manually until
2766 * we find a better solution. 2766 * we find a better solution.
2767 */ 2767 */
2768 user_exit(); 2768 user_exit();
2769 schedule(); 2769 schedule();
2770 user_enter(); 2770 user_enter();
2771 } 2771 }
2772 #endif 2772 #endif
2773 2773
2774 /** 2774 /**
2775 * schedule_preempt_disabled - called with preemption disabled 2775 * schedule_preempt_disabled - called with preemption disabled
2776 * 2776 *
2777 * Returns with preemption disabled. Note: preempt_count must be 1 2777 * Returns with preemption disabled. Note: preempt_count must be 1
2778 */ 2778 */
2779 void __sched schedule_preempt_disabled(void) 2779 void __sched schedule_preempt_disabled(void)
2780 { 2780 {
2781 sched_preempt_enable_no_resched(); 2781 sched_preempt_enable_no_resched();
2782 schedule(); 2782 schedule();
2783 preempt_disable(); 2783 preempt_disable();
2784 } 2784 }
2785 2785
2786 #ifdef CONFIG_PREEMPT 2786 #ifdef CONFIG_PREEMPT
2787 /* 2787 /*
2788 * this is the entry point to schedule() from in-kernel preemption 2788 * this is the entry point to schedule() from in-kernel preemption
2789 * off of preempt_enable. Kernel preemption off the return-from-interrupt 2789 * off of preempt_enable. Kernel preemption off the return-from-interrupt
2790 * path is handled separately by preempt_schedule_irq() below. 2790 * path is handled separately by preempt_schedule_irq() below.
2791 */ 2791 */
2792 asmlinkage __visible void __sched notrace preempt_schedule(void) 2792 asmlinkage __visible void __sched notrace preempt_schedule(void)
2793 { 2793 {
2794 /* 2794 /*
2795 * If there is a non-zero preempt_count or interrupts are disabled, 2795 * If there is a non-zero preempt_count or interrupts are disabled,
2796 * we do not want to preempt the current task. Just return.. 2796 * we do not want to preempt the current task. Just return..
2797 */ 2797 */
2798 if (likely(!preemptible())) 2798 if (likely(!preemptible()))
2799 return; 2799 return;
2800 2800
2801 do { 2801 do {
2802 __preempt_count_add(PREEMPT_ACTIVE); 2802 __preempt_count_add(PREEMPT_ACTIVE);
2803 __schedule(); 2803 __schedule();
2804 __preempt_count_sub(PREEMPT_ACTIVE); 2804 __preempt_count_sub(PREEMPT_ACTIVE);
2805 2805
2806 /* 2806 /*
2807 * Check again in case we missed a preemption opportunity 2807 * Check again in case we missed a preemption opportunity
2808 * between schedule and now. 2808 * between schedule and now.
2809 */ 2809 */
2810 barrier(); 2810 barrier();
2811 } while (need_resched()); 2811 } while (need_resched());
2812 } 2812 }
2813 EXPORT_SYMBOL(preempt_schedule); 2813 EXPORT_SYMBOL(preempt_schedule);
2814 #endif /* CONFIG_PREEMPT */ 2814 #endif /* CONFIG_PREEMPT */
2815 2815
2816 /* 2816 /*
2817 * this is the entry point to schedule() from kernel preemption 2817 * this is the entry point to schedule() from kernel preemption
2818 * off of irq context. 2818 * off of irq context.
2819 * Note that this is called and returns with irqs disabled. This will 2819 * Note that this is called and returns with irqs disabled. This will
2820 * protect us against recursive calling from irq. 2820 * protect us against recursive calling from irq.
2821 */ 2821 */
2822 asmlinkage __visible void __sched preempt_schedule_irq(void) 2822 asmlinkage __visible void __sched preempt_schedule_irq(void)
2823 { 2823 {
2824 enum ctx_state prev_state; 2824 enum ctx_state prev_state;
2825 2825
2826 /* Catch callers which need to be fixed */ 2826 /* Catch callers which need to be fixed */
2827 BUG_ON(preempt_count() || !irqs_disabled()); 2827 BUG_ON(preempt_count() || !irqs_disabled());
2828 2828
2829 prev_state = exception_enter(); 2829 prev_state = exception_enter();
2830 2830
2831 do { 2831 do {
2832 __preempt_count_add(PREEMPT_ACTIVE); 2832 __preempt_count_add(PREEMPT_ACTIVE);
2833 local_irq_enable(); 2833 local_irq_enable();
2834 __schedule(); 2834 __schedule();
2835 local_irq_disable(); 2835 local_irq_disable();
2836 __preempt_count_sub(PREEMPT_ACTIVE); 2836 __preempt_count_sub(PREEMPT_ACTIVE);
2837 2837
2838 /* 2838 /*
2839 * Check again in case we missed a preemption opportunity 2839 * Check again in case we missed a preemption opportunity
2840 * between schedule and now. 2840 * between schedule and now.
2841 */ 2841 */
2842 barrier(); 2842 barrier();
2843 } while (need_resched()); 2843 } while (need_resched());
2844 2844
2845 exception_exit(prev_state); 2845 exception_exit(prev_state);
2846 } 2846 }
2847 2847
2848 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 2848 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
2849 void *key) 2849 void *key)
2850 { 2850 {
2851 return try_to_wake_up(curr->private, mode, wake_flags); 2851 return try_to_wake_up(curr->private, mode, wake_flags);
2852 } 2852 }
2853 EXPORT_SYMBOL(default_wake_function); 2853 EXPORT_SYMBOL(default_wake_function);
2854 2854
2855 #ifdef CONFIG_RT_MUTEXES 2855 #ifdef CONFIG_RT_MUTEXES
2856 2856
2857 /* 2857 /*
2858 * rt_mutex_setprio - set the current priority of a task 2858 * rt_mutex_setprio - set the current priority of a task
2859 * @p: task 2859 * @p: task
2860 * @prio: prio value (kernel-internal form) 2860 * @prio: prio value (kernel-internal form)
2861 * 2861 *
2862 * This function changes the 'effective' priority of a task. It does 2862 * This function changes the 'effective' priority of a task. It does
2863 * not touch ->normal_prio like __setscheduler(). 2863 * not touch ->normal_prio like __setscheduler().
2864 * 2864 *
2865 * Used by the rt_mutex code to implement priority inheritance 2865 * Used by the rt_mutex code to implement priority inheritance
2866 * logic. Call site only calls if the priority of the task changed. 2866 * logic. Call site only calls if the priority of the task changed.
2867 */ 2867 */
2868 void rt_mutex_setprio(struct task_struct *p, int prio) 2868 void rt_mutex_setprio(struct task_struct *p, int prio)
2869 { 2869 {
2870 int oldprio, on_rq, running, enqueue_flag = 0; 2870 int oldprio, on_rq, running, enqueue_flag = 0;
2871 struct rq *rq; 2871 struct rq *rq;
2872 const struct sched_class *prev_class; 2872 const struct sched_class *prev_class;
2873 2873
2874 BUG_ON(prio > MAX_PRIO); 2874 BUG_ON(prio > MAX_PRIO);
2875 2875
2876 rq = __task_rq_lock(p); 2876 rq = __task_rq_lock(p);
2877 2877
2878 /* 2878 /*
2879 * Idle task boosting is a no-no in general. There is one 2879 * Idle task boosting is a no-no in general. There is one
2880 * exception, when PREEMPT_RT and NOHZ are active: 2880 * exception, when PREEMPT_RT and NOHZ are active:
2881 * 2881 *
2882 * The idle task calls get_next_timer_interrupt() and holds 2882 * The idle task calls get_next_timer_interrupt() and holds
2883 * the timer wheel base->lock on the CPU and another CPU wants 2883 * the timer wheel base->lock on the CPU and another CPU wants
2884 * to access the timer (probably to cancel it). We can safely 2884 * to access the timer (probably to cancel it). We can safely
2885 * ignore the boosting request, as the idle CPU runs this code 2885 * ignore the boosting request, as the idle CPU runs this code
2886 * with interrupts disabled and will complete the lock 2886 * with interrupts disabled and will complete the lock
2887 * protected section without being interrupted. So there is no 2887 * protected section without being interrupted. So there is no
2888 * real need to boost. 2888 * real need to boost.
2889 */ 2889 */
2890 if (unlikely(p == rq->idle)) { 2890 if (unlikely(p == rq->idle)) {
2891 WARN_ON(p != rq->curr); 2891 WARN_ON(p != rq->curr);
2892 WARN_ON(p->pi_blocked_on); 2892 WARN_ON(p->pi_blocked_on);
2893 goto out_unlock; 2893 goto out_unlock;
2894 } 2894 }
2895 2895
2896 trace_sched_pi_setprio(p, prio); 2896 trace_sched_pi_setprio(p, prio);
2897 p->pi_top_task = rt_mutex_get_top_task(p); 2897 p->pi_top_task = rt_mutex_get_top_task(p);
2898 oldprio = p->prio; 2898 oldprio = p->prio;
2899 prev_class = p->sched_class; 2899 prev_class = p->sched_class;
2900 on_rq = p->on_rq; 2900 on_rq = p->on_rq;
2901 running = task_current(rq, p); 2901 running = task_current(rq, p);
2902 if (on_rq) 2902 if (on_rq)
2903 dequeue_task(rq, p, 0); 2903 dequeue_task(rq, p, 0);
2904 if (running) 2904 if (running)
2905 p->sched_class->put_prev_task(rq, p); 2905 p->sched_class->put_prev_task(rq, p);
2906 2906
2907 /* 2907 /*
2908 * Boosting conditions are: 2908 * Boosting conditions are:
2909 * 1. -rt task is running and holds mutex A 2909 * 1. -rt task is running and holds mutex A
2910 * --> -dl task blocks on mutex A 2910 * --> -dl task blocks on mutex A
2911 * 2911 *
2912 * 2. -dl task is running and holds mutex A 2912 * 2. -dl task is running and holds mutex A
2913 * --> -dl task blocks on mutex A and could preempt the 2913 * --> -dl task blocks on mutex A and could preempt the
2914 * running task 2914 * running task
2915 */ 2915 */
2916 if (dl_prio(prio)) { 2916 if (dl_prio(prio)) {
2917 if (!dl_prio(p->normal_prio) || (p->pi_top_task && 2917 if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
2918 dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) { 2918 dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
2919 p->dl.dl_boosted = 1; 2919 p->dl.dl_boosted = 1;
2920 p->dl.dl_throttled = 0; 2920 p->dl.dl_throttled = 0;
2921 enqueue_flag = ENQUEUE_REPLENISH; 2921 enqueue_flag = ENQUEUE_REPLENISH;
2922 } else 2922 } else
2923 p->dl.dl_boosted = 0; 2923 p->dl.dl_boosted = 0;
2924 p->sched_class = &dl_sched_class; 2924 p->sched_class = &dl_sched_class;
2925 } else if (rt_prio(prio)) { 2925 } else if (rt_prio(prio)) {
2926 if (dl_prio(oldprio)) 2926 if (dl_prio(oldprio))
2927 p->dl.dl_boosted = 0; 2927 p->dl.dl_boosted = 0;
2928 if (oldprio < prio) 2928 if (oldprio < prio)
2929 enqueue_flag = ENQUEUE_HEAD; 2929 enqueue_flag = ENQUEUE_HEAD;
2930 p->sched_class = &rt_sched_class; 2930 p->sched_class = &rt_sched_class;
2931 } else { 2931 } else {
2932 if (dl_prio(oldprio)) 2932 if (dl_prio(oldprio))
2933 p->dl.dl_boosted = 0; 2933 p->dl.dl_boosted = 0;
2934 p->sched_class = &fair_sched_class; 2934 p->sched_class = &fair_sched_class;
2935 } 2935 }
2936 2936
2937 p->prio = prio; 2937 p->prio = prio;
2938 2938
2939 if (running) 2939 if (running)
2940 p->sched_class->set_curr_task(rq); 2940 p->sched_class->set_curr_task(rq);
2941 if (on_rq) 2941 if (on_rq)
2942 enqueue_task(rq, p, enqueue_flag); 2942 enqueue_task(rq, p, enqueue_flag);
2943 2943
2944 check_class_changed(rq, p, prev_class, oldprio); 2944 check_class_changed(rq, p, prev_class, oldprio);
2945 out_unlock: 2945 out_unlock:
2946 __task_rq_unlock(rq); 2946 __task_rq_unlock(rq);
2947 } 2947 }
2948 #endif 2948 #endif
2949 2949
2950 void set_user_nice(struct task_struct *p, long nice) 2950 void set_user_nice(struct task_struct *p, long nice)
2951 { 2951 {
2952 int old_prio, delta, on_rq; 2952 int old_prio, delta, on_rq;
2953 unsigned long flags; 2953 unsigned long flags;
2954 struct rq *rq; 2954 struct rq *rq;
2955 2955
2956 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) 2956 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
2957 return; 2957 return;
2958 /* 2958 /*
2959 * We have to be careful, if called from sys_setpriority(), 2959 * We have to be careful, if called from sys_setpriority(),
2960 * the task might be in the middle of scheduling on another CPU. 2960 * the task might be in the middle of scheduling on another CPU.
2961 */ 2961 */
2962 rq = task_rq_lock(p, &flags); 2962 rq = task_rq_lock(p, &flags);
2963 /* 2963 /*
2964 * The RT priorities are set via sched_setscheduler(), but we still 2964 * The RT priorities are set via sched_setscheduler(), but we still
2965 * allow the 'normal' nice value to be set - but as expected 2965 * allow the 'normal' nice value to be set - but as expected
2966 * it won't have any effect on scheduling as long as the task is 2966 * it won't have any effect on scheduling as long as the task is
2967 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: 2967 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
2968 */ 2968 */
2969 if (task_has_dl_policy(p) || task_has_rt_policy(p)) { 2969 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
2970 p->static_prio = NICE_TO_PRIO(nice); 2970 p->static_prio = NICE_TO_PRIO(nice);
2971 goto out_unlock; 2971 goto out_unlock;
2972 } 2972 }
2973 on_rq = p->on_rq; 2973 on_rq = p->on_rq;
2974 if (on_rq) 2974 if (on_rq)
2975 dequeue_task(rq, p, 0); 2975 dequeue_task(rq, p, 0);
2976 2976
2977 p->static_prio = NICE_TO_PRIO(nice); 2977 p->static_prio = NICE_TO_PRIO(nice);
2978 set_load_weight(p); 2978 set_load_weight(p);
2979 old_prio = p->prio; 2979 old_prio = p->prio;
2980 p->prio = effective_prio(p); 2980 p->prio = effective_prio(p);
2981 delta = p->prio - old_prio; 2981 delta = p->prio - old_prio;
2982 2982
2983 if (on_rq) { 2983 if (on_rq) {
2984 enqueue_task(rq, p, 0); 2984 enqueue_task(rq, p, 0);
2985 /* 2985 /*
2986 * If the task increased its priority or is running and 2986 * If the task increased its priority or is running and
2987 * lowered its priority, then reschedule its CPU: 2987 * lowered its priority, then reschedule its CPU:
2988 */ 2988 */
2989 if (delta < 0 || (delta > 0 && task_running(rq, p))) 2989 if (delta < 0 || (delta > 0 && task_running(rq, p)))
2990 resched_task(rq->curr); 2990 resched_task(rq->curr);
2991 } 2991 }
2992 out_unlock: 2992 out_unlock:
2993 task_rq_unlock(rq, p, &flags); 2993 task_rq_unlock(rq, p, &flags);
2994 } 2994 }
2995 EXPORT_SYMBOL(set_user_nice); 2995 EXPORT_SYMBOL(set_user_nice);
2996 2996
2997 /* 2997 /*
2998 * can_nice - check if a task can reduce its nice value 2998 * can_nice - check if a task can reduce its nice value
2999 * @p: task 2999 * @p: task
3000 * @nice: nice value 3000 * @nice: nice value
3001 */ 3001 */
3002 int can_nice(const struct task_struct *p, const int nice) 3002 int can_nice(const struct task_struct *p, const int nice)
3003 { 3003 {
3004 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3004 /* convert nice value [19,-20] to rlimit style value [1,40] */
3005 int nice_rlim = 20 - nice; 3005 int nice_rlim = 20 - nice;
3006 3006
3007 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 3007 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3008 capable(CAP_SYS_NICE)); 3008 capable(CAP_SYS_NICE));
3009 } 3009 }
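
The rlimit compared against here is RLIMIT_NICE, so the same check can be anticipated from userspace with getrlimit(); a small sketch, where the "20 - limit" mapping simply mirrors the conversion in the comment above:

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_NICE, &rl))
		return 1;

	/* e.g. a soft limit of 25 allows lowering nice down to -5 */
	printf("RLIMIT_NICE soft limit: %lu\n", (unsigned long)rl.rlim_cur);
	printf("lowest reachable nice without CAP_SYS_NICE: %ld\n",
	       20 - (long)rl.rlim_cur);
	return 0;
}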
3010 3010
3011 #ifdef __ARCH_WANT_SYS_NICE 3011 #ifdef __ARCH_WANT_SYS_NICE
3012 3012
3013 /* 3013 /*
3014 * sys_nice - change the priority of the current process. 3014 * sys_nice - change the priority of the current process.
3015 * @increment: priority increment 3015 * @increment: priority increment
3016 * 3016 *
3017 * sys_setpriority is a more generic, but much slower function that 3017 * sys_setpriority is a more generic, but much slower function that
3018 * does similar things. 3018 * does similar things.
3019 */ 3019 */
3020 SYSCALL_DEFINE1(nice, int, increment) 3020 SYSCALL_DEFINE1(nice, int, increment)
3021 { 3021 {
3022 long nice, retval; 3022 long nice, retval;
3023 3023
3024 /* 3024 /*
3025 * Setpriority might change our priority at the same moment. 3025 * Setpriority might change our priority at the same moment.
3026 * We don't have to worry. Conceptually one call occurs first 3026 * We don't have to worry. Conceptually one call occurs first
3027 * and we have a single winner. 3027 * and we have a single winner.
3028 */ 3028 */
3029 if (increment < -40) 3029 if (increment < -40)
3030 increment = -40; 3030 increment = -40;
3031 if (increment > 40) 3031 if (increment > 40)
3032 increment = 40; 3032 increment = 40;
3033 3033
3034 nice = task_nice(current) + increment; 3034 nice = task_nice(current) + increment;
3035 if (nice < MIN_NICE) 3035 if (nice < MIN_NICE)
3036 nice = MIN_NICE; 3036 nice = MIN_NICE;
3037 if (nice > MAX_NICE) 3037 if (nice > MAX_NICE)
3038 nice = MAX_NICE; 3038 nice = MAX_NICE;
3039 3039
3040 if (increment < 0 && !can_nice(current, nice)) 3040 if (increment < 0 && !can_nice(current, nice))
3041 return -EPERM; 3041 return -EPERM;
3042 3042
3043 retval = security_task_setnice(current, nice); 3043 retval = security_task_setnice(current, nice);
3044 if (retval) 3044 if (retval)
3045 return retval; 3045 return retval;
3046 3046
3047 set_user_nice(current, nice); 3047 set_user_nice(current, nice);
3048 return 0; 3048 return 0;
3049 } 3049 }
3050 3050
3051 #endif 3051 #endif
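
A minimal caller of the increment semantics above; note that, depending on the C library, nice(3) may be implemented through this syscall or through setpriority(), so this is only an illustrative sketch:

#include <stdio.h>
#include <errno.h>
#include <unistd.h>

int main(void)
{
	int val;

	errno = 0;
	val = nice(5);			/* relative increment, clamped as above */
	if (val == -1 && errno)
		perror("nice");
	else
		printf("new nice value: %d\n", val);

	errno = 0;
	val = nice(-10);		/* lowering needs CAP_SYS_NICE or RLIMIT_NICE */
	if (val == -1 && errno)
		perror("nice");
	else
		printf("new nice value: %d\n", val);
	return 0;
}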
3052 3052
3053 /** 3053 /**
3054 * task_prio - return the priority value of a given task. 3054 * task_prio - return the priority value of a given task.
3055 * @p: the task in question. 3055 * @p: the task in question.
3056 * 3056 *
3057 * Return: The priority value as seen by users in /proc. 3057 * Return: The priority value as seen by users in /proc.
3058 * RT tasks map to negative values [-100 ... -2] (SCHED_DEADLINE 3058 * RT tasks map to negative values [-100 ... -2] (SCHED_DEADLINE
3059 * to -101), normal tasks to [0 ... 39], i.e. 20 + nice. 3059 * to -101), normal tasks to [0 ... 39], i.e. 20 + nice.
3060 */ 3060 */
3061 int task_prio(const struct task_struct *p) 3061 int task_prio(const struct task_struct *p)
3062 { 3062 {
3063 return p->prio - MAX_RT_PRIO; 3063 return p->prio - MAX_RT_PRIO;
3064 } 3064 }
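
This is the value reported as field 18 ("priority") of /proc/<pid>/stat, next to the nice value in field 19 (field positions per proc(5)). A sketch reading it back; the parsing is deliberately simplified and assumes a comm field without spaces:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/self/stat", "r");
	char comm[64], state;
	long prio, nice;
	int pid, i;

	if (!f)
		return 1;

	/* fields 1-3: pid, comm, state; then skip up to field 17 */
	if (fscanf(f, "%d %63s %c", &pid, comm, &state) != 3)
		return 1;
	for (i = 4; i <= 17; i++)
		fscanf(f, "%*s");
	if (fscanf(f, "%ld %ld", &prio, &nice) != 2)	/* fields 18, 19 */
		return 1;
	fclose(f);

	printf("priority=%ld nice=%ld\n", prio, nice);
	return 0;
}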
3065 3065
3066 /** 3066 /**
3067 * idle_cpu - is a given cpu idle currently? 3067 * idle_cpu - is a given cpu idle currently?
3068 * @cpu: the processor in question. 3068 * @cpu: the processor in question.
3069 * 3069 *
3070 * Return: 1 if the CPU is currently idle. 0 otherwise. 3070 * Return: 1 if the CPU is currently idle. 0 otherwise.
3071 */ 3071 */
3072 int idle_cpu(int cpu) 3072 int idle_cpu(int cpu)
3073 { 3073 {
3074 struct rq *rq = cpu_rq(cpu); 3074 struct rq *rq = cpu_rq(cpu);
3075 3075
3076 if (rq->curr != rq->idle) 3076 if (rq->curr != rq->idle)
3077 return 0; 3077 return 0;
3078 3078
3079 if (rq->nr_running) 3079 if (rq->nr_running)
3080 return 0; 3080 return 0;
3081 3081
3082 #ifdef CONFIG_SMP 3082 #ifdef CONFIG_SMP
3083 if (!llist_empty(&rq->wake_list)) 3083 if (!llist_empty(&rq->wake_list))
3084 return 0; 3084 return 0;
3085 #endif 3085 #endif
3086 3086
3087 return 1; 3087 return 1;
3088 } 3088 }
3089 3089
3090 /** 3090 /**
3091 * idle_task - return the idle task for a given cpu. 3091 * idle_task - return the idle task for a given cpu.
3092 * @cpu: the processor in question. 3092 * @cpu: the processor in question.
3093 * 3093 *
3094 * Return: The idle task for the cpu @cpu. 3094 * Return: The idle task for the cpu @cpu.
3095 */ 3095 */
3096 struct task_struct *idle_task(int cpu) 3096 struct task_struct *idle_task(int cpu)
3097 { 3097 {
3098 return cpu_rq(cpu)->idle; 3098 return cpu_rq(cpu)->idle;
3099 } 3099 }
3100 3100
3101 /** 3101 /**
3102 * find_process_by_pid - find a process with a matching PID value. 3102 * find_process_by_pid - find a process with a matching PID value.
3103 * @pid: the pid in question. 3103 * @pid: the pid in question.
3104 * 3104 *
3105 * Return: the task of @pid, if found. %NULL otherwise. 3105 * Return: the task of @pid, if found. %NULL otherwise.
3106 */ 3106 */
3107 static struct task_struct *find_process_by_pid(pid_t pid) 3107 static struct task_struct *find_process_by_pid(pid_t pid)
3108 { 3108 {
3109 return pid ? find_task_by_vpid(pid) : current; 3109 return pid ? find_task_by_vpid(pid) : current;
3110 } 3110 }
3111 3111
3112 /* 3112 /*
3113 * This function initializes the sched_dl_entity of a task that is 3113 * This function initializes the sched_dl_entity of a task that is
3114 * becoming SCHED_DEADLINE. 3114 * becoming SCHED_DEADLINE.
3115 * 3115 *
3116 * Only the static values are considered here, the actual runtime and the 3116 * Only the static values are considered here, the actual runtime and the
3117 * absolute deadline will be properly calculated when the task is enqueued 3117 * absolute deadline will be properly calculated when the task is enqueued
3118 * for the first time with its new policy. 3118 * for the first time with its new policy.
3119 */ 3119 */
3120 static void 3120 static void
3121 __setparam_dl(struct task_struct *p, const struct sched_attr *attr) 3121 __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
3122 { 3122 {
3123 struct sched_dl_entity *dl_se = &p->dl; 3123 struct sched_dl_entity *dl_se = &p->dl;
3124 3124
3125 init_dl_task_timer(dl_se); 3125 init_dl_task_timer(dl_se);
3126 dl_se->dl_runtime = attr->sched_runtime; 3126 dl_se->dl_runtime = attr->sched_runtime;
3127 dl_se->dl_deadline = attr->sched_deadline; 3127 dl_se->dl_deadline = attr->sched_deadline;
3128 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline; 3128 dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
3129 dl_se->flags = attr->sched_flags; 3129 dl_se->flags = attr->sched_flags;
3130 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); 3130 dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
3131 dl_se->dl_throttled = 0; 3131 dl_se->dl_throttled = 0;
3132 dl_se->dl_new = 1; 3132 dl_se->dl_new = 1;
3133 dl_se->dl_yielded = 0; 3133 dl_se->dl_yielded = 0;
3134 } 3134 }
3135 3135
3136 static void __setscheduler_params(struct task_struct *p, 3136 static void __setscheduler_params(struct task_struct *p,
3137 const struct sched_attr *attr) 3137 const struct sched_attr *attr)
3138 { 3138 {
3139 int policy = attr->sched_policy; 3139 int policy = attr->sched_policy;
3140 3140
3141 if (policy == -1) /* setparam */ 3141 if (policy == -1) /* setparam */
3142 policy = p->policy; 3142 policy = p->policy;
3143 3143
3144 p->policy = policy; 3144 p->policy = policy;
3145 3145
3146 if (dl_policy(policy)) 3146 if (dl_policy(policy))
3147 __setparam_dl(p, attr); 3147 __setparam_dl(p, attr);
3148 else if (fair_policy(policy)) 3148 else if (fair_policy(policy))
3149 p->static_prio = NICE_TO_PRIO(attr->sched_nice); 3149 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
3150 3150
3151 /* 3151 /*
3152 * __sched_setscheduler() ensures attr->sched_priority == 0 when 3152 * __sched_setscheduler() ensures attr->sched_priority == 0 when
3153 * !rt_policy. Always setting this ensures that things like 3153 * !rt_policy. Always setting this ensures that things like
3154 * getparam()/getattr() don't report silly values for !rt tasks. 3154 * getparam()/getattr() don't report silly values for !rt tasks.
3155 */ 3155 */
3156 p->rt_priority = attr->sched_priority; 3156 p->rt_priority = attr->sched_priority;
3157 p->normal_prio = normal_prio(p); 3157 p->normal_prio = normal_prio(p);
3158 set_load_weight(p); 3158 set_load_weight(p);
3159 } 3159 }
3160 3160
3161 /* Actually do priority change: must hold pi & rq lock. */ 3161 /* Actually do priority change: must hold pi & rq lock. */
3162 static void __setscheduler(struct rq *rq, struct task_struct *p, 3162 static void __setscheduler(struct rq *rq, struct task_struct *p,
3163 const struct sched_attr *attr) 3163 const struct sched_attr *attr)
3164 { 3164 {
3165 __setscheduler_params(p, attr); 3165 __setscheduler_params(p, attr);
3166 3166
3167 /* 3167 /*
3168 * If we get here, there were no pi waiters boosting the 3168 * If we get here, there were no pi waiters boosting the
3169 * task. It is safe to use the normal prio. 3169 * task. It is safe to use the normal prio.
3170 */ 3170 */
3171 p->prio = normal_prio(p); 3171 p->prio = normal_prio(p);
3172 3172
3173 if (dl_prio(p->prio)) 3173 if (dl_prio(p->prio))
3174 p->sched_class = &dl_sched_class; 3174 p->sched_class = &dl_sched_class;
3175 else if (rt_prio(p->prio)) 3175 else if (rt_prio(p->prio))
3176 p->sched_class = &rt_sched_class; 3176 p->sched_class = &rt_sched_class;
3177 else 3177 else
3178 p->sched_class = &fair_sched_class; 3178 p->sched_class = &fair_sched_class;
3179 } 3179 }
3180 3180
3181 static void 3181 static void
3182 __getparam_dl(struct task_struct *p, struct sched_attr *attr) 3182 __getparam_dl(struct task_struct *p, struct sched_attr *attr)
3183 { 3183 {
3184 struct sched_dl_entity *dl_se = &p->dl; 3184 struct sched_dl_entity *dl_se = &p->dl;
3185 3185
3186 attr->sched_priority = p->rt_priority; 3186 attr->sched_priority = p->rt_priority;
3187 attr->sched_runtime = dl_se->dl_runtime; 3187 attr->sched_runtime = dl_se->dl_runtime;
3188 attr->sched_deadline = dl_se->dl_deadline; 3188 attr->sched_deadline = dl_se->dl_deadline;
3189 attr->sched_period = dl_se->dl_period; 3189 attr->sched_period = dl_se->dl_period;
3190 attr->sched_flags = dl_se->flags; 3190 attr->sched_flags = dl_se->flags;
3191 } 3191 }
3192 3192
3193 /* 3193 /*
3194 * This function validates the new parameters of a -deadline task. 3194 * This function validates the new parameters of a -deadline task.
3195 * We ask for the deadline not being zero, and greater than or 3195 * We ask for the deadline not being zero, and greater than or
3196 * equal to the runtime, as well as the period being either zero 3196 * equal to the runtime, as well as the period being either zero
3197 * or no smaller than the deadline. Furthermore, we have to be sure that 3197 * or no smaller than the deadline. Furthermore, we have to be sure that
3198 * user parameters are above the internal resolution (1us); we 3198 * user parameters are above the internal resolution of 1us (we
3199 * check sched_runtime only since it is always the smaller one. 3199 * check sched_runtime only since it is always the smaller one) and
3200 * below 2^63 ns (we have to check both sched_deadline and
3201 * sched_period, as the latter can be zero).
3200 */ 3202 */
3201 static bool 3203 static bool
3202 __checkparam_dl(const struct sched_attr *attr) 3204 __checkparam_dl(const struct sched_attr *attr)
3203 { 3205 {
3204 return attr && attr->sched_deadline != 0 && 3206 /* deadline != 0 */
3205 (attr->sched_period == 0 || 3207 if (attr->sched_deadline == 0)
3206 (s64)(attr->sched_period - attr->sched_deadline) >= 0) && 3208 return false;
3207 (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 && 3209
3208 attr->sched_runtime >= (2 << (DL_SCALE - 1)); 3210 /*
3211 * Since we truncate DL_SCALE bits, make sure we're at least
3212 * that big.
3213 */
3214 if (attr->sched_runtime < (1ULL << DL_SCALE))
3215 return false;
3216
3217 /*
3218 * Since we use the MSB for wrap-around and sign issues, make
3219 * sure it's not set (mind that period can be equal to zero).
3220 */
3221 if (attr->sched_deadline & (1ULL << 63) ||
3222 attr->sched_period & (1ULL << 63))
3223 return false;
3224
3225 /* runtime <= deadline <= period (if period != 0) */
3226 if ((attr->sched_period != 0 &&
3227 attr->sched_period < attr->sched_deadline) ||
3228 attr->sched_deadline < attr->sched_runtime)
3229 return false;
3230
3231 return true;
3209 } 3232 }
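
Parameters that satisfy all of the above (runtime <= deadline <= period, runtime at least roughly 1us, everything below 2^63 ns) can be handed to the kernel through the sched_setattr() syscall. A userspace sketch: the struct layout follows the sched_setattr ABI, the syscall number shown is the x86-64 one (other architectures differ, which is why it is only a fallback define), and the call needs root since unprivileged SCHED_DEADLINE is rejected below:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE	6
#endif
#ifndef SYS_sched_setattr
#define SYS_sched_setattr 314		/* x86-64; arch-specific assumption */
#endif

struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;		/* all three in nanoseconds */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy   = SCHED_DEADLINE;
	attr.sched_runtime  =  10 * 1000 * 1000;	/*  10 ms */
	attr.sched_deadline =  30 * 1000 * 1000;	/*  30 ms */
	attr.sched_period   = 100 * 1000 * 1000;	/* 100 ms */

	/* EINVAL here means the parameters failed checks like __checkparam_dl() */
	if (syscall(SYS_sched_setattr, 0, &attr, 0))
		perror("sched_setattr");
	else
		puts("now running SCHED_DEADLINE");
	return 0;
}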
3210 3233
3211 /* 3234 /*
3212 * check the target process has a UID that matches the current process's 3235 * check the target process has a UID that matches the current process's
3213 */ 3236 */
3214 static bool check_same_owner(struct task_struct *p) 3237 static bool check_same_owner(struct task_struct *p)
3215 { 3238 {
3216 const struct cred *cred = current_cred(), *pcred; 3239 const struct cred *cred = current_cred(), *pcred;
3217 bool match; 3240 bool match;
3218 3241
3219 rcu_read_lock(); 3242 rcu_read_lock();
3220 pcred = __task_cred(p); 3243 pcred = __task_cred(p);
3221 match = (uid_eq(cred->euid, pcred->euid) || 3244 match = (uid_eq(cred->euid, pcred->euid) ||
3222 uid_eq(cred->euid, pcred->uid)); 3245 uid_eq(cred->euid, pcred->uid));
3223 rcu_read_unlock(); 3246 rcu_read_unlock();
3224 return match; 3247 return match;
3225 } 3248 }
3226 3249
3227 static int __sched_setscheduler(struct task_struct *p, 3250 static int __sched_setscheduler(struct task_struct *p,
3228 const struct sched_attr *attr, 3251 const struct sched_attr *attr,
3229 bool user) 3252 bool user)
3230 { 3253 {
3231 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : 3254 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
3232 MAX_RT_PRIO - 1 - attr->sched_priority; 3255 MAX_RT_PRIO - 1 - attr->sched_priority;
3233 int retval, oldprio, oldpolicy = -1, on_rq, running; 3256 int retval, oldprio, oldpolicy = -1, on_rq, running;
3234 int policy = attr->sched_policy; 3257 int policy = attr->sched_policy;
3235 unsigned long flags; 3258 unsigned long flags;
3236 const struct sched_class *prev_class; 3259 const struct sched_class *prev_class;
3237 struct rq *rq; 3260 struct rq *rq;
3238 int reset_on_fork; 3261 int reset_on_fork;
3239 3262
3240 /* may grab non-irq protected spin_locks */ 3263 /* may grab non-irq protected spin_locks */
3241 BUG_ON(in_interrupt()); 3264 BUG_ON(in_interrupt());
3242 recheck: 3265 recheck:
3243 /* double check policy once rq lock held */ 3266 /* double check policy once rq lock held */
3244 if (policy < 0) { 3267 if (policy < 0) {
3245 reset_on_fork = p->sched_reset_on_fork; 3268 reset_on_fork = p->sched_reset_on_fork;
3246 policy = oldpolicy = p->policy; 3269 policy = oldpolicy = p->policy;
3247 } else { 3270 } else {
3248 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK); 3271 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
3249 3272
3250 if (policy != SCHED_DEADLINE && 3273 if (policy != SCHED_DEADLINE &&
3251 policy != SCHED_FIFO && policy != SCHED_RR && 3274 policy != SCHED_FIFO && policy != SCHED_RR &&
3252 policy != SCHED_NORMAL && policy != SCHED_BATCH && 3275 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
3253 policy != SCHED_IDLE) 3276 policy != SCHED_IDLE)
3254 return -EINVAL; 3277 return -EINVAL;
3255 } 3278 }
3256 3279
3257 if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK)) 3280 if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
3258 return -EINVAL; 3281 return -EINVAL;
3259 3282
3260 /* 3283 /*
3261 * Valid priorities for SCHED_FIFO and SCHED_RR are 3284 * Valid priorities for SCHED_FIFO and SCHED_RR are
3262 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 3285 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
3263 * SCHED_BATCH and SCHED_IDLE is 0. 3286 * SCHED_BATCH and SCHED_IDLE is 0.
3264 */ 3287 */
3265 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || 3288 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
3266 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) 3289 (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
3267 return -EINVAL; 3290 return -EINVAL;
3268 if ((dl_policy(policy) && !__checkparam_dl(attr)) || 3291 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
3269 (rt_policy(policy) != (attr->sched_priority != 0))) 3292 (rt_policy(policy) != (attr->sched_priority != 0)))
3270 return -EINVAL; 3293 return -EINVAL;
3271 3294
3272 /* 3295 /*
3273 * Allow unprivileged RT tasks to decrease priority: 3296 * Allow unprivileged RT tasks to decrease priority:
3274 */ 3297 */
3275 if (user && !capable(CAP_SYS_NICE)) { 3298 if (user && !capable(CAP_SYS_NICE)) {
3276 if (fair_policy(policy)) { 3299 if (fair_policy(policy)) {
3277 if (attr->sched_nice < task_nice(p) && 3300 if (attr->sched_nice < task_nice(p) &&
3278 !can_nice(p, attr->sched_nice)) 3301 !can_nice(p, attr->sched_nice))
3279 return -EPERM; 3302 return -EPERM;
3280 } 3303 }
3281 3304
3282 if (rt_policy(policy)) { 3305 if (rt_policy(policy)) {
3283 unsigned long rlim_rtprio = 3306 unsigned long rlim_rtprio =
3284 task_rlimit(p, RLIMIT_RTPRIO); 3307 task_rlimit(p, RLIMIT_RTPRIO);
3285 3308
3286 /* can't set/change the rt policy */ 3309 /* can't set/change the rt policy */
3287 if (policy != p->policy && !rlim_rtprio) 3310 if (policy != p->policy && !rlim_rtprio)
3288 return -EPERM; 3311 return -EPERM;
3289 3312
3290 /* can't increase priority */ 3313 /* can't increase priority */
3291 if (attr->sched_priority > p->rt_priority && 3314 if (attr->sched_priority > p->rt_priority &&
3292 attr->sched_priority > rlim_rtprio) 3315 attr->sched_priority > rlim_rtprio)
3293 return -EPERM; 3316 return -EPERM;
3294 } 3317 }
3295 3318
3296 /* 3319 /*
3297 * Can't set/change SCHED_DEADLINE policy at all for now 3320 * Can't set/change SCHED_DEADLINE policy at all for now
3298 * (safest behavior); in the future we would like to allow 3321 * (safest behavior); in the future we would like to allow
3299 * unprivileged DL tasks to increase their relative deadline 3322 * unprivileged DL tasks to increase their relative deadline
3300 * or reduce their runtime (both ways reducing utilization) 3323 * or reduce their runtime (both ways reducing utilization)
3301 */ 3324 */
3302 if (dl_policy(policy)) 3325 if (dl_policy(policy))
3303 return -EPERM; 3326 return -EPERM;
3304 3327
3305 /* 3328 /*
3306 * Treat SCHED_IDLE as nice 20. Only allow a switch to 3329 * Treat SCHED_IDLE as nice 20. Only allow a switch to
3307 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 3330 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
3308 */ 3331 */
3309 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 3332 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
3310 if (!can_nice(p, task_nice(p))) 3333 if (!can_nice(p, task_nice(p)))
3311 return -EPERM; 3334 return -EPERM;
3312 } 3335 }
3313 3336
3314 /* can't change other user's priorities */ 3337 /* can't change other user's priorities */
3315 if (!check_same_owner(p)) 3338 if (!check_same_owner(p))
3316 return -EPERM; 3339 return -EPERM;
3317 3340
3318 /* Normal users shall not reset the sched_reset_on_fork flag */ 3341 /* Normal users shall not reset the sched_reset_on_fork flag */
3319 if (p->sched_reset_on_fork && !reset_on_fork) 3342 if (p->sched_reset_on_fork && !reset_on_fork)
3320 return -EPERM; 3343 return -EPERM;
3321 } 3344 }
3322 3345
3323 if (user) { 3346 if (user) {
3324 retval = security_task_setscheduler(p); 3347 retval = security_task_setscheduler(p);
3325 if (retval) 3348 if (retval)
3326 return retval; 3349 return retval;
3327 } 3350 }
3328 3351
3329 /* 3352 /*
3330 * make sure no PI-waiters arrive (or leave) while we are 3353 * make sure no PI-waiters arrive (or leave) while we are
3331 * changing the priority of the task: 3354 * changing the priority of the task:
3332 * 3355 *
3333 * To be able to change p->policy safely, the appropriate 3356 * To be able to change p->policy safely, the appropriate
3334 * runqueue lock must be held. 3357 * runqueue lock must be held.
3335 */ 3358 */
3336 rq = task_rq_lock(p, &flags); 3359 rq = task_rq_lock(p, &flags);
3337 3360
3338 /* 3361 /*
3339 * Changing the policy of the stop threads is a very bad idea 3362 * Changing the policy of the stop threads is a very bad idea
3340 */ 3363 */
3341 if (p == rq->stop) { 3364 if (p == rq->stop) {
3342 task_rq_unlock(rq, p, &flags); 3365 task_rq_unlock(rq, p, &flags);
3343 return -EINVAL; 3366 return -EINVAL;
3344 } 3367 }
3345 3368
3346 /* 3369 /*
3347 * If not changing anything there's no need to proceed further, 3370 * If not changing anything there's no need to proceed further,
3348 * but store a possible modification of reset_on_fork. 3371 * but store a possible modification of reset_on_fork.
3349 */ 3372 */
3350 if (unlikely(policy == p->policy)) { 3373 if (unlikely(policy == p->policy)) {
3351 if (fair_policy(policy) && attr->sched_nice != task_nice(p)) 3374 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
3352 goto change; 3375 goto change;
3353 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 3376 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
3354 goto change; 3377 goto change;
3355 if (dl_policy(policy)) 3378 if (dl_policy(policy))
3356 goto change; 3379 goto change;
3357 3380
3358 p->sched_reset_on_fork = reset_on_fork; 3381 p->sched_reset_on_fork = reset_on_fork;
3359 task_rq_unlock(rq, p, &flags); 3382 task_rq_unlock(rq, p, &flags);
3360 return 0; 3383 return 0;
3361 } 3384 }
3362 change: 3385 change:
3363 3386
3364 if (user) { 3387 if (user) {
3365 #ifdef CONFIG_RT_GROUP_SCHED 3388 #ifdef CONFIG_RT_GROUP_SCHED
3366 /* 3389 /*
3367 * Do not allow realtime tasks into groups that have no runtime 3390 * Do not allow realtime tasks into groups that have no runtime
3368 * assigned. 3391 * assigned.
3369 */ 3392 */
3370 if (rt_bandwidth_enabled() && rt_policy(policy) && 3393 if (rt_bandwidth_enabled() && rt_policy(policy) &&
3371 task_group(p)->rt_bandwidth.rt_runtime == 0 && 3394 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
3372 !task_group_is_autogroup(task_group(p))) { 3395 !task_group_is_autogroup(task_group(p))) {
3373 task_rq_unlock(rq, p, &flags); 3396 task_rq_unlock(rq, p, &flags);
3374 return -EPERM; 3397 return -EPERM;
3375 } 3398 }
3376 #endif 3399 #endif
3377 #ifdef CONFIG_SMP 3400 #ifdef CONFIG_SMP
3378 if (dl_bandwidth_enabled() && dl_policy(policy)) { 3401 if (dl_bandwidth_enabled() && dl_policy(policy)) {
3379 cpumask_t *span = rq->rd->span; 3402 cpumask_t *span = rq->rd->span;
3380 3403
3381 /* 3404 /*
3382 * Don't allow tasks with an affinity mask smaller than 3405 * Don't allow tasks with an affinity mask smaller than
3383 * the entire root_domain to become SCHED_DEADLINE. We 3406 * the entire root_domain to become SCHED_DEADLINE. We
3384 * will also fail if there's no bandwidth available. 3407 * will also fail if there's no bandwidth available.
3385 */ 3408 */
3386 if (!cpumask_subset(span, &p->cpus_allowed) || 3409 if (!cpumask_subset(span, &p->cpus_allowed) ||
3387 rq->rd->dl_bw.bw == 0) { 3410 rq->rd->dl_bw.bw == 0) {
3388 task_rq_unlock(rq, p, &flags); 3411 task_rq_unlock(rq, p, &flags);
3389 return -EPERM; 3412 return -EPERM;
3390 } 3413 }
3391 } 3414 }
3392 #endif 3415 #endif
3393 } 3416 }
3394 3417
3395 /* recheck policy now with rq lock held */ 3418 /* recheck policy now with rq lock held */
3396 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 3419 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3397 policy = oldpolicy = -1; 3420 policy = oldpolicy = -1;
3398 task_rq_unlock(rq, p, &flags); 3421 task_rq_unlock(rq, p, &flags);
3399 goto recheck; 3422 goto recheck;
3400 } 3423 }
3401 3424
3402 /* 3425 /*
3403 * If setscheduling to SCHED_DEADLINE (or changing the parameters 3426 * If setscheduling to SCHED_DEADLINE (or changing the parameters
3404 * of a SCHED_DEADLINE task) we need to check if enough bandwidth 3427 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
3405 * is available. 3428 * is available.
3406 */ 3429 */
3407 if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) { 3430 if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
3408 task_rq_unlock(rq, p, &flags); 3431 task_rq_unlock(rq, p, &flags);
3409 return -EBUSY; 3432 return -EBUSY;
3410 } 3433 }
3411 3434
3412 p->sched_reset_on_fork = reset_on_fork; 3435 p->sched_reset_on_fork = reset_on_fork;
3413 oldprio = p->prio; 3436 oldprio = p->prio;
3414 3437
3415 /* 3438 /*
3416 * Special case for priority boosted tasks. 3439 * Special case for priority boosted tasks.
3417 * 3440 *
3418 * If the new priority is lower or equal (user space view) 3441 * If the new priority is lower or equal (user space view)
3419 * than the current (boosted) priority, we just store the new 3442 * than the current (boosted) priority, we just store the new
3420 * normal parameters and do not touch the scheduler class and 3443 * normal parameters and do not touch the scheduler class and
3421 * the runqueue. This will be done when the task deboosts 3444 * the runqueue. This will be done when the task deboosts
3422 * itself. 3445 * itself.
3423 */ 3446 */
3424 if (rt_mutex_check_prio(p, newprio)) { 3447 if (rt_mutex_check_prio(p, newprio)) {
3425 __setscheduler_params(p, attr); 3448 __setscheduler_params(p, attr);
3426 task_rq_unlock(rq, p, &flags); 3449 task_rq_unlock(rq, p, &flags);
3427 return 0; 3450 return 0;
3428 } 3451 }
3429 3452
3430 on_rq = p->on_rq; 3453 on_rq = p->on_rq;
3431 running = task_current(rq, p); 3454 running = task_current(rq, p);
3432 if (on_rq) 3455 if (on_rq)
3433 dequeue_task(rq, p, 0); 3456 dequeue_task(rq, p, 0);
3434 if (running) 3457 if (running)
3435 p->sched_class->put_prev_task(rq, p); 3458 p->sched_class->put_prev_task(rq, p);
3436 3459
3437 prev_class = p->sched_class; 3460 prev_class = p->sched_class;
3438 __setscheduler(rq, p, attr); 3461 __setscheduler(rq, p, attr);
3439 3462
3440 if (running) 3463 if (running)
3441 p->sched_class->set_curr_task(rq); 3464 p->sched_class->set_curr_task(rq);
3442 if (on_rq) { 3465 if (on_rq) {
3443 /* 3466 /*
3444 * We enqueue to tail when the priority of a task is 3467 * We enqueue to tail when the priority of a task is
3445 * increased (user space view). 3468 * increased (user space view).
3446 */ 3469 */
3447 enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0); 3470 enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
3448 } 3471 }
3449 3472
3450 check_class_changed(rq, p, prev_class, oldprio); 3473 check_class_changed(rq, p, prev_class, oldprio);
3451 task_rq_unlock(rq, p, &flags); 3474 task_rq_unlock(rq, p, &flags);
3452 3475
3453 rt_mutex_adjust_pi(p); 3476 rt_mutex_adjust_pi(p);
3454 3477
3455 return 0; 3478 return 0;
3456 } 3479 }
3457 3480
3458 static int _sched_setscheduler(struct task_struct *p, int policy, 3481 static int _sched_setscheduler(struct task_struct *p, int policy,
3459 const struct sched_param *param, bool check) 3482 const struct sched_param *param, bool check)
3460 { 3483 {
3461 struct sched_attr attr = { 3484 struct sched_attr attr = {
3462 .sched_policy = policy, 3485 .sched_policy = policy,
3463 .sched_priority = param->sched_priority, 3486 .sched_priority = param->sched_priority,
3464 .sched_nice = PRIO_TO_NICE(p->static_prio), 3487 .sched_nice = PRIO_TO_NICE(p->static_prio),
3465 }; 3488 };
3466 3489
3467 /* 3490 /*
3468 * Fixup the legacy SCHED_RESET_ON_FORK hack 3491 * Fixup the legacy SCHED_RESET_ON_FORK hack
3469 */ 3492 */
3470 if (policy & SCHED_RESET_ON_FORK) { 3493 if (policy & SCHED_RESET_ON_FORK) {
3471 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 3494 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3472 policy &= ~SCHED_RESET_ON_FORK; 3495 policy &= ~SCHED_RESET_ON_FORK;
3473 attr.sched_policy = policy; 3496 attr.sched_policy = policy;
3474 } 3497 }
3475 3498
3476 return __sched_setscheduler(p, &attr, check); 3499 return __sched_setscheduler(p, &attr, check);
3477 } 3500 }
3478 /** 3501 /**
3479 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 3502 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
3480 * @p: the task in question. 3503 * @p: the task in question.
3481 * @policy: new policy. 3504 * @policy: new policy.
3482 * @param: structure containing the new RT priority. 3505 * @param: structure containing the new RT priority.
3483 * 3506 *
3484 * Return: 0 on success. An error code otherwise. 3507 * Return: 0 on success. An error code otherwise.
3485 * 3508 *
3486 * NOTE that the task may already be dead. 3509 * NOTE that the task may already be dead.
3487 */ 3510 */
3488 int sched_setscheduler(struct task_struct *p, int policy, 3511 int sched_setscheduler(struct task_struct *p, int policy,
3489 const struct sched_param *param) 3512 const struct sched_param *param)
3490 { 3513 {
3491 return _sched_setscheduler(p, policy, param, true); 3514 return _sched_setscheduler(p, policy, param, true);
3492 } 3515 }
3493 EXPORT_SYMBOL_GPL(sched_setscheduler); 3516 EXPORT_SYMBOL_GPL(sched_setscheduler);
3494 3517
3495 int sched_setattr(struct task_struct *p, const struct sched_attr *attr) 3518 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
3496 { 3519 {
3497 return __sched_setscheduler(p, attr, true); 3520 return __sched_setscheduler(p, attr, true);
3498 } 3521 }
3499 EXPORT_SYMBOL_GPL(sched_setattr); 3522 EXPORT_SYMBOL_GPL(sched_setattr);
3500 3523
3501 /** 3524 /**
3502 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 3525 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
3503 * @p: the task in question. 3526 * @p: the task in question.
3504 * @policy: new policy. 3527 * @policy: new policy.
3505 * @param: structure containing the new RT priority. 3528 * @param: structure containing the new RT priority.
3506 * 3529 *
3507 * Just like sched_setscheduler, only don't bother checking if the 3530 * Just like sched_setscheduler, only don't bother checking if the
3508 * current context has permission. For example, this is needed in 3531 * current context has permission. For example, this is needed in
3509 * stop_machine(): we create temporary high priority worker threads, 3532 * stop_machine(): we create temporary high priority worker threads,
3510 * but our caller might not have that capability. 3533 * but our caller might not have that capability.
3511 * 3534 *
3512 * Return: 0 on success. An error code otherwise. 3535 * Return: 0 on success. An error code otherwise.
3513 */ 3536 */
3514 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 3537 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
3515 const struct sched_param *param) 3538 const struct sched_param *param)
3516 { 3539 {
3517 return _sched_setscheduler(p, policy, param, false); 3540 return _sched_setscheduler(p, policy, param, false);
3518 } 3541 }
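/*
 * A minimal in-kernel sketch (not part of this diff) of the use case the
 * kernel-doc above describes: trusted kernel code promoting a worker thread
 * it already owns to SCHED_FIFO without a capability check. The function and
 * its "worker" argument are hypothetical; only sched_setscheduler_nocheck(),
 * struct sched_param and MAX_USER_RT_PRIO come from this file, and the usual
 * <linux/sched.h> declarations are assumed.
 */
static void example_make_worker_rt(struct task_struct *worker)
{
	struct sched_param sp = { .sched_priority = MAX_USER_RT_PRIO - 1 };

	/* No CAP_SYS_NICE or same-owner checks: callers are kernel code. */
	sched_setscheduler_nocheck(worker, SCHED_FIFO, &sp);
}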
3519 3542
3520 static int 3543 static int
3521 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 3544 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3522 { 3545 {
3523 struct sched_param lparam; 3546 struct sched_param lparam;
3524 struct task_struct *p; 3547 struct task_struct *p;
3525 int retval; 3548 int retval;
3526 3549
3527 if (!param || pid < 0) 3550 if (!param || pid < 0)
3528 return -EINVAL; 3551 return -EINVAL;
3529 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 3552 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3530 return -EFAULT; 3553 return -EFAULT;
3531 3554
3532 rcu_read_lock(); 3555 rcu_read_lock();
3533 retval = -ESRCH; 3556 retval = -ESRCH;
3534 p = find_process_by_pid(pid); 3557 p = find_process_by_pid(pid);
3535 if (p != NULL) 3558 if (p != NULL)
3536 retval = sched_setscheduler(p, policy, &lparam); 3559 retval = sched_setscheduler(p, policy, &lparam);
3537 rcu_read_unlock(); 3560 rcu_read_unlock();
3538 3561
3539 return retval; 3562 return retval;
3540 } 3563 }
3541 3564
3542 /* 3565 /*
3543 * Mimics kernel/events/core.c perf_copy_attr(). 3566 * Mimics kernel/events/core.c perf_copy_attr().
3544 */ 3567 */
3545 static int sched_copy_attr(struct sched_attr __user *uattr, 3568 static int sched_copy_attr(struct sched_attr __user *uattr,
3546 struct sched_attr *attr) 3569 struct sched_attr *attr)
3547 { 3570 {
3548 u32 size; 3571 u32 size;
3549 int ret; 3572 int ret;
3550 3573
3551 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) 3574 if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
3552 return -EFAULT; 3575 return -EFAULT;
3553 3576
3554 /* 3577 /*
3555 * zero the full structure, so that a short copy leaves the rest zeroed. 3578 * zero the full structure, so that a short copy leaves the rest zeroed.
3556 */ 3579 */
3557 memset(attr, 0, sizeof(*attr)); 3580 memset(attr, 0, sizeof(*attr));
3558 3581
3559 ret = get_user(size, &uattr->size); 3582 ret = get_user(size, &uattr->size);
3560 if (ret) 3583 if (ret)
3561 return ret; 3584 return ret;
3562 3585
3563 if (size > PAGE_SIZE) /* silly large */ 3586 if (size > PAGE_SIZE) /* silly large */
3564 goto err_size; 3587 goto err_size;
3565 3588
3566 if (!size) /* abi compat */ 3589 if (!size) /* abi compat */
3567 size = SCHED_ATTR_SIZE_VER0; 3590 size = SCHED_ATTR_SIZE_VER0;
3568 3591
3569 if (size < SCHED_ATTR_SIZE_VER0) 3592 if (size < SCHED_ATTR_SIZE_VER0)
3570 goto err_size; 3593 goto err_size;
3571 3594
3572 /* 3595 /*
3573 * If we're handed a bigger struct than we know of, 3596 * If we're handed a bigger struct than we know of,
3574 * ensure all the unknown bits are 0 - i.e. new 3597 * ensure all the unknown bits are 0 - i.e. new
3575 * user-space does not rely on any kernel feature 3598 * user-space does not rely on any kernel feature
3576 * extensions we don't know about yet. 3599 * extensions we don't know about yet.
3577 */ 3600 */
3578 if (size > sizeof(*attr)) { 3601 if (size > sizeof(*attr)) {
3579 unsigned char __user *addr; 3602 unsigned char __user *addr;
3580 unsigned char __user *end; 3603 unsigned char __user *end;
3581 unsigned char val; 3604 unsigned char val;
3582 3605
3583 addr = (void __user *)uattr + sizeof(*attr); 3606 addr = (void __user *)uattr + sizeof(*attr);
3584 end = (void __user *)uattr + size; 3607 end = (void __user *)uattr + size;
3585 3608
3586 for (; addr < end; addr++) { 3609 for (; addr < end; addr++) {
3587 ret = get_user(val, addr); 3610 ret = get_user(val, addr);
3588 if (ret) 3611 if (ret)
3589 return ret; 3612 return ret;
3590 if (val) 3613 if (val)
3591 goto err_size; 3614 goto err_size;
3592 } 3615 }
3593 size = sizeof(*attr); 3616 size = sizeof(*attr);
3594 } 3617 }
3595 3618
3596 ret = copy_from_user(attr, uattr, size); 3619 ret = copy_from_user(attr, uattr, size);
3597 if (ret) 3620 if (ret)
3598 return -EFAULT; 3621 return -EFAULT;
3599 3622
3600 /* 3623 /*
3601 * XXX: do we want to be lenient like existing syscalls; or do we want 3624 * XXX: do we want to be lenient like existing syscalls; or do we want
3602 * to be strict and return an error on out-of-bounds values? 3625 * to be strict and return an error on out-of-bounds values?
3603 */ 3626 */
3604 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE); 3627 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
3605 3628
3606 out: 3629 out:
3607 return ret; 3630 return ret;
3608 3631
3609 err_size: 3632 err_size:
3610 put_user(sizeof(*attr), &uattr->size); 3633 put_user(sizeof(*attr), &uattr->size);
3611 ret = -E2BIG; 3634 ret = -E2BIG;
3612 goto out; 3635 goto out;
3613 } 3636 }
3614 3637
3615 /** 3638 /**
3616 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 3639 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
3617 * @pid: the pid in question. 3640 * @pid: the pid in question.
3618 * @policy: new policy. 3641 * @policy: new policy.
3619 * @param: structure containing the new RT priority. 3642 * @param: structure containing the new RT priority.
3620 * 3643 *
3621 * Return: 0 on success. An error code otherwise. 3644 * Return: 0 on success. An error code otherwise.
3622 */ 3645 */
3623 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 3646 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
3624 struct sched_param __user *, param) 3647 struct sched_param __user *, param)
3625 { 3648 {
3626 /* negative values for policy are not valid */ 3649 /* negative values for policy are not valid */
3627 if (policy < 0) 3650 if (policy < 0)
3628 return -EINVAL; 3651 return -EINVAL;
3629 3652
3630 return do_sched_setscheduler(pid, policy, param); 3653 return do_sched_setscheduler(pid, policy, param);
3631 } 3654 }
3632 3655
3633 /** 3656 /**
3634 * sys_sched_setparam - set/change the RT priority of a thread 3657 * sys_sched_setparam - set/change the RT priority of a thread
3635 * @pid: the pid in question. 3658 * @pid: the pid in question.
3636 * @param: structure containing the new RT priority. 3659 * @param: structure containing the new RT priority.
3637 * 3660 *
3638 * Return: 0 on success. An error code otherwise. 3661 * Return: 0 on success. An error code otherwise.
3639 */ 3662 */
3640 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 3663 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
3641 { 3664 {
3642 return do_sched_setscheduler(pid, -1, param); 3665 return do_sched_setscheduler(pid, -1, param);
3643 } 3666 }
3644 3667
3645 /** 3668 /**
3646 * sys_sched_setattr - same as above, but with extended sched_attr 3669 * sys_sched_setattr - same as above, but with extended sched_attr
3647 * @pid: the pid in question. 3670 * @pid: the pid in question.
3648 * @uattr: structure containing the extended parameters. 3671 * @uattr: structure containing the extended parameters.
3649 * @flags: for future extension. 3672 * @flags: for future extension.
3650 */ 3673 */
3651 SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, 3674 SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
3652 unsigned int, flags) 3675 unsigned int, flags)
3653 { 3676 {
3654 struct sched_attr attr; 3677 struct sched_attr attr;
3655 struct task_struct *p; 3678 struct task_struct *p;
3656 int retval; 3679 int retval;
3657 3680
3658 if (!uattr || pid < 0 || flags) 3681 if (!uattr || pid < 0 || flags)
3659 return -EINVAL; 3682 return -EINVAL;
3660 3683
3661 if (sched_copy_attr(uattr, &attr)) 3684 retval = sched_copy_attr(uattr, &attr);
3662 return -EFAULT; 3685 if (retval)
3686 return retval;
3663 3687
3688 if (attr.sched_policy < 0)
3689 return -EINVAL;
3690
3664 rcu_read_lock(); 3691 rcu_read_lock();
3665 retval = -ESRCH; 3692 retval = -ESRCH;
3666 p = find_process_by_pid(pid); 3693 p = find_process_by_pid(pid);
3667 if (p != NULL) 3694 if (p != NULL)
3668 retval = sched_setattr(p, &attr); 3695 retval = sched_setattr(p, &attr);
3669 rcu_read_unlock(); 3696 rcu_read_unlock();
3670 3697
3671 return retval; 3698 return retval;
3672 } 3699 }
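/*
 * A hedged userspace sketch (not part of this diff): there is no glibc
 * wrapper for sched_setattr(2) here, so it is invoked through syscall(2).
 * The struct layout below is an assumption meant to mirror the kernel's
 * struct sched_attr (size, sched_policy, sched_flags, sched_nice,
 * sched_priority plus the deadline fields); verify it against the headers of
 * the kernel actually running. __NR_sched_setattr must be provided by
 * sufficiently new kernel headers.
 */
#define _GNU_SOURCE
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE 6		/* assumed to match the kernel uapi value */
#endif

struct sched_attr_example {
	uint32_t size;			/* size of this structure */
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;		/* SCHED_NORMAL, SCHED_BATCH */
	uint32_t sched_priority;	/* SCHED_FIFO, SCHED_RR */
	uint64_t sched_runtime;		/* SCHED_DEADLINE, nanoseconds */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

/*
 * Ask for SCHED_DEADLINE with 10ms of runtime every 30ms. Per the checks in
 * __sched_setscheduler() above, this needs CAP_SYS_NICE, an affinity mask
 * spanning the whole root_domain, and enough free deadline bandwidth.
 */
static int example_become_deadline(void)
{
	struct sched_attr_example attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy = SCHED_DEADLINE;
	attr.sched_runtime  = 10 * 1000 * 1000;
	attr.sched_deadline = 30 * 1000 * 1000;
	attr.sched_period   = 30 * 1000 * 1000;

	/* pid 0 means the calling thread; flags must currently be 0. */
	return syscall(__NR_sched_setattr, 0, &attr, 0);
}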
3673 3700
3674 /** 3701 /**
3675 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 3702 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
3676 * @pid: the pid in question. 3703 * @pid: the pid in question.
3677 * 3704 *
3678 * Return: On success, the policy of the thread. Otherwise, a negative error 3705 * Return: On success, the policy of the thread. Otherwise, a negative error
3679 * code. 3706 * code.
3680 */ 3707 */
3681 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 3708 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
3682 { 3709 {
3683 struct task_struct *p; 3710 struct task_struct *p;
3684 int retval; 3711 int retval;
3685 3712
3686 if (pid < 0) 3713 if (pid < 0)
3687 return -EINVAL; 3714 return -EINVAL;
3688 3715
3689 retval = -ESRCH; 3716 retval = -ESRCH;
3690 rcu_read_lock(); 3717 rcu_read_lock();
3691 p = find_process_by_pid(pid); 3718 p = find_process_by_pid(pid);
3692 if (p) { 3719 if (p) {
3693 retval = security_task_getscheduler(p); 3720 retval = security_task_getscheduler(p);
3694 if (!retval) 3721 if (!retval)
3695 retval = p->policy 3722 retval = p->policy
3696 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 3723 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
3697 } 3724 }
3698 rcu_read_unlock(); 3725 rcu_read_unlock();
3699 return retval; 3726 return retval;
3700 } 3727 }
3701 3728
3702 /** 3729 /**
3703 * sys_sched_getparam - get the RT priority of a thread 3730 * sys_sched_getparam - get the RT priority of a thread
3704 * @pid: the pid in question. 3731 * @pid: the pid in question.
3705 * @param: structure containing the RT priority. 3732 * @param: structure containing the RT priority.
3706 * 3733 *
3707 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error 3734 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
3708 * code. 3735 * code.
3709 */ 3736 */
3710 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 3737 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
3711 { 3738 {
3712 struct sched_param lp; 3739 struct sched_param lp = { .sched_priority = 0 };
3713 struct task_struct *p; 3740 struct task_struct *p;
3714 int retval; 3741 int retval;
3715 3742
3716 if (!param || pid < 0) 3743 if (!param || pid < 0)
3717 return -EINVAL; 3744 return -EINVAL;
3718 3745
3719 rcu_read_lock(); 3746 rcu_read_lock();
3720 p = find_process_by_pid(pid); 3747 p = find_process_by_pid(pid);
3721 retval = -ESRCH; 3748 retval = -ESRCH;
3722 if (!p) 3749 if (!p)
3723 goto out_unlock; 3750 goto out_unlock;
3724 3751
3725 retval = security_task_getscheduler(p); 3752 retval = security_task_getscheduler(p);
3726 if (retval) 3753 if (retval)
3727 goto out_unlock; 3754 goto out_unlock;
3728 3755
3729 if (task_has_dl_policy(p)) { 3756 if (task_has_rt_policy(p))
3730 retval = -EINVAL; 3757 lp.sched_priority = p->rt_priority;
3731 goto out_unlock;
3732 }
3733 lp.sched_priority = p->rt_priority;
3734 rcu_read_unlock(); 3758 rcu_read_unlock();
3735 3759
3736 /* 3760 /*
3737 * This one might sleep, we cannot do it with a spinlock held ... 3761 * This one might sleep, we cannot do it with a spinlock held ...
3738 */ 3762 */
3739 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 3763 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
3740 3764
3741 return retval; 3765 return retval;
3742 3766
3743 out_unlock: 3767 out_unlock:
3744 rcu_read_unlock(); 3768 rcu_read_unlock();
3745 return retval; 3769 return retval;
3746 } 3770 }
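/*
 * A userspace sketch (not from this diff) using the glibc wrapper for
 * sched_getparam(2). For SCHED_FIFO/SCHED_RR threads it returns the RT
 * priority; for everything else (including, after the change above,
 * SCHED_DEADLINE tasks) sched_priority simply comes back as 0.
 */
#include <sched.h>

static int example_query_rt_priority(pid_t pid)
{
	struct sched_param sp;

	if (sched_getparam(pid, &sp))	/* pid 0 means the calling thread */
		return -1;

	return sp.sched_priority;
}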
3747 3771
3748 static int sched_read_attr(struct sched_attr __user *uattr, 3772 static int sched_read_attr(struct sched_attr __user *uattr,
3749 struct sched_attr *attr, 3773 struct sched_attr *attr,
3750 unsigned int usize) 3774 unsigned int usize)
3751 { 3775 {
3752 int ret; 3776 int ret;
3753 3777
3754 if (!access_ok(VERIFY_WRITE, uattr, usize)) 3778 if (!access_ok(VERIFY_WRITE, uattr, usize))
3755 return -EFAULT; 3779 return -EFAULT;
3756 3780
3757 /* 3781 /*
3758 * If we're handed a smaller struct than we know of, 3782 * If we're handed a smaller struct than we know of,
3759 * ensure all the unknown bits are 0 - i.e. old 3783 * ensure all the unknown bits are 0 - i.e. old
3760 * user-space does not get incomplete information. 3784 * user-space does not get incomplete information.
3761 */ 3785 */
3762 if (usize < sizeof(*attr)) { 3786 if (usize < sizeof(*attr)) {
3763 unsigned char *addr; 3787 unsigned char *addr;
3764 unsigned char *end; 3788 unsigned char *end;
3765 3789
3766 addr = (void *)attr + usize; 3790 addr = (void *)attr + usize;
3767 end = (void *)attr + sizeof(*attr); 3791 end = (void *)attr + sizeof(*attr);
3768 3792
3769 for (; addr < end; addr++) { 3793 for (; addr < end; addr++) {
3770 if (*addr) 3794 if (*addr)
3771 goto err_size; 3795 goto err_size;
3772 } 3796 }
3773 3797
3774 attr->size = usize; 3798 attr->size = usize;
3775 } 3799 }
3776 3800
3777 ret = copy_to_user(uattr, attr, attr->size); 3801 ret = copy_to_user(uattr, attr, attr->size);
3778 if (ret) 3802 if (ret)
3779 return -EFAULT; 3803 return -EFAULT;
3780 3804
3781 out: 3805 out:
3782 return ret; 3806 return ret;
3783 3807
3784 err_size: 3808 err_size:
3785 ret = -E2BIG; 3809 ret = -E2BIG;
3786 goto out; 3810 goto out;
3787 } 3811 }
3788 3812
3789 /** 3813 /**
3790 * sys_sched_getattr - similar to sched_getparam, but with sched_attr 3814 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
3791 * @pid: the pid in question. 3815 * @pid: the pid in question.
3792 * @uattr: structure containing the extended parameters. 3816 * @uattr: structure containing the extended parameters.
3793 * @size: sizeof(attr) for fwd/bwd comp. 3817 * @size: sizeof(attr) for fwd/bwd comp.
3794 * @flags: for future extension. 3818 * @flags: for future extension.
3795 */ 3819 */
3796 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, 3820 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3797 unsigned int, size, unsigned int, flags) 3821 unsigned int, size, unsigned int, flags)
3798 { 3822 {
3799 struct sched_attr attr = { 3823 struct sched_attr attr = {
3800 .size = sizeof(struct sched_attr), 3824 .size = sizeof(struct sched_attr),
3801 }; 3825 };
3802 struct task_struct *p; 3826 struct task_struct *p;
3803 int retval; 3827 int retval;
3804 3828
3805 if (!uattr || pid < 0 || size > PAGE_SIZE || 3829 if (!uattr || pid < 0 || size > PAGE_SIZE ||
3806 size < SCHED_ATTR_SIZE_VER0 || flags) 3830 size < SCHED_ATTR_SIZE_VER0 || flags)
3807 return -EINVAL; 3831 return -EINVAL;
3808 3832
3809 rcu_read_lock(); 3833 rcu_read_lock();
3810 p = find_process_by_pid(pid); 3834 p = find_process_by_pid(pid);
3811 retval = -ESRCH; 3835 retval = -ESRCH;
3812 if (!p) 3836 if (!p)
3813 goto out_unlock; 3837 goto out_unlock;
3814 3838
3815 retval = security_task_getscheduler(p); 3839 retval = security_task_getscheduler(p);
3816 if (retval) 3840 if (retval)
3817 goto out_unlock; 3841 goto out_unlock;
3818 3842
3819 attr.sched_policy = p->policy; 3843 attr.sched_policy = p->policy;
3820 if (p->sched_reset_on_fork) 3844 if (p->sched_reset_on_fork)
3821 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 3845 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
3822 if (task_has_dl_policy(p)) 3846 if (task_has_dl_policy(p))
3823 __getparam_dl(p, &attr); 3847 __getparam_dl(p, &attr);
3824 else if (task_has_rt_policy(p)) 3848 else if (task_has_rt_policy(p))
3825 attr.sched_priority = p->rt_priority; 3849 attr.sched_priority = p->rt_priority;
3826 else 3850 else
3827 attr.sched_nice = task_nice(p); 3851 attr.sched_nice = task_nice(p);
3828 3852
3829 rcu_read_unlock(); 3853 rcu_read_unlock();
3830 3854
3831 retval = sched_read_attr(uattr, &attr, size); 3855 retval = sched_read_attr(uattr, &attr, size);
3832 return retval; 3856 return retval;
3833 3857
3834 out_unlock: 3858 out_unlock:
3835 rcu_read_unlock(); 3859 rcu_read_unlock();
3836 return retval; 3860 return retval;
3837 } 3861 }
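/*
 * Companion userspace sketch for sched_getattr(2): read the attributes back.
 * It reuses the hypothetical sched_attr_example layout from the
 * sched_setattr sketch above and again goes through syscall(2), since no
 * glibc wrapper exists; __NR_sched_getattr is assumed to be defined by the
 * kernel headers.
 */
static int example_read_attr(pid_t pid, struct sched_attr_example *attr)
{
	/* (pid, uattr, size, flags): size enables fwd/bwd compatibility. */
	return syscall(__NR_sched_getattr, pid, attr, sizeof(*attr), 0);
}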
3838 3862
3839 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 3863 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3840 { 3864 {
3841 cpumask_var_t cpus_allowed, new_mask; 3865 cpumask_var_t cpus_allowed, new_mask;
3842 struct task_struct *p; 3866 struct task_struct *p;
3843 int retval; 3867 int retval;
3844 3868
3845 rcu_read_lock(); 3869 rcu_read_lock();
3846 3870
3847 p = find_process_by_pid(pid); 3871 p = find_process_by_pid(pid);
3848 if (!p) { 3872 if (!p) {
3849 rcu_read_unlock(); 3873 rcu_read_unlock();
3850 return -ESRCH; 3874 return -ESRCH;
3851 } 3875 }
3852 3876
3853 /* Prevent p going away */ 3877 /* Prevent p going away */
3854 get_task_struct(p); 3878 get_task_struct(p);
3855 rcu_read_unlock(); 3879 rcu_read_unlock();
3856 3880
3857 if (p->flags & PF_NO_SETAFFINITY) { 3881 if (p->flags & PF_NO_SETAFFINITY) {
3858 retval = -EINVAL; 3882 retval = -EINVAL;
3859 goto out_put_task; 3883 goto out_put_task;
3860 } 3884 }
3861 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 3885 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
3862 retval = -ENOMEM; 3886 retval = -ENOMEM;
3863 goto out_put_task; 3887 goto out_put_task;
3864 } 3888 }
3865 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 3889 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
3866 retval = -ENOMEM; 3890 retval = -ENOMEM;
3867 goto out_free_cpus_allowed; 3891 goto out_free_cpus_allowed;
3868 } 3892 }
3869 retval = -EPERM; 3893 retval = -EPERM;
3870 if (!check_same_owner(p)) { 3894 if (!check_same_owner(p)) {
3871 rcu_read_lock(); 3895 rcu_read_lock();
3872 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { 3896 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
3873 rcu_read_unlock(); 3897 rcu_read_unlock();
3874 goto out_unlock; 3898 goto out_unlock;
3875 } 3899 }
3876 rcu_read_unlock(); 3900 rcu_read_unlock();
3877 } 3901 }
3878 3902
3879 retval = security_task_setscheduler(p); 3903 retval = security_task_setscheduler(p);
3880 if (retval) 3904 if (retval)
3881 goto out_unlock; 3905 goto out_unlock;
3882 3906
3883 3907
3884 cpuset_cpus_allowed(p, cpus_allowed); 3908 cpuset_cpus_allowed(p, cpus_allowed);
3885 cpumask_and(new_mask, in_mask, cpus_allowed); 3909 cpumask_and(new_mask, in_mask, cpus_allowed);
3886 3910
3887 /* 3911 /*
3888 * Since bandwidth control happens on a per-root_domain basis, 3912 * Since bandwidth control happens on a per-root_domain basis,
3889 * if the admission test is enabled we only admit -deadline 3913 * if the admission test is enabled we only admit -deadline
3890 * tasks that are allowed to run on all the CPUs in the task's 3914 * tasks that are allowed to run on all the CPUs in the task's
3891 * root_domain. 3915 * root_domain.
3892 */ 3916 */
3893 #ifdef CONFIG_SMP 3917 #ifdef CONFIG_SMP
3894 if (task_has_dl_policy(p)) { 3918 if (task_has_dl_policy(p)) {
3895 const struct cpumask *span = task_rq(p)->rd->span; 3919 const struct cpumask *span = task_rq(p)->rd->span;
3896 3920
3897 if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) { 3921 if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
3898 retval = -EBUSY; 3922 retval = -EBUSY;
3899 goto out_unlock; 3923 goto out_unlock;
3900 } 3924 }
3901 } 3925 }
3902 #endif 3926 #endif
3903 again: 3927 again:
3904 retval = set_cpus_allowed_ptr(p, new_mask); 3928 retval = set_cpus_allowed_ptr(p, new_mask);
3905 3929
3906 if (!retval) { 3930 if (!retval) {
3907 cpuset_cpus_allowed(p, cpus_allowed); 3931 cpuset_cpus_allowed(p, cpus_allowed);
3908 if (!cpumask_subset(new_mask, cpus_allowed)) { 3932 if (!cpumask_subset(new_mask, cpus_allowed)) {
3909 /* 3933 /*
3910 * We must have raced with a concurrent cpuset 3934 * We must have raced with a concurrent cpuset
3911 * update. Just reset the cpus_allowed to the 3935 * update. Just reset the cpus_allowed to the
3912 * cpuset's cpus_allowed 3936 * cpuset's cpus_allowed
3913 */ 3937 */
3914 cpumask_copy(new_mask, cpus_allowed); 3938 cpumask_copy(new_mask, cpus_allowed);
3915 goto again; 3939 goto again;
3916 } 3940 }
3917 } 3941 }
3918 out_unlock: 3942 out_unlock:
3919 free_cpumask_var(new_mask); 3943 free_cpumask_var(new_mask);
3920 out_free_cpus_allowed: 3944 out_free_cpus_allowed:
3921 free_cpumask_var(cpus_allowed); 3945 free_cpumask_var(cpus_allowed);
3922 out_put_task: 3946 out_put_task:
3923 put_task_struct(p); 3947 put_task_struct(p);
3924 return retval; 3948 return retval;
3925 } 3949 }
3926 3950
3927 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 3951 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
3928 struct cpumask *new_mask) 3952 struct cpumask *new_mask)
3929 { 3953 {
3930 if (len < cpumask_size()) 3954 if (len < cpumask_size())
3931 cpumask_clear(new_mask); 3955 cpumask_clear(new_mask);
3932 else if (len > cpumask_size()) 3956 else if (len > cpumask_size())
3933 len = cpumask_size(); 3957 len = cpumask_size();
3934 3958
3935 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 3959 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
3936 } 3960 }
3937 3961
3938 /** 3962 /**
3939 * sys_sched_setaffinity - set the cpu affinity of a process 3963 * sys_sched_setaffinity - set the cpu affinity of a process
3940 * @pid: pid of the process 3964 * @pid: pid of the process
3941 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 3965 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3942 * @user_mask_ptr: user-space pointer to the new cpu mask 3966 * @user_mask_ptr: user-space pointer to the new cpu mask
3943 * 3967 *
3944 * Return: 0 on success. An error code otherwise. 3968 * Return: 0 on success. An error code otherwise.
3945 */ 3969 */
3946 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 3970 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
3947 unsigned long __user *, user_mask_ptr) 3971 unsigned long __user *, user_mask_ptr)
3948 { 3972 {
3949 cpumask_var_t new_mask; 3973 cpumask_var_t new_mask;
3950 int retval; 3974 int retval;
3951 3975
3952 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 3976 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
3953 return -ENOMEM; 3977 return -ENOMEM;
3954 3978
3955 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 3979 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
3956 if (retval == 0) 3980 if (retval == 0)
3957 retval = sched_setaffinity(pid, new_mask); 3981 retval = sched_setaffinity(pid, new_mask);
3958 free_cpumask_var(new_mask); 3982 free_cpumask_var(new_mask);
3959 return retval; 3983 return retval;
3960 } 3984 }
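/*
 * A userspace sketch (not part of this diff) pinning the calling thread to
 * CPU 0 with the glibc wrapper for sched_setaffinity(2). Note that, per the
 * check above, a SCHED_DEADLINE task shrinking its mask below the
 * root_domain span would get -EBUSY instead.
 */
#define _GNU_SOURCE
#include <sched.h>

static int example_pin_to_cpu0(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);

	/* pid 0 means the calling thread; len is the mask size in bytes. */
	return sched_setaffinity(0, sizeof(set), &set);
}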
3961 3985
3962 long sched_getaffinity(pid_t pid, struct cpumask *mask) 3986 long sched_getaffinity(pid_t pid, struct cpumask *mask)
3963 { 3987 {
3964 struct task_struct *p; 3988 struct task_struct *p;
3965 unsigned long flags; 3989 unsigned long flags;
3966 int retval; 3990 int retval;
3967 3991
3968 rcu_read_lock(); 3992 rcu_read_lock();
3969 3993
3970 retval = -ESRCH; 3994 retval = -ESRCH;
3971 p = find_process_by_pid(pid); 3995 p = find_process_by_pid(pid);
3972 if (!p) 3996 if (!p)
3973 goto out_unlock; 3997 goto out_unlock;
3974 3998
3975 retval = security_task_getscheduler(p); 3999 retval = security_task_getscheduler(p);
3976 if (retval) 4000 if (retval)
3977 goto out_unlock; 4001 goto out_unlock;
3978 4002
3979 raw_spin_lock_irqsave(&p->pi_lock, flags); 4003 raw_spin_lock_irqsave(&p->pi_lock, flags);
3980 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); 4004 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
3981 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4005 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3982 4006
3983 out_unlock: 4007 out_unlock:
3984 rcu_read_unlock(); 4008 rcu_read_unlock();
3985 4009
3986 return retval; 4010 return retval;
3987 } 4011 }
3988 4012
3989 /** 4013 /**
3990 * sys_sched_getaffinity - get the cpu affinity of a process 4014 * sys_sched_getaffinity - get the cpu affinity of a process
3991 * @pid: pid of the process 4015 * @pid: pid of the process
3992 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4016 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
3993 * @user_mask_ptr: user-space pointer to hold the current cpu mask 4017 * @user_mask_ptr: user-space pointer to hold the current cpu mask
3994 * 4018 *
3995 * Return: 0 on success. An error code otherwise. 4019 * Return: 0 on success. An error code otherwise.
3996 */ 4020 */
3997 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 4021 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
3998 unsigned long __user *, user_mask_ptr) 4022 unsigned long __user *, user_mask_ptr)
3999 { 4023 {
4000 int ret; 4024 int ret;
4001 cpumask_var_t mask; 4025 cpumask_var_t mask;
4002 4026
4003 if ((len * BITS_PER_BYTE) < nr_cpu_ids) 4027 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4004 return -EINVAL; 4028 return -EINVAL;
4005 if (len & (sizeof(unsigned long)-1)) 4029 if (len & (sizeof(unsigned long)-1))
4006 return -EINVAL; 4030 return -EINVAL;
4007 4031
4008 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 4032 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4009 return -ENOMEM; 4033 return -ENOMEM;
4010 4034
4011 ret = sched_getaffinity(pid, mask); 4035 ret = sched_getaffinity(pid, mask);
4012 if (ret == 0) { 4036 if (ret == 0) {
4013 size_t retlen = min_t(size_t, len, cpumask_size()); 4037 size_t retlen = min_t(size_t, len, cpumask_size());
4014 4038
4015 if (copy_to_user(user_mask_ptr, mask, retlen)) 4039 if (copy_to_user(user_mask_ptr, mask, retlen))
4016 ret = -EFAULT; 4040 ret = -EFAULT;
4017 else 4041 else
4018 ret = retlen; 4042 ret = retlen;
4019 } 4043 }
4020 free_cpumask_var(mask); 4044 free_cpumask_var(mask);
4021 4045
4022 return ret; 4046 return ret;
4023 } 4047 }
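/*
 * Companion sketch reading the mask back via the glibc wrapper for
 * sched_getaffinity(2). The raw syscall above returns the copied length on
 * success; the glibc wrapper converts that to 0.
 */
#define _GNU_SOURCE
#include <sched.h>

static int example_count_allowed_cpus(void)
{
	cpu_set_t set;

	if (sched_getaffinity(0, sizeof(set), &set))
		return -1;

	return CPU_COUNT(&set);		/* number of CPUs set in the mask */
}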
4024 4048
4025 /** 4049 /**
4026 * sys_sched_yield - yield the current processor to other threads. 4050 * sys_sched_yield - yield the current processor to other threads.
4027 * 4051 *
4028 * This function yields the current CPU to other tasks. If there are no 4052 * This function yields the current CPU to other tasks. If there are no
4029 * other threads running on this CPU, then this function will return. 4053 * other threads running on this CPU, then this function will return.
4030 * 4054 *
4031 * Return: 0. 4055 * Return: 0.
4032 */ 4056 */
4033 SYSCALL_DEFINE0(sched_yield) 4057 SYSCALL_DEFINE0(sched_yield)
4034 { 4058 {
4035 struct rq *rq = this_rq_lock(); 4059 struct rq *rq = this_rq_lock();
4036 4060
4037 schedstat_inc(rq, yld_count); 4061 schedstat_inc(rq, yld_count);
4038 current->sched_class->yield_task(rq); 4062 current->sched_class->yield_task(rq);
4039 4063
4040 /* 4064 /*
4041 * Since we are going to call schedule() anyway, there's 4065 * Since we are going to call schedule() anyway, there's
4042 * no need to preempt or enable interrupts: 4066 * no need to preempt or enable interrupts:
4043 */ 4067 */
4044 __release(rq->lock); 4068 __release(rq->lock);
4045 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 4069 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4046 do_raw_spin_unlock(&rq->lock); 4070 do_raw_spin_unlock(&rq->lock);
4047 sched_preempt_enable_no_resched(); 4071 sched_preempt_enable_no_resched();
4048 4072
4049 schedule(); 4073 schedule();
4050 4074
4051 return 0; 4075 return 0;
4052 } 4076 }
4053 4077
4054 static void __cond_resched(void) 4078 static void __cond_resched(void)
4055 { 4079 {
4056 __preempt_count_add(PREEMPT_ACTIVE); 4080 __preempt_count_add(PREEMPT_ACTIVE);
4057 __schedule(); 4081 __schedule();
4058 __preempt_count_sub(PREEMPT_ACTIVE); 4082 __preempt_count_sub(PREEMPT_ACTIVE);
4059 } 4083 }
4060 4084
4061 int __sched _cond_resched(void) 4085 int __sched _cond_resched(void)
4062 { 4086 {
4063 if (should_resched()) { 4087 if (should_resched()) {
4064 __cond_resched(); 4088 __cond_resched();
4065 return 1; 4089 return 1;
4066 } 4090 }
4067 return 0; 4091 return 0;
4068 } 4092 }
4069 EXPORT_SYMBOL(_cond_resched); 4093 EXPORT_SYMBOL(_cond_resched);
4070 4094
4071 /* 4095 /*
4072 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 4096 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
4073 * call schedule, and on return reacquire the lock. 4097 * call schedule, and on return reacquire the lock.
4074 * 4098 *
4075 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 4099 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4076 * operations here to prevent schedule() from being called twice (once via 4100 * operations here to prevent schedule() from being called twice (once via
4077 * spin_unlock(), once by hand). 4101 * spin_unlock(), once by hand).
4078 */ 4102 */
4079 int __cond_resched_lock(spinlock_t *lock) 4103 int __cond_resched_lock(spinlock_t *lock)
4080 { 4104 {
4081 int resched = should_resched(); 4105 int resched = should_resched();
4082 int ret = 0; 4106 int ret = 0;
4083 4107
4084 lockdep_assert_held(lock); 4108 lockdep_assert_held(lock);
4085 4109
4086 if (spin_needbreak(lock) || resched) { 4110 if (spin_needbreak(lock) || resched) {
4087 spin_unlock(lock); 4111 spin_unlock(lock);
4088 if (resched) 4112 if (resched)
4089 __cond_resched(); 4113 __cond_resched();
4090 else 4114 else
4091 cpu_relax(); 4115 cpu_relax();
4092 ret = 1; 4116 ret = 1;
4093 spin_lock(lock); 4117 spin_lock(lock);
4094 } 4118 }
4095 return ret; 4119 return ret;
4096 } 4120 }
4097 EXPORT_SYMBOL(__cond_resched_lock); 4121 EXPORT_SYMBOL(__cond_resched_lock);
4098 4122
4099 int __sched __cond_resched_softirq(void) 4123 int __sched __cond_resched_softirq(void)
4100 { 4124 {
4101 BUG_ON(!in_softirq()); 4125 BUG_ON(!in_softirq());
4102 4126
4103 if (should_resched()) { 4127 if (should_resched()) {
4104 local_bh_enable(); 4128 local_bh_enable();
4105 __cond_resched(); 4129 __cond_resched();
4106 local_bh_disable(); 4130 local_bh_disable();
4107 return 1; 4131 return 1;
4108 } 4132 }
4109 return 0; 4133 return 0;
4110 } 4134 }
4111 EXPORT_SYMBOL(__cond_resched_softirq); 4135 EXPORT_SYMBOL(__cond_resched_softirq);
4112 4136
4113 /** 4137 /**
4114 * yield - yield the current processor to other threads. 4138 * yield - yield the current processor to other threads.
4115 * 4139 *
4116 * Do not ever use this function, there's a 99% chance you're doing it wrong. 4140 * Do not ever use this function, there's a 99% chance you're doing it wrong.
4117 * 4141 *
4118 * The scheduler is at all times free to pick the calling task as the most 4142 * The scheduler is at all times free to pick the calling task as the most
4119 * eligible task to run, if removing the yield() call from your code breaks 4143 * eligible task to run, if removing the yield() call from your code breaks
4120 * it, it's already broken. 4144 * it, it's already broken.
4121 * 4145 *
4122 * Typical broken usage is: 4146 * Typical broken usage is:
4123 * 4147 *
4124 * while (!event) 4148 * while (!event)
4125 * yield(); 4149 * yield();
4126 * 4150 *
4127 * where one assumes that yield() will let 'the other' process run that will 4151 * where one assumes that yield() will let 'the other' process run that will
4128 * make event true. If the current task is a SCHED_FIFO task, that will never 4152 * make event true. If the current task is a SCHED_FIFO task, that will never
4129 * happen. Never use yield() as a progress guarantee!! 4153 * happen. Never use yield() as a progress guarantee!!
4130 * 4154 *
4131 * If you want to use yield() to wait for something, use wait_event(). 4155 * If you want to use yield() to wait for something, use wait_event().
4132 * If you want to use yield() to be 'nice' for others, use cond_resched(). 4156 * If you want to use yield() to be 'nice' for others, use cond_resched().
4133 * If you still want to use yield(), do not! 4157 * If you still want to use yield(), do not!
4134 */ 4158 */
4135 void __sched yield(void) 4159 void __sched yield(void)
4136 { 4160 {
4137 set_current_state(TASK_RUNNING); 4161 set_current_state(TASK_RUNNING);
4138 sys_sched_yield(); 4162 sys_sched_yield();
4139 } 4163 }
4140 EXPORT_SYMBOL(yield); 4164 EXPORT_SYMBOL(yield);
4141 4165
4142 /** 4166 /**
4143 * yield_to - yield the current processor to another thread in 4167 * yield_to - yield the current processor to another thread in
4144 * your thread group, or accelerate that thread toward the 4168 * your thread group, or accelerate that thread toward the
4145 * processor it's on. 4169 * processor it's on.
4146 * @p: target task 4170 * @p: target task
4147 * @preempt: whether task preemption is allowed or not 4171 * @preempt: whether task preemption is allowed or not
4148 * 4172 *
4149 * It's the caller's job to ensure that the target task struct 4173 * It's the caller's job to ensure that the target task struct
4150 * can't go away on us before we can do any checks. 4174 * can't go away on us before we can do any checks.
4151 * 4175 *
4152 * Return: 4176 * Return:
4153 * true (>0) if we indeed boosted the target task. 4177 * true (>0) if we indeed boosted the target task.
4154 * false (0) if we failed to boost the target. 4178 * false (0) if we failed to boost the target.
4155 * -ESRCH if there's no task to yield to. 4179 * -ESRCH if there's no task to yield to.
4156 */ 4180 */
4157 bool __sched yield_to(struct task_struct *p, bool preempt) 4181 bool __sched yield_to(struct task_struct *p, bool preempt)
4158 { 4182 {
4159 struct task_struct *curr = current; 4183 struct task_struct *curr = current;
4160 struct rq *rq, *p_rq; 4184 struct rq *rq, *p_rq;
4161 unsigned long flags; 4185 unsigned long flags;
4162 int yielded = 0; 4186 int yielded = 0;
4163 4187
4164 local_irq_save(flags); 4188 local_irq_save(flags);
4165 rq = this_rq(); 4189 rq = this_rq();
4166 4190
4167 again: 4191 again:
4168 p_rq = task_rq(p); 4192 p_rq = task_rq(p);
4169 /* 4193 /*
4170 * If we're the only runnable task on the rq and target rq also 4194 * If we're the only runnable task on the rq and target rq also
4171 * has only one task, there's absolutely no point in yielding. 4195 * has only one task, there's absolutely no point in yielding.
4172 */ 4196 */
4173 if (rq->nr_running == 1 && p_rq->nr_running == 1) { 4197 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
4174 yielded = -ESRCH; 4198 yielded = -ESRCH;
4175 goto out_irq; 4199 goto out_irq;
4176 } 4200 }
4177 4201
4178 double_rq_lock(rq, p_rq); 4202 double_rq_lock(rq, p_rq);
4179 if (task_rq(p) != p_rq) { 4203 if (task_rq(p) != p_rq) {
4180 double_rq_unlock(rq, p_rq); 4204 double_rq_unlock(rq, p_rq);
4181 goto again; 4205 goto again;
4182 } 4206 }
4183 4207
4184 if (!curr->sched_class->yield_to_task) 4208 if (!curr->sched_class->yield_to_task)
4185 goto out_unlock; 4209 goto out_unlock;
4186 4210
4187 if (curr->sched_class != p->sched_class) 4211 if (curr->sched_class != p->sched_class)
4188 goto out_unlock; 4212 goto out_unlock;
4189 4213
4190 if (task_running(p_rq, p) || p->state) 4214 if (task_running(p_rq, p) || p->state)
4191 goto out_unlock; 4215 goto out_unlock;
4192 4216
4193 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 4217 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4194 if (yielded) { 4218 if (yielded) {
4195 schedstat_inc(rq, yld_count); 4219 schedstat_inc(rq, yld_count);
4196 /* 4220 /*
4197 * Make p's CPU reschedule; pick_next_entity takes care of 4221 * Make p's CPU reschedule; pick_next_entity takes care of
4198 * fairness. 4222 * fairness.
4199 */ 4223 */
4200 if (preempt && rq != p_rq) 4224 if (preempt && rq != p_rq)
4201 resched_task(p_rq->curr); 4225 resched_task(p_rq->curr);
4202 } 4226 }
4203 4227
4204 out_unlock: 4228 out_unlock:
4205 double_rq_unlock(rq, p_rq); 4229 double_rq_unlock(rq, p_rq);
4206 out_irq: 4230 out_irq:
4207 local_irq_restore(flags); 4231 local_irq_restore(flags);
4208 4232
4209 if (yielded > 0) 4233 if (yielded > 0)
4210 schedule(); 4234 schedule();
4211 4235
4212 return yielded; 4236 return yielded;
4213 } 4237 }
4214 EXPORT_SYMBOL_GPL(yield_to); 4238 EXPORT_SYMBOL_GPL(yield_to);
4215 4239
4216 /* 4240 /*
4217 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 4241 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4218 * that process accounting knows that this is a task in IO wait state. 4242 * that process accounting knows that this is a task in IO wait state.
4219 */ 4243 */
4220 void __sched io_schedule(void) 4244 void __sched io_schedule(void)
4221 { 4245 {
4222 struct rq *rq = raw_rq(); 4246 struct rq *rq = raw_rq();
4223 4247
4224 delayacct_blkio_start(); 4248 delayacct_blkio_start();
4225 atomic_inc(&rq->nr_iowait); 4249 atomic_inc(&rq->nr_iowait);
4226 blk_flush_plug(current); 4250 blk_flush_plug(current);
4227 current->in_iowait = 1; 4251 current->in_iowait = 1;
4228 schedule(); 4252 schedule();
4229 current->in_iowait = 0; 4253 current->in_iowait = 0;
4230 atomic_dec(&rq->nr_iowait); 4254 atomic_dec(&rq->nr_iowait);
4231 delayacct_blkio_end(); 4255 delayacct_blkio_end();
4232 } 4256 }
4233 EXPORT_SYMBOL(io_schedule); 4257 EXPORT_SYMBOL(io_schedule);
4234 4258
4235 long __sched io_schedule_timeout(long timeout) 4259 long __sched io_schedule_timeout(long timeout)
4236 { 4260 {
4237 struct rq *rq = raw_rq(); 4261 struct rq *rq = raw_rq();
4238 long ret; 4262 long ret;
4239 4263
4240 delayacct_blkio_start(); 4264 delayacct_blkio_start();
4241 atomic_inc(&rq->nr_iowait); 4265 atomic_inc(&rq->nr_iowait);
4242 blk_flush_plug(current); 4266 blk_flush_plug(current);
4243 current->in_iowait = 1; 4267 current->in_iowait = 1;
4244 ret = schedule_timeout(timeout); 4268 ret = schedule_timeout(timeout);
4245 current->in_iowait = 0; 4269 current->in_iowait = 0;
4246 atomic_dec(&rq->nr_iowait); 4270 atomic_dec(&rq->nr_iowait);
4247 delayacct_blkio_end(); 4271 delayacct_blkio_end();
4248 return ret; 4272 return ret;
4249 } 4273 }
4250 4274
4251 /** 4275 /**
4252 * sys_sched_get_priority_max - return maximum RT priority. 4276 * sys_sched_get_priority_max - return maximum RT priority.
4253 * @policy: scheduling class. 4277 * @policy: scheduling class.
4254 * 4278 *
4255 * Return: On success, this syscall returns the maximum 4279 * Return: On success, this syscall returns the maximum
4256 * rt_priority that can be used by a given scheduling class. 4280 * rt_priority that can be used by a given scheduling class.
4257 * On failure, a negative error code is returned. 4281 * On failure, a negative error code is returned.
4258 */ 4282 */
4259 SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 4283 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4260 { 4284 {
4261 int ret = -EINVAL; 4285 int ret = -EINVAL;
4262 4286
4263 switch (policy) { 4287 switch (policy) {
4264 case SCHED_FIFO: 4288 case SCHED_FIFO:
4265 case SCHED_RR: 4289 case SCHED_RR:
4266 ret = MAX_USER_RT_PRIO-1; 4290 ret = MAX_USER_RT_PRIO-1;
4267 break; 4291 break;
4268 case SCHED_DEADLINE: 4292 case SCHED_DEADLINE:
4269 case SCHED_NORMAL: 4293 case SCHED_NORMAL:
4270 case SCHED_BATCH: 4294 case SCHED_BATCH:
4271 case SCHED_IDLE: 4295 case SCHED_IDLE:
4272 ret = 0; 4296 ret = 0;
4273 break; 4297 break;
4274 } 4298 }
4275 return ret; 4299 return ret;
4276 } 4300 }
4277 4301
4278 /** 4302 /**
4279 * sys_sched_get_priority_min - return minimum RT priority. 4303 * sys_sched_get_priority_min - return minimum RT priority.
4280 * @policy: scheduling class. 4304 * @policy: scheduling class.
4281 * 4305 *
4282 * Return: On success, this syscall returns the minimum 4306 * Return: On success, this syscall returns the minimum
4283 * rt_priority that can be used by a given scheduling class. 4307 * rt_priority that can be used by a given scheduling class.
4284 * On failure, a negative error code is returned. 4308 * On failure, a negative error code is returned.
4285 */ 4309 */
4286 SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 4310 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
4287 { 4311 {
4288 int ret = -EINVAL; 4312 int ret = -EINVAL;
4289 4313
4290 switch (policy) { 4314 switch (policy) {
4291 case SCHED_FIFO: 4315 case SCHED_FIFO:
4292 case SCHED_RR: 4316 case SCHED_RR:
4293 ret = 1; 4317 ret = 1;
4294 break; 4318 break;
4295 case SCHED_DEADLINE: 4319 case SCHED_DEADLINE:
4296 case SCHED_NORMAL: 4320 case SCHED_NORMAL:
4297 case SCHED_BATCH: 4321 case SCHED_BATCH:
4298 case SCHED_IDLE: 4322 case SCHED_IDLE:
4299 ret = 0; 4323 ret = 0;
4300 } 4324 }
4301 return ret; 4325 return ret;
4302 } 4326 }
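/*
 * A userspace sketch querying the valid static priority range for SCHED_FIFO
 * through the glibc wrappers. Per the two switch statements above this
 * yields 1..MAX_USER_RT_PRIO-1, i.e. 1..99 on a stock build.
 */
#include <sched.h>
#include <stdio.h>

static void example_print_fifo_range(void)
{
	int lo = sched_get_priority_min(SCHED_FIFO);
	int hi = sched_get_priority_max(SCHED_FIFO);

	printf("SCHED_FIFO priority range: %d..%d\n", lo, hi);
}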
4303 4327
4304 /** 4328 /**
4305 * sys_sched_rr_get_interval - return the default timeslice of a process. 4329 * sys_sched_rr_get_interval - return the default timeslice of a process.
4306 * @pid: pid of the process. 4330 * @pid: pid of the process.
4307 * @interval: userspace pointer to the timeslice value. 4331 * @interval: userspace pointer to the timeslice value.
4308 * 4332 *
4309 * This syscall writes the default timeslice value of a given process 4333 * This syscall writes the default timeslice value of a given process
4310 * into the user-space timespec buffer. A value of '0' means infinity. 4334 * into the user-space timespec buffer. A value of '0' means infinity.
4311 * 4335 *
4312 * Return: On success, 0 and the timeslice is in @interval. Otherwise, 4336 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
4313 * an error code. 4337 * an error code.
4314 */ 4338 */
4315 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 4339 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
4316 struct timespec __user *, interval) 4340 struct timespec __user *, interval)
4317 { 4341 {
4318 struct task_struct *p; 4342 struct task_struct *p;
4319 unsigned int time_slice; 4343 unsigned int time_slice;
4320 unsigned long flags; 4344 unsigned long flags;
4321 struct rq *rq; 4345 struct rq *rq;
4322 int retval; 4346 int retval;
4323 struct timespec t; 4347 struct timespec t;
4324 4348
4325 if (pid < 0) 4349 if (pid < 0)
4326 return -EINVAL; 4350 return -EINVAL;
4327 4351
4328 retval = -ESRCH; 4352 retval = -ESRCH;
4329 rcu_read_lock(); 4353 rcu_read_lock();
4330 p = find_process_by_pid(pid); 4354 p = find_process_by_pid(pid);
4331 if (!p) 4355 if (!p)
4332 goto out_unlock; 4356 goto out_unlock;
4333 4357
4334 retval = security_task_getscheduler(p); 4358 retval = security_task_getscheduler(p);
4335 if (retval) 4359 if (retval)
4336 goto out_unlock; 4360 goto out_unlock;
4337 4361
4338 rq = task_rq_lock(p, &flags); 4362 rq = task_rq_lock(p, &flags);
4339 time_slice = 0; 4363 time_slice = 0;
4340 if (p->sched_class->get_rr_interval) 4364 if (p->sched_class->get_rr_interval)
4341 time_slice = p->sched_class->get_rr_interval(rq, p); 4365 time_slice = p->sched_class->get_rr_interval(rq, p);
4342 task_rq_unlock(rq, p, &flags); 4366 task_rq_unlock(rq, p, &flags);
4343 4367
4344 rcu_read_unlock(); 4368 rcu_read_unlock();
4345 jiffies_to_timespec(time_slice, &t); 4369 jiffies_to_timespec(time_slice, &t);
4346 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4370 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4347 return retval; 4371 return retval;
4348 4372
4349 out_unlock: 4373 out_unlock:
4350 rcu_read_unlock(); 4374 rcu_read_unlock();
4351 return retval; 4375 return retval;
4352 } 4376 }
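/*
 * A userspace sketch for sched_rr_get_interval(2), using the glibc wrapper
 * to fetch the round-robin timeslice of the calling thread. As the
 * kernel-doc above notes, a result of 0 means "infinity" (no timeslice).
 */
#include <sched.h>
#include <time.h>

static long long example_rr_timeslice_ns(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts))	/* pid 0: calling thread */
		return -1;

	return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}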
4353 4377
4354 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 4378 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
4355 4379
4356 void sched_show_task(struct task_struct *p) 4380 void sched_show_task(struct task_struct *p)
4357 { 4381 {
4358 unsigned long free = 0; 4382 unsigned long free = 0;
4359 int ppid; 4383 int ppid;
4360 unsigned state; 4384 unsigned state;
4361 4385
4362 state = p->state ? __ffs(p->state) + 1 : 0; 4386 state = p->state ? __ffs(p->state) + 1 : 0;
4363 printk(KERN_INFO "%-15.15s %c", p->comm, 4387 printk(KERN_INFO "%-15.15s %c", p->comm,
4364 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4388 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4365 #if BITS_PER_LONG == 32 4389 #if BITS_PER_LONG == 32
4366 if (state == TASK_RUNNING) 4390 if (state == TASK_RUNNING)
4367 printk(KERN_CONT " running "); 4391 printk(KERN_CONT " running ");
4368 else 4392 else
4369 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 4393 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
4370 #else 4394 #else
4371 if (state == TASK_RUNNING) 4395 if (state == TASK_RUNNING)
4372 printk(KERN_CONT " running task "); 4396 printk(KERN_CONT " running task ");
4373 else 4397 else
4374 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 4398 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
4375 #endif 4399 #endif
4376 #ifdef CONFIG_DEBUG_STACK_USAGE 4400 #ifdef CONFIG_DEBUG_STACK_USAGE
4377 free = stack_not_used(p); 4401 free = stack_not_used(p);
4378 #endif 4402 #endif
4379 rcu_read_lock(); 4403 rcu_read_lock();
4380 ppid = task_pid_nr(rcu_dereference(p->real_parent)); 4404 ppid = task_pid_nr(rcu_dereference(p->real_parent));
4381 rcu_read_unlock(); 4405 rcu_read_unlock();
4382 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4406 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
4383 task_pid_nr(p), ppid, 4407 task_pid_nr(p), ppid,
4384 (unsigned long)task_thread_info(p)->flags); 4408 (unsigned long)task_thread_info(p)->flags);
4385 4409
4386 print_worker_info(KERN_INFO, p); 4410 print_worker_info(KERN_INFO, p);
4387 show_stack(p, NULL); 4411 show_stack(p, NULL);
4388 } 4412 }
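/*
 * Editor's note: given the format strings above, one dumped line on a 64-bit
 * kernel looks roughly like the made-up example below: comm, state character,
 * saved PC (or "running task"), unused stack bytes, pid, ppid and the
 * thread-info flags:
 *
 *   kworker/u8:2    D ffffffff8160b1b0     0   112      2 0x00000000
 */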
4389 4413
4390 void show_state_filter(unsigned long state_filter) 4414 void show_state_filter(unsigned long state_filter)
4391 { 4415 {
4392 struct task_struct *g, *p; 4416 struct task_struct *g, *p;
4393 4417
4394 #if BITS_PER_LONG == 32 4418 #if BITS_PER_LONG == 32
4395 printk(KERN_INFO 4419 printk(KERN_INFO
4396 " task PC stack pid father\n"); 4420 " task PC stack pid father\n");
4397 #else 4421 #else
4398 printk(KERN_INFO 4422 printk(KERN_INFO
4399 " task PC stack pid father\n"); 4423 " task PC stack pid father\n");
4400 #endif 4424 #endif
4401 rcu_read_lock(); 4425 rcu_read_lock();
4402 do_each_thread(g, p) { 4426 do_each_thread(g, p) {
4403 /* 4427 /*
4404 * reset the NMI-timeout, listing all tasks on a slow 4428 * reset the NMI-timeout, listing all tasks on a slow

4405 * console might take a lot of time: 4429 * console might take a lot of time:
4406 */ 4430 */
4407 touch_nmi_watchdog(); 4431 touch_nmi_watchdog();
4408 if (!state_filter || (p->state & state_filter)) 4432 if (!state_filter || (p->state & state_filter))
4409 sched_show_task(p); 4433 sched_show_task(p);
4410 } while_each_thread(g, p); 4434 } while_each_thread(g, p);
4411 4435
4412 touch_all_softlockup_watchdogs(); 4436 touch_all_softlockup_watchdogs();
4413 4437
4414 #ifdef CONFIG_SCHED_DEBUG 4438 #ifdef CONFIG_SCHED_DEBUG
4415 sysrq_sched_debug_show(); 4439 sysrq_sched_debug_show();
4416 #endif 4440 #endif
4417 rcu_read_unlock(); 4441 rcu_read_unlock();
4418 /* 4442 /*
4419 * Only show locks if all tasks are dumped: 4443 * Only show locks if all tasks are dumped:
4420 */ 4444 */
4421 if (!state_filter) 4445 if (!state_filter)
4422 debug_show_all_locks(); 4446 debug_show_all_locks();
4423 } 4447 }
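/*
 * Editor's note: a minimal, hedged sketch of how a sysrq-style caller might
 * use the helper above; "example_dump_tasks" is a made-up name. Passing 0
 * dumps every task (and, per the code above, all held locks), while a state
 * mask such as TASK_UNINTERRUPTIBLE restricts the dump to blocked tasks.
 */
static void __maybe_unused example_dump_tasks(bool only_blocked)
{
	if (only_blocked)
		show_state_filter(TASK_UNINTERRUPTIBLE);
	else
		show_state_filter(0);
}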
4424 4448
4425 void init_idle_bootup_task(struct task_struct *idle) 4449 void init_idle_bootup_task(struct task_struct *idle)
4426 { 4450 {
4427 idle->sched_class = &idle_sched_class; 4451 idle->sched_class = &idle_sched_class;
4428 } 4452 }
4429 4453
4430 /** 4454 /**
4431 * init_idle - set up an idle thread for a given CPU 4455 * init_idle - set up an idle thread for a given CPU
4432 * @idle: task in question 4456 * @idle: task in question
4433 * @cpu: cpu the idle task belongs to 4457 * @cpu: cpu the idle task belongs to
4434 * 4458 *
4435 * NOTE: this function does not set the idle thread's NEED_RESCHED 4459 * NOTE: this function does not set the idle thread's NEED_RESCHED
4436 * flag, to make booting more robust. 4460 * flag, to make booting more robust.
4437 */ 4461 */
4438 void init_idle(struct task_struct *idle, int cpu) 4462 void init_idle(struct task_struct *idle, int cpu)
4439 { 4463 {
4440 struct rq *rq = cpu_rq(cpu); 4464 struct rq *rq = cpu_rq(cpu);
4441 unsigned long flags; 4465 unsigned long flags;
4442 4466
4443 raw_spin_lock_irqsave(&rq->lock, flags); 4467 raw_spin_lock_irqsave(&rq->lock, flags);
4444 4468
4445 __sched_fork(0, idle); 4469 __sched_fork(0, idle);
4446 idle->state = TASK_RUNNING; 4470 idle->state = TASK_RUNNING;
4447 idle->se.exec_start = sched_clock(); 4471 idle->se.exec_start = sched_clock();
4448 4472
4449 do_set_cpus_allowed(idle, cpumask_of(cpu)); 4473 do_set_cpus_allowed(idle, cpumask_of(cpu));
4450 /* 4474 /*
4451 * We're having a chicken-and-egg problem: even though we are 4475 * We're having a chicken-and-egg problem: even though we are
4452 * holding rq->lock, the cpu isn't yet set to this cpu, so the 4476 * holding rq->lock, the cpu isn't yet set to this cpu, so the
4453 * lockdep check in task_group() will fail. 4477 * lockdep check in task_group() will fail.
4454 * 4478 *
4455 * Similar case to sched_fork(). / Alternatively we could 4479 * Similar case to sched_fork(). / Alternatively we could
4456 * use task_rq_lock() here and obtain the other rq->lock. 4480 * use task_rq_lock() here and obtain the other rq->lock.
4457 * 4481 *
4458 * Silence PROVE_RCU 4482 * Silence PROVE_RCU
4459 */ 4483 */
4460 rcu_read_lock(); 4484 rcu_read_lock();
4461 __set_task_cpu(idle, cpu); 4485 __set_task_cpu(idle, cpu);
4462 rcu_read_unlock(); 4486 rcu_read_unlock();
4463 4487
4464 rq->curr = rq->idle = idle; 4488 rq->curr = rq->idle = idle;
4465 idle->on_rq = 1; 4489 idle->on_rq = 1;
4466 #if defined(CONFIG_SMP) 4490 #if defined(CONFIG_SMP)
4467 idle->on_cpu = 1; 4491 idle->on_cpu = 1;
4468 #endif 4492 #endif
4469 raw_spin_unlock_irqrestore(&rq->lock, flags); 4493 raw_spin_unlock_irqrestore(&rq->lock, flags);
4470 4494
4471 /* Set the preempt count _outside_ the spinlocks! */ 4495 /* Set the preempt count _outside_ the spinlocks! */
4472 init_idle_preempt_count(idle, cpu); 4496 init_idle_preempt_count(idle, cpu);
4473 4497
4474 /* 4498 /*
4475 * The idle tasks have their own, simple scheduling class: 4499 * The idle tasks have their own, simple scheduling class:
4476 */ 4500 */
4477 idle->sched_class = &idle_sched_class; 4501 idle->sched_class = &idle_sched_class;
4478 ftrace_graph_init_idle_task(idle, cpu); 4502 ftrace_graph_init_idle_task(idle, cpu);
4479 vtime_init_idle(idle, cpu); 4503 vtime_init_idle(idle, cpu);
4480 #if defined(CONFIG_SMP) 4504 #if defined(CONFIG_SMP)
4481 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); 4505 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
4482 #endif 4506 #endif
4483 } 4507 }
4484 4508
4485 #ifdef CONFIG_SMP 4509 #ifdef CONFIG_SMP
4486 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4510 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4487 { 4511 {
4488 if (p->sched_class && p->sched_class->set_cpus_allowed) 4512 if (p->sched_class && p->sched_class->set_cpus_allowed)
4489 p->sched_class->set_cpus_allowed(p, new_mask); 4513 p->sched_class->set_cpus_allowed(p, new_mask);
4490 4514
4491 cpumask_copy(&p->cpus_allowed, new_mask); 4515 cpumask_copy(&p->cpus_allowed, new_mask);
4492 p->nr_cpus_allowed = cpumask_weight(new_mask); 4516 p->nr_cpus_allowed = cpumask_weight(new_mask);
4493 } 4517 }
4494 4518
4495 /* 4519 /*
4496 * This is how migration works: 4520 * This is how migration works:
4497 * 4521 *
4498 * 1) we invoke migration_cpu_stop() on the target CPU using 4522 * 1) we invoke migration_cpu_stop() on the target CPU using
4499 * stop_one_cpu(). 4523 * stop_one_cpu().
4500 * 2) stopper starts to run (implicitly forcing the migrated thread 4524 * 2) stopper starts to run (implicitly forcing the migrated thread
4501 * off the CPU) 4525 * off the CPU)
4502 * 3) it checks whether the migrated task is still in the wrong runqueue. 4526 * 3) it checks whether the migrated task is still in the wrong runqueue.
4503 * 4) if it's in the wrong runqueue then the migration thread removes 4527 * 4) if it's in the wrong runqueue then the migration thread removes
4504 * it and puts it into the right queue. 4528 * it and puts it into the right queue.
4505 * 5) stopper completes and stop_one_cpu() returns and the migration 4529 * 5) stopper completes and stop_one_cpu() returns and the migration
4506 * is done. 4530 * is done.
4507 */ 4531 */
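/*
 * Editor's note: a condensed, illustrative rendering of the five steps above
 * using the helpers defined later in this file; it mirrors what
 * set_cpus_allowed_ptr() below does for a task that is on a runqueue.
 * "example_force_migrate" is a made-up name.
 */
static int __maybe_unused example_force_migrate(struct task_struct *p,
						int dest_cpu)
{
	struct migration_arg arg = { p, dest_cpu };

	/* Steps 1-2: run migration_cpu_stop() on the task's current CPU. */
	return stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
	/* Steps 3-5 happen inside migration_cpu_stop()/__migrate_task(). */
}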
4508 4532
4509 /* 4533 /*
4510 * Change a given task's CPU affinity. Migrate the thread to a 4534 * Change a given task's CPU affinity. Migrate the thread to a
4511 * proper CPU and schedule it away if the CPU it's executing on 4535 * proper CPU and schedule it away if the CPU it's executing on
4512 * is removed from the allowed bitmask. 4536 * is removed from the allowed bitmask.
4513 * 4537 *
4514 * NOTE: the caller must have a valid reference to the task, the 4538 * NOTE: the caller must have a valid reference to the task, the
4515 * task must not exit() & deallocate itself prematurely. The 4539 * task must not exit() & deallocate itself prematurely. The
4516 * call is not atomic; no spinlocks may be held. 4540 * call is not atomic; no spinlocks may be held.
4517 */ 4541 */
4518 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 4542 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4519 { 4543 {
4520 unsigned long flags; 4544 unsigned long flags;
4521 struct rq *rq; 4545 struct rq *rq;
4522 unsigned int dest_cpu; 4546 unsigned int dest_cpu;
4523 int ret = 0; 4547 int ret = 0;
4524 4548
4525 rq = task_rq_lock(p, &flags); 4549 rq = task_rq_lock(p, &flags);
4526 4550
4527 if (cpumask_equal(&p->cpus_allowed, new_mask)) 4551 if (cpumask_equal(&p->cpus_allowed, new_mask))
4528 goto out; 4552 goto out;
4529 4553
4530 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 4554 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
4531 ret = -EINVAL; 4555 ret = -EINVAL;
4532 goto out; 4556 goto out;
4533 } 4557 }
4534 4558
4535 do_set_cpus_allowed(p, new_mask); 4559 do_set_cpus_allowed(p, new_mask);
4536 4560
4537 /* Can the task run on the task's current CPU? If so, we're done */ 4561 /* Can the task run on the task's current CPU? If so, we're done */
4538 if (cpumask_test_cpu(task_cpu(p), new_mask)) 4562 if (cpumask_test_cpu(task_cpu(p), new_mask))
4539 goto out; 4563 goto out;
4540 4564
4541 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 4565 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
4542 if (p->on_rq) { 4566 if (p->on_rq) {
4543 struct migration_arg arg = { p, dest_cpu }; 4567 struct migration_arg arg = { p, dest_cpu };
4544 /* Need help from migration thread: drop lock and wait. */ 4568 /* Need help from migration thread: drop lock and wait. */
4545 task_rq_unlock(rq, p, &flags); 4569 task_rq_unlock(rq, p, &flags);
4546 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 4570 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
4547 tlb_migrate_finish(p->mm); 4571 tlb_migrate_finish(p->mm);
4548 return 0; 4572 return 0;
4549 } 4573 }
4550 out: 4574 out:
4551 task_rq_unlock(rq, p, &flags); 4575 task_rq_unlock(rq, p, &flags);
4552 4576
4553 return ret; 4577 return ret;
4554 } 4578 }
4555 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 4579 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
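/*
 * Editor's note: a hedged usage sketch (not from this file) of the exported
 * API above. A kernel thread restricting itself to a single CPU would do
 * something like the following; "my_pinned_worker" is a made-up name.
 */
static int my_pinned_worker(void *data)
{
	int cpu = (long)data;

	/* Fails with -EINVAL if the requested CPU is not active. */
	if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
		pr_warn("could not pin worker to CPU%d\n", cpu);

	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}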
4556 4580
4557 /* 4581 /*
4558 * Move (not current) task off this cpu, onto dest cpu. We're doing 4582 * Move (not current) task off this cpu, onto dest cpu. We're doing
4559 * this because either it can't run here any more (set_cpus_allowed() 4583 * this because either it can't run here any more (set_cpus_allowed()
4560 * away from this CPU, or CPU going down), or because we're 4584 * away from this CPU, or CPU going down), or because we're
4561 * attempting to rebalance this task on exec (sched_exec). 4585 * attempting to rebalance this task on exec (sched_exec).
4562 * 4586 *
4563 * So we race with normal scheduler movements, but that's OK, as long 4587 * So we race with normal scheduler movements, but that's OK, as long
4564 * as the task is no longer on this CPU. 4588 * as the task is no longer on this CPU.
4565 * 4589 *
4566 * Returns non-zero if task was successfully migrated. 4590 * Returns non-zero if task was successfully migrated.
4567 */ 4591 */
4568 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4592 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4569 { 4593 {
4570 struct rq *rq_dest, *rq_src; 4594 struct rq *rq_dest, *rq_src;
4571 int ret = 0; 4595 int ret = 0;
4572 4596
4573 if (unlikely(!cpu_active(dest_cpu))) 4597 if (unlikely(!cpu_active(dest_cpu)))
4574 return ret; 4598 return ret;
4575 4599
4576 rq_src = cpu_rq(src_cpu); 4600 rq_src = cpu_rq(src_cpu);
4577 rq_dest = cpu_rq(dest_cpu); 4601 rq_dest = cpu_rq(dest_cpu);
4578 4602
4579 raw_spin_lock(&p->pi_lock); 4603 raw_spin_lock(&p->pi_lock);
4580 double_rq_lock(rq_src, rq_dest); 4604 double_rq_lock(rq_src, rq_dest);
4581 /* Already moved. */ 4605 /* Already moved. */
4582 if (task_cpu(p) != src_cpu) 4606 if (task_cpu(p) != src_cpu)
4583 goto done; 4607 goto done;
4584 /* Affinity changed (again). */ 4608 /* Affinity changed (again). */
4585 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 4609 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
4586 goto fail; 4610 goto fail;
4587 4611
4588 /* 4612 /*
4589 * If we're not on a rq, the next wake-up will ensure we're 4613 * If we're not on a rq, the next wake-up will ensure we're
4590 * placed properly. 4614 * placed properly.
4591 */ 4615 */
4592 if (p->on_rq) { 4616 if (p->on_rq) {
4593 dequeue_task(rq_src, p, 0); 4617 dequeue_task(rq_src, p, 0);
4594 set_task_cpu(p, dest_cpu); 4618 set_task_cpu(p, dest_cpu);
4595 enqueue_task(rq_dest, p, 0); 4619 enqueue_task(rq_dest, p, 0);
4596 check_preempt_curr(rq_dest, p, 0); 4620 check_preempt_curr(rq_dest, p, 0);
4597 } 4621 }
4598 done: 4622 done:
4599 ret = 1; 4623 ret = 1;
4600 fail: 4624 fail:
4601 double_rq_unlock(rq_src, rq_dest); 4625 double_rq_unlock(rq_src, rq_dest);
4602 raw_spin_unlock(&p->pi_lock); 4626 raw_spin_unlock(&p->pi_lock);
4603 return ret; 4627 return ret;
4604 } 4628 }
4605 4629
4606 #ifdef CONFIG_NUMA_BALANCING 4630 #ifdef CONFIG_NUMA_BALANCING
4607 /* Migrate current task p to target_cpu */ 4631 /* Migrate current task p to target_cpu */
4608 int migrate_task_to(struct task_struct *p, int target_cpu) 4632 int migrate_task_to(struct task_struct *p, int target_cpu)
4609 { 4633 {
4610 struct migration_arg arg = { p, target_cpu }; 4634 struct migration_arg arg = { p, target_cpu };
4611 int curr_cpu = task_cpu(p); 4635 int curr_cpu = task_cpu(p);
4612 4636
4613 if (curr_cpu == target_cpu) 4637 if (curr_cpu == target_cpu)
4614 return 0; 4638 return 0;
4615 4639
4616 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) 4640 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
4617 return -EINVAL; 4641 return -EINVAL;
4618 4642
4619 /* TODO: This is not properly updating schedstats */ 4643 /* TODO: This is not properly updating schedstats */
4620 4644
4621 trace_sched_move_numa(p, curr_cpu, target_cpu); 4645 trace_sched_move_numa(p, curr_cpu, target_cpu);
4622 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); 4646 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
4623 } 4647 }
4624 4648
4625 /* 4649 /*
4626 * Requeue a task on a given node and accurately track the number of NUMA 4650 * Requeue a task on a given node and accurately track the number of NUMA
4627 * tasks on the runqueues 4651 * tasks on the runqueues
4628 */ 4652 */
4629 void sched_setnuma(struct task_struct *p, int nid) 4653 void sched_setnuma(struct task_struct *p, int nid)
4630 { 4654 {
4631 struct rq *rq; 4655 struct rq *rq;
4632 unsigned long flags; 4656 unsigned long flags;
4633 bool on_rq, running; 4657 bool on_rq, running;
4634 4658
4635 rq = task_rq_lock(p, &flags); 4659 rq = task_rq_lock(p, &flags);
4636 on_rq = p->on_rq; 4660 on_rq = p->on_rq;
4637 running = task_current(rq, p); 4661 running = task_current(rq, p);
4638 4662
4639 if (on_rq) 4663 if (on_rq)
4640 dequeue_task(rq, p, 0); 4664 dequeue_task(rq, p, 0);
4641 if (running) 4665 if (running)
4642 p->sched_class->put_prev_task(rq, p); 4666 p->sched_class->put_prev_task(rq, p);
4643 4667
4644 p->numa_preferred_nid = nid; 4668 p->numa_preferred_nid = nid;
4645 4669
4646 if (running) 4670 if (running)
4647 p->sched_class->set_curr_task(rq); 4671 p->sched_class->set_curr_task(rq);
4648 if (on_rq) 4672 if (on_rq)
4649 enqueue_task(rq, p, 0); 4673 enqueue_task(rq, p, 0);
4650 task_rq_unlock(rq, p, &flags); 4674 task_rq_unlock(rq, p, &flags);
4651 } 4675 }
4652 #endif 4676 #endif
4653 4677
4654 /* 4678 /*
4655 * migration_cpu_stop - this will be executed by a highprio stopper thread 4679 * migration_cpu_stop - this will be executed by a highprio stopper thread
4656 * and performs thread migration by bumping thread off CPU then 4680 * and performs thread migration by bumping thread off CPU then
4657 * 'pushing' onto another runqueue. 4681 * 'pushing' onto another runqueue.
4658 */ 4682 */
4659 static int migration_cpu_stop(void *data) 4683 static int migration_cpu_stop(void *data)
4660 { 4684 {
4661 struct migration_arg *arg = data; 4685 struct migration_arg *arg = data;
4662 4686
4663 /* 4687 /*
4664 * The original target cpu might have gone down and we might 4688 * The original target cpu might have gone down and we might
4665 * be on another cpu but it doesn't matter. 4689 * be on another cpu but it doesn't matter.
4666 */ 4690 */
4667 local_irq_disable(); 4691 local_irq_disable();
4668 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); 4692 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
4669 local_irq_enable(); 4693 local_irq_enable();
4670 return 0; 4694 return 0;
4671 } 4695 }
4672 4696
4673 #ifdef CONFIG_HOTPLUG_CPU 4697 #ifdef CONFIG_HOTPLUG_CPU
4674 4698
4675 /* 4699 /*
4676 * Ensures that the idle task is using init_mm right before its cpu goes 4700 * Ensures that the idle task is using init_mm right before its cpu goes
4677 * offline. 4701 * offline.
4678 */ 4702 */
4679 void idle_task_exit(void) 4703 void idle_task_exit(void)
4680 { 4704 {
4681 struct mm_struct *mm = current->active_mm; 4705 struct mm_struct *mm = current->active_mm;
4682 4706
4683 BUG_ON(cpu_online(smp_processor_id())); 4707 BUG_ON(cpu_online(smp_processor_id()));
4684 4708
4685 if (mm != &init_mm) { 4709 if (mm != &init_mm) {
4686 switch_mm(mm, &init_mm, current); 4710 switch_mm(mm, &init_mm, current);
4687 finish_arch_post_lock_switch(); 4711 finish_arch_post_lock_switch();
4688 } 4712 }
4689 mmdrop(mm); 4713 mmdrop(mm);
4690 } 4714 }
4691 4715
4692 /* 4716 /*
4693 * Since this CPU is going 'away' for a while, fold any nr_active delta 4717 * Since this CPU is going 'away' for a while, fold any nr_active delta
4694 * we might have. Assumes we're called after migrate_tasks() so that the 4718 * we might have. Assumes we're called after migrate_tasks() so that the
4695 * nr_active count is stable. 4719 * nr_active count is stable.
4696 * 4720 *
4697 * Also see the comment "Global load-average calculations". 4721 * Also see the comment "Global load-average calculations".
4698 */ 4722 */
4699 static void calc_load_migrate(struct rq *rq) 4723 static void calc_load_migrate(struct rq *rq)
4700 { 4724 {
4701 long delta = calc_load_fold_active(rq); 4725 long delta = calc_load_fold_active(rq);
4702 if (delta) 4726 if (delta)
4703 atomic_long_add(delta, &calc_load_tasks); 4727 atomic_long_add(delta, &calc_load_tasks);
4704 } 4728 }
4705 4729
4706 static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) 4730 static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
4707 { 4731 {
4708 } 4732 }
4709 4733
4710 static const struct sched_class fake_sched_class = { 4734 static const struct sched_class fake_sched_class = {
4711 .put_prev_task = put_prev_task_fake, 4735 .put_prev_task = put_prev_task_fake,
4712 }; 4736 };
4713 4737
4714 static struct task_struct fake_task = { 4738 static struct task_struct fake_task = {
4715 /* 4739 /*
4716 * Avoid pull_{rt,dl}_task() 4740 * Avoid pull_{rt,dl}_task()
4717 */ 4741 */
4718 .prio = MAX_PRIO + 1, 4742 .prio = MAX_PRIO + 1,
4719 .sched_class = &fake_sched_class, 4743 .sched_class = &fake_sched_class,
4720 }; 4744 };
4721 4745
4722 /* 4746 /*
4723 * Migrate all tasks from the rq, sleeping tasks will be migrated by 4747 * Migrate all tasks from the rq, sleeping tasks will be migrated by
4724 * try_to_wake_up()->select_task_rq(). 4748 * try_to_wake_up()->select_task_rq().
4725 * 4749 *
4726 * Called with rq->lock held even though we're in stop_machine() and 4750 * Called with rq->lock held even though we're in stop_machine() and
4727 * there's no concurrency possible, we hold the required locks anyway 4751 * there's no concurrency possible, we hold the required locks anyway
4728 * because of lock validation efforts. 4752 * because of lock validation efforts.
4729 */ 4753 */
4730 static void migrate_tasks(unsigned int dead_cpu) 4754 static void migrate_tasks(unsigned int dead_cpu)
4731 { 4755 {
4732 struct rq *rq = cpu_rq(dead_cpu); 4756 struct rq *rq = cpu_rq(dead_cpu);
4733 struct task_struct *next, *stop = rq->stop; 4757 struct task_struct *next, *stop = rq->stop;
4734 int dest_cpu; 4758 int dest_cpu;
4735 4759
4736 /* 4760 /*
4737 * Fudge the rq selection such that the below task selection loop 4761 * Fudge the rq selection such that the below task selection loop
4738 * doesn't get stuck on the currently eligible stop task. 4762 * doesn't get stuck on the currently eligible stop task.
4739 * 4763 *
4740 * We're currently inside stop_machine() and the rq is either stuck 4764 * We're currently inside stop_machine() and the rq is either stuck
4741 * in the stop_machine_cpu_stop() loop, or we're executing this code; 4765 * in the stop_machine_cpu_stop() loop, or we're executing this code;
4742 * either way we should never end up calling schedule() until we're 4766 * either way we should never end up calling schedule() until we're
4743 * done here. 4767 * done here.
4744 */ 4768 */
4745 rq->stop = NULL; 4769 rq->stop = NULL;
4746 4770
4747 /* 4771 /*
4748 * put_prev_task() and pick_next_task() sched 4772 * put_prev_task() and pick_next_task() sched
4749 * class method both need to have an up-to-date 4773 * class method both need to have an up-to-date
4750 * value of rq->clock[_task] 4774 * value of rq->clock[_task]
4751 */ 4775 */
4752 update_rq_clock(rq); 4776 update_rq_clock(rq);
4753 4777
4754 for ( ; ; ) { 4778 for ( ; ; ) {
4755 /* 4779 /*
4756 * There's this thread running, bail when that's the only 4780 * There's this thread running, bail when that's the only
4757 * remaining thread. 4781 * remaining thread.
4758 */ 4782 */
4759 if (rq->nr_running == 1) 4783 if (rq->nr_running == 1)
4760 break; 4784 break;
4761 4785
4762 next = pick_next_task(rq, &fake_task); 4786 next = pick_next_task(rq, &fake_task);
4763 BUG_ON(!next); 4787 BUG_ON(!next);
4764 next->sched_class->put_prev_task(rq, next); 4788 next->sched_class->put_prev_task(rq, next);
4765 4789
4766 /* Find suitable destination for @next, with force if needed. */ 4790 /* Find suitable destination for @next, with force if needed. */
4767 dest_cpu = select_fallback_rq(dead_cpu, next); 4791 dest_cpu = select_fallback_rq(dead_cpu, next);
4768 raw_spin_unlock(&rq->lock); 4792 raw_spin_unlock(&rq->lock);
4769 4793
4770 __migrate_task(next, dead_cpu, dest_cpu); 4794 __migrate_task(next, dead_cpu, dest_cpu);
4771 4795
4772 raw_spin_lock(&rq->lock); 4796 raw_spin_lock(&rq->lock);
4773 } 4797 }
4774 4798
4775 rq->stop = stop; 4799 rq->stop = stop;
4776 } 4800 }
4777 4801
4778 #endif /* CONFIG_HOTPLUG_CPU */ 4802 #endif /* CONFIG_HOTPLUG_CPU */
4779 4803
4780 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 4804 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
4781 4805
4782 static struct ctl_table sd_ctl_dir[] = { 4806 static struct ctl_table sd_ctl_dir[] = {
4783 { 4807 {
4784 .procname = "sched_domain", 4808 .procname = "sched_domain",
4785 .mode = 0555, 4809 .mode = 0555,
4786 }, 4810 },
4787 {} 4811 {}
4788 }; 4812 };
4789 4813
4790 static struct ctl_table sd_ctl_root[] = { 4814 static struct ctl_table sd_ctl_root[] = {
4791 { 4815 {
4792 .procname = "kernel", 4816 .procname = "kernel",
4793 .mode = 0555, 4817 .mode = 0555,
4794 .child = sd_ctl_dir, 4818 .child = sd_ctl_dir,
4795 }, 4819 },
4796 {} 4820 {}
4797 }; 4821 };
4798 4822
4799 static struct ctl_table *sd_alloc_ctl_entry(int n) 4823 static struct ctl_table *sd_alloc_ctl_entry(int n)
4800 { 4824 {
4801 struct ctl_table *entry = 4825 struct ctl_table *entry =
4802 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 4826 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
4803 4827
4804 return entry; 4828 return entry;
4805 } 4829 }
4806 4830
4807 static void sd_free_ctl_entry(struct ctl_table **tablep) 4831 static void sd_free_ctl_entry(struct ctl_table **tablep)
4808 { 4832 {
4809 struct ctl_table *entry; 4833 struct ctl_table *entry;
4810 4834
4811 /* 4835 /*
4812 * In the intermediate directories, both the child directory and 4836 * In the intermediate directories, both the child directory and
4813 * procname are dynamically allocated and could fail but the mode 4837 * procname are dynamically allocated and could fail but the mode
4814 * will always be set. In the lowest directory the names are 4838 * will always be set. In the lowest directory the names are
4815 * static strings and all have proc handlers. 4839 * static strings and all have proc handlers.
4816 */ 4840 */
4817 for (entry = *tablep; entry->mode; entry++) { 4841 for (entry = *tablep; entry->mode; entry++) {
4818 if (entry->child) 4842 if (entry->child)
4819 sd_free_ctl_entry(&entry->child); 4843 sd_free_ctl_entry(&entry->child);
4820 if (entry->proc_handler == NULL) 4844 if (entry->proc_handler == NULL)
4821 kfree(entry->procname); 4845 kfree(entry->procname);
4822 } 4846 }
4823 4847
4824 kfree(*tablep); 4848 kfree(*tablep);
4825 *tablep = NULL; 4849 *tablep = NULL;
4826 } 4850 }
4827 4851
4828 static int min_load_idx = 0; 4852 static int min_load_idx = 0;
4829 static int max_load_idx = CPU_LOAD_IDX_MAX-1; 4853 static int max_load_idx = CPU_LOAD_IDX_MAX-1;
4830 4854
4831 static void 4855 static void
4832 set_table_entry(struct ctl_table *entry, 4856 set_table_entry(struct ctl_table *entry,
4833 const char *procname, void *data, int maxlen, 4857 const char *procname, void *data, int maxlen,
4834 umode_t mode, proc_handler *proc_handler, 4858 umode_t mode, proc_handler *proc_handler,
4835 bool load_idx) 4859 bool load_idx)
4836 { 4860 {
4837 entry->procname = procname; 4861 entry->procname = procname;
4838 entry->data = data; 4862 entry->data = data;
4839 entry->maxlen = maxlen; 4863 entry->maxlen = maxlen;
4840 entry->mode = mode; 4864 entry->mode = mode;
4841 entry->proc_handler = proc_handler; 4865 entry->proc_handler = proc_handler;
4842 4866
4843 if (load_idx) { 4867 if (load_idx) {
4844 entry->extra1 = &min_load_idx; 4868 entry->extra1 = &min_load_idx;
4845 entry->extra2 = &max_load_idx; 4869 entry->extra2 = &max_load_idx;
4846 } 4870 }
4847 } 4871 }
4848 4872
4849 static struct ctl_table * 4873 static struct ctl_table *
4850 sd_alloc_ctl_domain_table(struct sched_domain *sd) 4874 sd_alloc_ctl_domain_table(struct sched_domain *sd)
4851 { 4875 {
4852 struct ctl_table *table = sd_alloc_ctl_entry(14); 4876 struct ctl_table *table = sd_alloc_ctl_entry(14);
4853 4877
4854 if (table == NULL) 4878 if (table == NULL)
4855 return NULL; 4879 return NULL;
4856 4880
4857 set_table_entry(&table[0], "min_interval", &sd->min_interval, 4881 set_table_entry(&table[0], "min_interval", &sd->min_interval,
4858 sizeof(long), 0644, proc_doulongvec_minmax, false); 4882 sizeof(long), 0644, proc_doulongvec_minmax, false);
4859 set_table_entry(&table[1], "max_interval", &sd->max_interval, 4883 set_table_entry(&table[1], "max_interval", &sd->max_interval,
4860 sizeof(long), 0644, proc_doulongvec_minmax, false); 4884 sizeof(long), 0644, proc_doulongvec_minmax, false);
4861 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 4885 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
4862 sizeof(int), 0644, proc_dointvec_minmax, true); 4886 sizeof(int), 0644, proc_dointvec_minmax, true);
4863 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 4887 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
4864 sizeof(int), 0644, proc_dointvec_minmax, true); 4888 sizeof(int), 0644, proc_dointvec_minmax, true);
4865 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 4889 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
4866 sizeof(int), 0644, proc_dointvec_minmax, true); 4890 sizeof(int), 0644, proc_dointvec_minmax, true);
4867 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 4891 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
4868 sizeof(int), 0644, proc_dointvec_minmax, true); 4892 sizeof(int), 0644, proc_dointvec_minmax, true);
4869 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 4893 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
4870 sizeof(int), 0644, proc_dointvec_minmax, true); 4894 sizeof(int), 0644, proc_dointvec_minmax, true);
4871 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 4895 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
4872 sizeof(int), 0644, proc_dointvec_minmax, false); 4896 sizeof(int), 0644, proc_dointvec_minmax, false);
4873 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 4897 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
4874 sizeof(int), 0644, proc_dointvec_minmax, false); 4898 sizeof(int), 0644, proc_dointvec_minmax, false);
4875 set_table_entry(&table[9], "cache_nice_tries", 4899 set_table_entry(&table[9], "cache_nice_tries",
4876 &sd->cache_nice_tries, 4900 &sd->cache_nice_tries,
4877 sizeof(int), 0644, proc_dointvec_minmax, false); 4901 sizeof(int), 0644, proc_dointvec_minmax, false);
4878 set_table_entry(&table[10], "flags", &sd->flags, 4902 set_table_entry(&table[10], "flags", &sd->flags,
4879 sizeof(int), 0644, proc_dointvec_minmax, false); 4903 sizeof(int), 0644, proc_dointvec_minmax, false);
4880 set_table_entry(&table[11], "max_newidle_lb_cost", 4904 set_table_entry(&table[11], "max_newidle_lb_cost",
4881 &sd->max_newidle_lb_cost, 4905 &sd->max_newidle_lb_cost,
4882 sizeof(long), 0644, proc_doulongvec_minmax, false); 4906 sizeof(long), 0644, proc_doulongvec_minmax, false);
4883 set_table_entry(&table[12], "name", sd->name, 4907 set_table_entry(&table[12], "name", sd->name,
4884 CORENAME_MAX_SIZE, 0444, proc_dostring, false); 4908 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
4885 /* &table[13] is terminator */ 4909 /* &table[13] is terminator */
4886 4910
4887 return table; 4911 return table;
4888 } 4912 }
4889 4913
4890 static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) 4914 static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
4891 { 4915 {
4892 struct ctl_table *entry, *table; 4916 struct ctl_table *entry, *table;
4893 struct sched_domain *sd; 4917 struct sched_domain *sd;
4894 int domain_num = 0, i; 4918 int domain_num = 0, i;
4895 char buf[32]; 4919 char buf[32];
4896 4920
4897 for_each_domain(cpu, sd) 4921 for_each_domain(cpu, sd)
4898 domain_num++; 4922 domain_num++;
4899 entry = table = sd_alloc_ctl_entry(domain_num + 1); 4923 entry = table = sd_alloc_ctl_entry(domain_num + 1);
4900 if (table == NULL) 4924 if (table == NULL)
4901 return NULL; 4925 return NULL;
4902 4926
4903 i = 0; 4927 i = 0;
4904 for_each_domain(cpu, sd) { 4928 for_each_domain(cpu, sd) {
4905 snprintf(buf, 32, "domain%d", i); 4929 snprintf(buf, 32, "domain%d", i);
4906 entry->procname = kstrdup(buf, GFP_KERNEL); 4930 entry->procname = kstrdup(buf, GFP_KERNEL);
4907 entry->mode = 0555; 4931 entry->mode = 0555;
4908 entry->child = sd_alloc_ctl_domain_table(sd); 4932 entry->child = sd_alloc_ctl_domain_table(sd);
4909 entry++; 4933 entry++;
4910 i++; 4934 i++;
4911 } 4935 }
4912 return table; 4936 return table;
4913 } 4937 }
4914 4938
4915 static struct ctl_table_header *sd_sysctl_header; 4939 static struct ctl_table_header *sd_sysctl_header;
4916 static void register_sched_domain_sysctl(void) 4940 static void register_sched_domain_sysctl(void)
4917 { 4941 {
4918 int i, cpu_num = num_possible_cpus(); 4942 int i, cpu_num = num_possible_cpus();
4919 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 4943 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
4920 char buf[32]; 4944 char buf[32];
4921 4945
4922 WARN_ON(sd_ctl_dir[0].child); 4946 WARN_ON(sd_ctl_dir[0].child);
4923 sd_ctl_dir[0].child = entry; 4947 sd_ctl_dir[0].child = entry;
4924 4948
4925 if (entry == NULL) 4949 if (entry == NULL)
4926 return; 4950 return;
4927 4951
4928 for_each_possible_cpu(i) { 4952 for_each_possible_cpu(i) {
4929 snprintf(buf, 32, "cpu%d", i); 4953 snprintf(buf, 32, "cpu%d", i);
4930 entry->procname = kstrdup(buf, GFP_KERNEL); 4954 entry->procname = kstrdup(buf, GFP_KERNEL);
4931 entry->mode = 0555; 4955 entry->mode = 0555;
4932 entry->child = sd_alloc_ctl_cpu_table(i); 4956 entry->child = sd_alloc_ctl_cpu_table(i);
4933 entry++; 4957 entry++;
4934 } 4958 }
4935 4959
4936 WARN_ON(sd_sysctl_header); 4960 WARN_ON(sd_sysctl_header);
4937 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 4961 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
4938 } 4962 }
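/*
 * Editor's note: an illustrative (abridged) view of the sysctl tree that the
 * code above constructs, with one directory per possible CPU, one per
 * sched_domain level, and the entries set up in sd_alloc_ctl_domain_table():
 *
 *   /proc/sys/kernel/sched_domain/cpu0/domain0/min_interval
 *   /proc/sys/kernel/sched_domain/cpu0/domain0/max_interval
 *   /proc/sys/kernel/sched_domain/cpu0/domain0/busy_idx
 *   ...
 *   /proc/sys/kernel/sched_domain/cpu0/domain1/name
 */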
4939 4963
4940 /* may be called multiple times per register */ 4964 /* may be called multiple times per register */
4941 static void unregister_sched_domain_sysctl(void) 4965 static void unregister_sched_domain_sysctl(void)
4942 { 4966 {
4943 if (sd_sysctl_header) 4967 if (sd_sysctl_header)
4944 unregister_sysctl_table(sd_sysctl_header); 4968 unregister_sysctl_table(sd_sysctl_header);
4945 sd_sysctl_header = NULL; 4969 sd_sysctl_header = NULL;
4946 if (sd_ctl_dir[0].child) 4970 if (sd_ctl_dir[0].child)
4947 sd_free_ctl_entry(&sd_ctl_dir[0].child); 4971 sd_free_ctl_entry(&sd_ctl_dir[0].child);
4948 } 4972 }
4949 #else 4973 #else
4950 static void register_sched_domain_sysctl(void) 4974 static void register_sched_domain_sysctl(void)
4951 { 4975 {
4952 } 4976 }
4953 static void unregister_sched_domain_sysctl(void) 4977 static void unregister_sched_domain_sysctl(void)
4954 { 4978 {
4955 } 4979 }
4956 #endif 4980 #endif
4957 4981
4958 static void set_rq_online(struct rq *rq) 4982 static void set_rq_online(struct rq *rq)
4959 { 4983 {
4960 if (!rq->online) { 4984 if (!rq->online) {
4961 const struct sched_class *class; 4985 const struct sched_class *class;
4962 4986
4963 cpumask_set_cpu(rq->cpu, rq->rd->online); 4987 cpumask_set_cpu(rq->cpu, rq->rd->online);
4964 rq->online = 1; 4988 rq->online = 1;
4965 4989
4966 for_each_class(class) { 4990 for_each_class(class) {
4967 if (class->rq_online) 4991 if (class->rq_online)
4968 class->rq_online(rq); 4992 class->rq_online(rq);
4969 } 4993 }
4970 } 4994 }
4971 } 4995 }
4972 4996
4973 static void set_rq_offline(struct rq *rq) 4997 static void set_rq_offline(struct rq *rq)
4974 { 4998 {
4975 if (rq->online) { 4999 if (rq->online) {
4976 const struct sched_class *class; 5000 const struct sched_class *class;
4977 5001
4978 for_each_class(class) { 5002 for_each_class(class) {
4979 if (class->rq_offline) 5003 if (class->rq_offline)
4980 class->rq_offline(rq); 5004 class->rq_offline(rq);
4981 } 5005 }
4982 5006
4983 cpumask_clear_cpu(rq->cpu, rq->rd->online); 5007 cpumask_clear_cpu(rq->cpu, rq->rd->online);
4984 rq->online = 0; 5008 rq->online = 0;
4985 } 5009 }
4986 } 5010 }
4987 5011
4988 /* 5012 /*
4989 * migration_call - callback that gets triggered when a CPU is added. 5013 * migration_call - callback that gets triggered when a CPU is added.
4990 * Here we can start up the necessary migration thread for the new CPU. 5014 * Here we can start up the necessary migration thread for the new CPU.
4991 */ 5015 */
4992 static int 5016 static int
4993 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5017 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
4994 { 5018 {
4995 int cpu = (long)hcpu; 5019 int cpu = (long)hcpu;
4996 unsigned long flags; 5020 unsigned long flags;
4997 struct rq *rq = cpu_rq(cpu); 5021 struct rq *rq = cpu_rq(cpu);
4998 5022
4999 switch (action & ~CPU_TASKS_FROZEN) { 5023 switch (action & ~CPU_TASKS_FROZEN) {
5000 5024
5001 case CPU_UP_PREPARE: 5025 case CPU_UP_PREPARE:
5002 rq->calc_load_update = calc_load_update; 5026 rq->calc_load_update = calc_load_update;
5003 break; 5027 break;
5004 5028
5005 case CPU_ONLINE: 5029 case CPU_ONLINE:
5006 /* Update our root-domain */ 5030 /* Update our root-domain */
5007 raw_spin_lock_irqsave(&rq->lock, flags); 5031 raw_spin_lock_irqsave(&rq->lock, flags);
5008 if (rq->rd) { 5032 if (rq->rd) {
5009 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5033 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5010 5034
5011 set_rq_online(rq); 5035 set_rq_online(rq);
5012 } 5036 }
5013 raw_spin_unlock_irqrestore(&rq->lock, flags); 5037 raw_spin_unlock_irqrestore(&rq->lock, flags);
5014 break; 5038 break;
5015 5039
5016 #ifdef CONFIG_HOTPLUG_CPU 5040 #ifdef CONFIG_HOTPLUG_CPU
5017 case CPU_DYING: 5041 case CPU_DYING:
5018 sched_ttwu_pending(); 5042 sched_ttwu_pending();
5019 /* Update our root-domain */ 5043 /* Update our root-domain */
5020 raw_spin_lock_irqsave(&rq->lock, flags); 5044 raw_spin_lock_irqsave(&rq->lock, flags);
5021 if (rq->rd) { 5045 if (rq->rd) {
5022 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5046 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5023 set_rq_offline(rq); 5047 set_rq_offline(rq);
5024 } 5048 }
5025 migrate_tasks(cpu); 5049 migrate_tasks(cpu);
5026 BUG_ON(rq->nr_running != 1); /* the migration thread */ 5050 BUG_ON(rq->nr_running != 1); /* the migration thread */
5027 raw_spin_unlock_irqrestore(&rq->lock, flags); 5051 raw_spin_unlock_irqrestore(&rq->lock, flags);
5028 break; 5052 break;
5029 5053
5030 case CPU_DEAD: 5054 case CPU_DEAD:
5031 calc_load_migrate(rq); 5055 calc_load_migrate(rq);
5032 break; 5056 break;
5033 #endif 5057 #endif
5034 } 5058 }
5035 5059
5036 update_max_interval(); 5060 update_max_interval();
5037 5061
5038 return NOTIFY_OK; 5062 return NOTIFY_OK;
5039 } 5063 }
5040 5064
5041 /* 5065 /*
5042 * Register at high priority so that task migration (migrate_all_tasks) 5066 * Register at high priority so that task migration (migrate_all_tasks)
5043 * happens before everything else. This has to be lower priority than 5067 * happens before everything else. This has to be lower priority than
5044 * the notifier in the perf_event subsystem, though. 5068 * the notifier in the perf_event subsystem, though.
5045 */ 5069 */
5046 static struct notifier_block migration_notifier = { 5070 static struct notifier_block migration_notifier = {
5047 .notifier_call = migration_call, 5071 .notifier_call = migration_call,
5048 .priority = CPU_PRI_MIGRATION, 5072 .priority = CPU_PRI_MIGRATION,
5049 }; 5073 };
5050 5074
5051 static int sched_cpu_active(struct notifier_block *nfb, 5075 static int sched_cpu_active(struct notifier_block *nfb,
5052 unsigned long action, void *hcpu) 5076 unsigned long action, void *hcpu)
5053 { 5077 {
5054 switch (action & ~CPU_TASKS_FROZEN) { 5078 switch (action & ~CPU_TASKS_FROZEN) {
5055 case CPU_STARTING:
5056 case CPU_DOWN_FAILED: 5079 case CPU_DOWN_FAILED:
5057 set_cpu_active((long)hcpu, true); 5080 set_cpu_active((long)hcpu, true);
5058 return NOTIFY_OK; 5081 return NOTIFY_OK;
5059 default: 5082 default:
5060 return NOTIFY_DONE; 5083 return NOTIFY_DONE;
5061 } 5084 }
5062 } 5085 }
5063 5086
5064 static int sched_cpu_inactive(struct notifier_block *nfb, 5087 static int sched_cpu_inactive(struct notifier_block *nfb,
5065 unsigned long action, void *hcpu) 5088 unsigned long action, void *hcpu)
5066 { 5089 {
5067 unsigned long flags; 5090 unsigned long flags;
5068 long cpu = (long)hcpu; 5091 long cpu = (long)hcpu;
5069 5092
5070 switch (action & ~CPU_TASKS_FROZEN) { 5093 switch (action & ~CPU_TASKS_FROZEN) {
5071 case CPU_DOWN_PREPARE: 5094 case CPU_DOWN_PREPARE:
5072 set_cpu_active(cpu, false); 5095 set_cpu_active(cpu, false);
5073 5096
5074 /* explicitly allow suspend */ 5097 /* explicitly allow suspend */
5075 if (!(action & CPU_TASKS_FROZEN)) { 5098 if (!(action & CPU_TASKS_FROZEN)) {
5076 struct dl_bw *dl_b = dl_bw_of(cpu); 5099 struct dl_bw *dl_b = dl_bw_of(cpu);
5077 bool overflow; 5100 bool overflow;
5078 int cpus; 5101 int cpus;
5079 5102
5080 raw_spin_lock_irqsave(&dl_b->lock, flags); 5103 raw_spin_lock_irqsave(&dl_b->lock, flags);
5081 cpus = dl_bw_cpus(cpu); 5104 cpus = dl_bw_cpus(cpu);
5082 overflow = __dl_overflow(dl_b, cpus, 0, 0); 5105 overflow = __dl_overflow(dl_b, cpus, 0, 0);
5083 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 5106 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5084 5107
5085 if (overflow) 5108 if (overflow)
5086 return notifier_from_errno(-EBUSY); 5109 return notifier_from_errno(-EBUSY);
5087 } 5110 }
5088 return NOTIFY_OK; 5111 return NOTIFY_OK;
5089 } 5112 }
5090 5113
5091 return NOTIFY_DONE; 5114 return NOTIFY_DONE;
5092 } 5115 }
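/*
 * Editor's note (assumption, paraphrasing __dl_overflow() from
 * kernel/sched/sched.h): with old_bw == new_bw == 0 the check above asks,
 * roughly, whether the SCHED_DEADLINE bandwidth already admitted still fits
 * on the CPUs that would remain after this one goes down; if not, the
 * hot-unplug is refused with -EBUSY rather than breaking admitted
 * reservations.
 */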
5093 5116
5094 static int __init migration_init(void) 5117 static int __init migration_init(void)
5095 { 5118 {
5096 void *cpu = (void *)(long)smp_processor_id(); 5119 void *cpu = (void *)(long)smp_processor_id();
5097 int err; 5120 int err;
5098 5121
5099 /* Initialize migration for the boot CPU */ 5122 /* Initialize migration for the boot CPU */
5100 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 5123 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5101 BUG_ON(err == NOTIFY_BAD); 5124 BUG_ON(err == NOTIFY_BAD);
5102 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5125 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5103 register_cpu_notifier(&migration_notifier); 5126 register_cpu_notifier(&migration_notifier);
5104 5127
5105 /* Register cpu active notifiers */ 5128 /* Register cpu active notifiers */
5106 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); 5129 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
5107 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); 5130 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
5108 5131
5109 return 0; 5132 return 0;
5110 } 5133 }
5111 early_initcall(migration_init); 5134 early_initcall(migration_init);
5112 #endif 5135 #endif
5113 5136
5114 #ifdef CONFIG_SMP 5137 #ifdef CONFIG_SMP
5115 5138
5116 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ 5139 static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
5117 5140
5118 #ifdef CONFIG_SCHED_DEBUG 5141 #ifdef CONFIG_SCHED_DEBUG
5119 5142
5120 static __read_mostly int sched_debug_enabled; 5143 static __read_mostly int sched_debug_enabled;
5121 5144
5122 static int __init sched_debug_setup(char *str) 5145 static int __init sched_debug_setup(char *str)
5123 { 5146 {
5124 sched_debug_enabled = 1; 5147 sched_debug_enabled = 1;
5125 5148
5126 return 0; 5149 return 0;
5127 } 5150 }
5128 early_param("sched_debug", sched_debug_setup); 5151 early_param("sched_debug", sched_debug_setup);
5129 5152
5130 static inline bool sched_debug(void) 5153 static inline bool sched_debug(void)
5131 { 5154 {
5132 return sched_debug_enabled; 5155 return sched_debug_enabled;
5133 } 5156 }
5134 5157
5135 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 5158 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5136 struct cpumask *groupmask) 5159 struct cpumask *groupmask)
5137 { 5160 {
5138 struct sched_group *group = sd->groups; 5161 struct sched_group *group = sd->groups;
5139 char str[256]; 5162 char str[256];
5140 5163
5141 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); 5164 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
5142 cpumask_clear(groupmask); 5165 cpumask_clear(groupmask);
5143 5166
5144 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 5167 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5145 5168
5146 if (!(sd->flags & SD_LOAD_BALANCE)) { 5169 if (!(sd->flags & SD_LOAD_BALANCE)) {
5147 printk("does not load-balance\n"); 5170 printk("does not load-balance\n");
5148 if (sd->parent) 5171 if (sd->parent)
5149 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 5172 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5150 " has parent"); 5173 " has parent");
5151 return -1; 5174 return -1;
5152 } 5175 }
5153 5176
5154 printk(KERN_CONT "span %s level %s\n", str, sd->name); 5177 printk(KERN_CONT "span %s level %s\n", str, sd->name);
5155 5178
5156 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 5179 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5157 printk(KERN_ERR "ERROR: domain->span does not contain " 5180 printk(KERN_ERR "ERROR: domain->span does not contain "
5158 "CPU%d\n", cpu); 5181 "CPU%d\n", cpu);
5159 } 5182 }
5160 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 5183 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
5161 printk(KERN_ERR "ERROR: domain->groups does not contain" 5184 printk(KERN_ERR "ERROR: domain->groups does not contain"
5162 " CPU%d\n", cpu); 5185 " CPU%d\n", cpu);
5163 } 5186 }
5164 5187
5165 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 5188 printk(KERN_DEBUG "%*s groups:", level + 1, "");
5166 do { 5189 do {
5167 if (!group) { 5190 if (!group) {
5168 printk("\n"); 5191 printk("\n");
5169 printk(KERN_ERR "ERROR: group is NULL\n"); 5192 printk(KERN_ERR "ERROR: group is NULL\n");
5170 break; 5193 break;
5171 } 5194 }
5172 5195
5173 /* 5196 /*
5174 * Even though we initialize ->power to something semi-sane, 5197 * Even though we initialize ->power to something semi-sane,
5175 * we leave power_orig unset. This allows us to detect if 5198 * we leave power_orig unset. This allows us to detect if
5176 * domain iteration is still funny without causing /0 traps. 5199 * domain iteration is still funny without causing /0 traps.
5177 */ 5200 */
5178 if (!group->sgp->power_orig) { 5201 if (!group->sgp->power_orig) {
5179 printk(KERN_CONT "\n"); 5202 printk(KERN_CONT "\n");
5180 printk(KERN_ERR "ERROR: domain->cpu_power not " 5203 printk(KERN_ERR "ERROR: domain->cpu_power not "
5181 "set\n"); 5204 "set\n");
5182 break; 5205 break;
5183 } 5206 }
5184 5207
5185 if (!cpumask_weight(sched_group_cpus(group))) { 5208 if (!cpumask_weight(sched_group_cpus(group))) {
5186 printk(KERN_CONT "\n"); 5209 printk(KERN_CONT "\n");
5187 printk(KERN_ERR "ERROR: empty group\n"); 5210 printk(KERN_ERR "ERROR: empty group\n");
5188 break; 5211 break;
5189 } 5212 }
5190 5213
5191 if (!(sd->flags & SD_OVERLAP) && 5214 if (!(sd->flags & SD_OVERLAP) &&
5192 cpumask_intersects(groupmask, sched_group_cpus(group))) { 5215 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5193 printk(KERN_CONT "\n"); 5216 printk(KERN_CONT "\n");
5194 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5217 printk(KERN_ERR "ERROR: repeated CPUs\n");
5195 break; 5218 break;
5196 } 5219 }
5197 5220
5198 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 5221 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
5199 5222
5200 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 5223 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
5201 5224
5202 printk(KERN_CONT " %s", str); 5225 printk(KERN_CONT " %s", str);
5203 if (group->sgp->power != SCHED_POWER_SCALE) { 5226 if (group->sgp->power != SCHED_POWER_SCALE) {
5204 printk(KERN_CONT " (cpu_power = %d)", 5227 printk(KERN_CONT " (cpu_power = %d)",
5205 group->sgp->power); 5228 group->sgp->power);
5206 } 5229 }
5207 5230
5208 group = group->next; 5231 group = group->next;
5209 } while (group != sd->groups); 5232 } while (group != sd->groups);
5210 printk(KERN_CONT "\n"); 5233 printk(KERN_CONT "\n");
5211 5234
5212 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 5235 if (!cpumask_equal(sched_domain_span(sd), groupmask))
5213 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 5236 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
5214 5237
5215 if (sd->parent && 5238 if (sd->parent &&
5216 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 5239 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
5217 printk(KERN_ERR "ERROR: parent span is not a superset " 5240 printk(KERN_ERR "ERROR: parent span is not a superset "
5218 "of domain->span\n"); 5241 "of domain->span\n");
5219 return 0; 5242 return 0;
5220 } 5243 }
5221 5244
5222 static void sched_domain_debug(struct sched_domain *sd, int cpu) 5245 static void sched_domain_debug(struct sched_domain *sd, int cpu)
5223 { 5246 {
5224 int level = 0; 5247 int level = 0;
5225 5248
5226 if (!sched_debug_enabled) 5249 if (!sched_debug_enabled)
5227 return; 5250 return;
5228 5251
5229 if (!sd) { 5252 if (!sd) {
5230 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 5253 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5231 return; 5254 return;
5232 } 5255 }
5233 5256
5234 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 5257 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5235 5258
5236 for (;;) { 5259 for (;;) {
5237 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) 5260 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
5238 break; 5261 break;
5239 level++; 5262 level++;
5240 sd = sd->parent; 5263 sd = sd->parent;
5241 if (!sd) 5264 if (!sd)
5242 break; 5265 break;
5243 } 5266 }
5244 } 5267 }
5245 #else /* !CONFIG_SCHED_DEBUG */ 5268 #else /* !CONFIG_SCHED_DEBUG */
5246 # define sched_domain_debug(sd, cpu) do { } while (0) 5269 # define sched_domain_debug(sd, cpu) do { } while (0)
5247 static inline bool sched_debug(void) 5270 static inline bool sched_debug(void)
5248 { 5271 {
5249 return false; 5272 return false;
5250 } 5273 }
5251 #endif /* CONFIG_SCHED_DEBUG */ 5274 #endif /* CONFIG_SCHED_DEBUG */
5252 5275
5253 static int sd_degenerate(struct sched_domain *sd) 5276 static int sd_degenerate(struct sched_domain *sd)
5254 { 5277 {
5255 if (cpumask_weight(sched_domain_span(sd)) == 1) 5278 if (cpumask_weight(sched_domain_span(sd)) == 1)
5256 return 1; 5279 return 1;
5257 5280
5258 /* Following flags need at least 2 groups */ 5281 /* Following flags need at least 2 groups */
5259 if (sd->flags & (SD_LOAD_BALANCE | 5282 if (sd->flags & (SD_LOAD_BALANCE |
5260 SD_BALANCE_NEWIDLE | 5283 SD_BALANCE_NEWIDLE |
5261 SD_BALANCE_FORK | 5284 SD_BALANCE_FORK |
5262 SD_BALANCE_EXEC | 5285 SD_BALANCE_EXEC |
5263 SD_SHARE_CPUPOWER | 5286 SD_SHARE_CPUPOWER |
5264 SD_SHARE_PKG_RESOURCES)) { 5287 SD_SHARE_PKG_RESOURCES)) {
5265 if (sd->groups != sd->groups->next) 5288 if (sd->groups != sd->groups->next)
5266 return 0; 5289 return 0;
5267 } 5290 }
5268 5291
5269 /* Following flags don't use groups */ 5292 /* Following flags don't use groups */
5270 if (sd->flags & (SD_WAKE_AFFINE)) 5293 if (sd->flags & (SD_WAKE_AFFINE))
5271 return 0; 5294 return 0;
5272 5295
5273 return 1; 5296 return 1;
5274 } 5297 }
5275 5298
5276 static int 5299 static int
5277 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 5300 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5278 { 5301 {
5279 unsigned long cflags = sd->flags, pflags = parent->flags; 5302 unsigned long cflags = sd->flags, pflags = parent->flags;
5280 5303
5281 if (sd_degenerate(parent)) 5304 if (sd_degenerate(parent))
5282 return 1; 5305 return 1;
5283 5306
5284 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 5307 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
5285 return 0; 5308 return 0;
5286 5309
5287 /* Flags needing groups don't count if only 1 group in parent */ 5310 /* Flags needing groups don't count if only 1 group in parent */
5288 if (parent->groups == parent->groups->next) { 5311 if (parent->groups == parent->groups->next) {
5289 pflags &= ~(SD_LOAD_BALANCE | 5312 pflags &= ~(SD_LOAD_BALANCE |
5290 SD_BALANCE_NEWIDLE | 5313 SD_BALANCE_NEWIDLE |
5291 SD_BALANCE_FORK | 5314 SD_BALANCE_FORK |
5292 SD_BALANCE_EXEC | 5315 SD_BALANCE_EXEC |
5293 SD_SHARE_CPUPOWER | 5316 SD_SHARE_CPUPOWER |
5294 SD_SHARE_PKG_RESOURCES | 5317 SD_SHARE_PKG_RESOURCES |
5295 SD_PREFER_SIBLING); 5318 SD_PREFER_SIBLING);
5296 if (nr_node_ids == 1) 5319 if (nr_node_ids == 1)
5297 pflags &= ~SD_SERIALIZE; 5320 pflags &= ~SD_SERIALIZE;
5298 } 5321 }
5299 if (~cflags & pflags) 5322 if (~cflags & pflags)
5300 return 0; 5323 return 0;
5301 5324
5302 return 1; 5325 return 1;
5303 } 5326 }
5304 5327
5305 static void free_rootdomain(struct rcu_head *rcu) 5328 static void free_rootdomain(struct rcu_head *rcu)
5306 { 5329 {
5307 struct root_domain *rd = container_of(rcu, struct root_domain, rcu); 5330 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
5308 5331
5309 cpupri_cleanup(&rd->cpupri); 5332 cpupri_cleanup(&rd->cpupri);
5310 cpudl_cleanup(&rd->cpudl); 5333 cpudl_cleanup(&rd->cpudl);
5311 free_cpumask_var(rd->dlo_mask); 5334 free_cpumask_var(rd->dlo_mask);
5312 free_cpumask_var(rd->rto_mask); 5335 free_cpumask_var(rd->rto_mask);
5313 free_cpumask_var(rd->online); 5336 free_cpumask_var(rd->online);
5314 free_cpumask_var(rd->span); 5337 free_cpumask_var(rd->span);
5315 kfree(rd); 5338 kfree(rd);
5316 } 5339 }
5317 5340
5318 static void rq_attach_root(struct rq *rq, struct root_domain *rd) 5341 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
5319 { 5342 {
5320 struct root_domain *old_rd = NULL; 5343 struct root_domain *old_rd = NULL;
5321 unsigned long flags; 5344 unsigned long flags;
5322 5345
5323 raw_spin_lock_irqsave(&rq->lock, flags); 5346 raw_spin_lock_irqsave(&rq->lock, flags);
5324 5347
5325 if (rq->rd) { 5348 if (rq->rd) {
5326 old_rd = rq->rd; 5349 old_rd = rq->rd;
5327 5350
5328 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 5351 if (cpumask_test_cpu(rq->cpu, old_rd->online))
5329 set_rq_offline(rq); 5352 set_rq_offline(rq);
5330 5353
5331 cpumask_clear_cpu(rq->cpu, old_rd->span); 5354 cpumask_clear_cpu(rq->cpu, old_rd->span);
5332 5355
5333 /* 5356 /*
5334 * If we don't want to free the old_rd yet then 5357 * If we don't want to free the old_rd yet then
5335 * set old_rd to NULL to skip the freeing later 5358 * set old_rd to NULL to skip the freeing later
5336 * in this function: 5359 * in this function:
5337 */ 5360 */
5338 if (!atomic_dec_and_test(&old_rd->refcount)) 5361 if (!atomic_dec_and_test(&old_rd->refcount))
5339 old_rd = NULL; 5362 old_rd = NULL;
5340 } 5363 }
5341 5364
5342 atomic_inc(&rd->refcount); 5365 atomic_inc(&rd->refcount);
5343 rq->rd = rd; 5366 rq->rd = rd;
5344 5367
5345 cpumask_set_cpu(rq->cpu, rd->span); 5368 cpumask_set_cpu(rq->cpu, rd->span);
5346 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 5369 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
5347 set_rq_online(rq); 5370 set_rq_online(rq);
5348 5371
5349 raw_spin_unlock_irqrestore(&rq->lock, flags); 5372 raw_spin_unlock_irqrestore(&rq->lock, flags);
5350 5373
5351 if (old_rd) 5374 if (old_rd)
5352 call_rcu_sched(&old_rd->rcu, free_rootdomain); 5375 call_rcu_sched(&old_rd->rcu, free_rootdomain);
5353 } 5376 }
5354 5377
5355 static int init_rootdomain(struct root_domain *rd) 5378 static int init_rootdomain(struct root_domain *rd)
5356 { 5379 {
5357 memset(rd, 0, sizeof(*rd)); 5380 memset(rd, 0, sizeof(*rd));
5358 5381
5359 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) 5382 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
5360 goto out; 5383 goto out;
5361 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) 5384 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
5362 goto free_span; 5385 goto free_span;
5363 if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) 5386 if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
5364 goto free_online; 5387 goto free_online;
5365 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) 5388 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
5366 goto free_dlo_mask; 5389 goto free_dlo_mask;
5367 5390
5368 init_dl_bw(&rd->dl_bw); 5391 init_dl_bw(&rd->dl_bw);
5369 if (cpudl_init(&rd->cpudl) != 0) 5392 if (cpudl_init(&rd->cpudl) != 0)
5370 goto free_dlo_mask; 5393 goto free_dlo_mask;
5371 5394
5372 if (cpupri_init(&rd->cpupri) != 0) 5395 if (cpupri_init(&rd->cpupri) != 0)
5373 goto free_rto_mask; 5396 goto free_rto_mask;
5374 return 0; 5397 return 0;
5375 5398
5376 free_rto_mask: 5399 free_rto_mask:
5377 free_cpumask_var(rd->rto_mask); 5400 free_cpumask_var(rd->rto_mask);
5378 free_dlo_mask: 5401 free_dlo_mask:
5379 free_cpumask_var(rd->dlo_mask); 5402 free_cpumask_var(rd->dlo_mask);
5380 free_online: 5403 free_online:
5381 free_cpumask_var(rd->online); 5404 free_cpumask_var(rd->online);
5382 free_span: 5405 free_span:
5383 free_cpumask_var(rd->span); 5406 free_cpumask_var(rd->span);
5384 out: 5407 out:
5385 return -ENOMEM; 5408 return -ENOMEM;
5386 } 5409 }
5387 5410
5388 /* 5411 /*
5389 * By default the system creates a single root-domain with all cpus as 5412 * By default the system creates a single root-domain with all cpus as
5390 * members (mimicking the global state we have today). 5413 * members (mimicking the global state we have today).
5391 */ 5414 */
5392 struct root_domain def_root_domain; 5415 struct root_domain def_root_domain;
5393 5416
5394 static void init_defrootdomain(void) 5417 static void init_defrootdomain(void)
5395 { 5418 {
5396 init_rootdomain(&def_root_domain); 5419 init_rootdomain(&def_root_domain);
5397 5420
5398 atomic_set(&def_root_domain.refcount, 1); 5421 atomic_set(&def_root_domain.refcount, 1);
5399 } 5422 }
5400 5423
5401 static struct root_domain *alloc_rootdomain(void) 5424 static struct root_domain *alloc_rootdomain(void)
5402 { 5425 {
5403 struct root_domain *rd; 5426 struct root_domain *rd;
5404 5427
5405 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 5428 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
5406 if (!rd) 5429 if (!rd)
5407 return NULL; 5430 return NULL;
5408 5431
5409 if (init_rootdomain(rd) != 0) { 5432 if (init_rootdomain(rd) != 0) {
5410 kfree(rd); 5433 kfree(rd);
5411 return NULL; 5434 return NULL;
5412 } 5435 }
5413 5436
5414 return rd; 5437 return rd;
5415 } 5438 }
5416 5439
5417 static void free_sched_groups(struct sched_group *sg, int free_sgp) 5440 static void free_sched_groups(struct sched_group *sg, int free_sgp)
5418 { 5441 {
5419 struct sched_group *tmp, *first; 5442 struct sched_group *tmp, *first;
5420 5443
5421 if (!sg) 5444 if (!sg)
5422 return; 5445 return;
5423 5446
5424 first = sg; 5447 first = sg;
5425 do { 5448 do {
5426 tmp = sg->next; 5449 tmp = sg->next;
5427 5450
5428 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) 5451 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
5429 kfree(sg->sgp); 5452 kfree(sg->sgp);
5430 5453
5431 kfree(sg); 5454 kfree(sg);
5432 sg = tmp; 5455 sg = tmp;
5433 } while (sg != first); 5456 } while (sg != first);
5434 } 5457 }
5435 5458
5436 static void free_sched_domain(struct rcu_head *rcu) 5459 static void free_sched_domain(struct rcu_head *rcu)
5437 { 5460 {
5438 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); 5461 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5439 5462
5440 /* 5463 /*
5441 * If it's an overlapping domain it has private groups, iterate and 5464 * If it's an overlapping domain it has private groups, iterate and
5442 * nuke them all. 5465 * nuke them all.
5443 */ 5466 */
5444 if (sd->flags & SD_OVERLAP) { 5467 if (sd->flags & SD_OVERLAP) {
5445 free_sched_groups(sd->groups, 1); 5468 free_sched_groups(sd->groups, 1);
5446 } else if (atomic_dec_and_test(&sd->groups->ref)) { 5469 } else if (atomic_dec_and_test(&sd->groups->ref)) {
5447 kfree(sd->groups->sgp); 5470 kfree(sd->groups->sgp);
5448 kfree(sd->groups); 5471 kfree(sd->groups);
5449 } 5472 }
5450 kfree(sd); 5473 kfree(sd);
5451 } 5474 }
5452 5475
5453 static void destroy_sched_domain(struct sched_domain *sd, int cpu) 5476 static void destroy_sched_domain(struct sched_domain *sd, int cpu)
5454 { 5477 {
5455 call_rcu(&sd->rcu, free_sched_domain); 5478 call_rcu(&sd->rcu, free_sched_domain);
5456 } 5479 }
5457 5480
5458 static void destroy_sched_domains(struct sched_domain *sd, int cpu) 5481 static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5459 { 5482 {
5460 for (; sd; sd = sd->parent) 5483 for (; sd; sd = sd->parent)
5461 destroy_sched_domain(sd, cpu); 5484 destroy_sched_domain(sd, cpu);
5462 } 5485 }
5463 5486
5464 /* 5487 /*
5465 * Keep a special pointer to the highest sched_domain that has 5488 * Keep a special pointer to the highest sched_domain that has
5466 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this 5489 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
5467 * allows us to avoid some pointer chasing in select_idle_sibling(). 5490 * allows us to avoid some pointer chasing in select_idle_sibling().
5468 * 5491 *
5469 * Also keep a unique ID per domain (we use the first cpu number in 5492 * Also keep a unique ID per domain (we use the first cpu number in
5470 * the cpumask of the domain); this allows us to quickly tell if 5493 * the cpumask of the domain); this allows us to quickly tell if
5471 * two cpus are in the same cache domain; see cpus_share_cache(). 5494 * two cpus are in the same cache domain; see cpus_share_cache().
5472 */ 5495 */
5473 DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5496 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5474 DEFINE_PER_CPU(int, sd_llc_size); 5497 DEFINE_PER_CPU(int, sd_llc_size);
5475 DEFINE_PER_CPU(int, sd_llc_id); 5498 DEFINE_PER_CPU(int, sd_llc_id);
5476 DEFINE_PER_CPU(struct sched_domain *, sd_numa); 5499 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
5477 DEFINE_PER_CPU(struct sched_domain *, sd_busy); 5500 DEFINE_PER_CPU(struct sched_domain *, sd_busy);
5478 DEFINE_PER_CPU(struct sched_domain *, sd_asym); 5501 DEFINE_PER_CPU(struct sched_domain *, sd_asym);
5479 5502
5480 static void update_top_cache_domain(int cpu) 5503 static void update_top_cache_domain(int cpu)
5481 { 5504 {
5482 struct sched_domain *sd; 5505 struct sched_domain *sd;
5483 struct sched_domain *busy_sd = NULL; 5506 struct sched_domain *busy_sd = NULL;
5484 int id = cpu; 5507 int id = cpu;
5485 int size = 1; 5508 int size = 1;
5486 5509
5487 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); 5510 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5488 if (sd) { 5511 if (sd) {
5489 id = cpumask_first(sched_domain_span(sd)); 5512 id = cpumask_first(sched_domain_span(sd));
5490 size = cpumask_weight(sched_domain_span(sd)); 5513 size = cpumask_weight(sched_domain_span(sd));
5491 busy_sd = sd->parent; /* sd_busy */ 5514 busy_sd = sd->parent; /* sd_busy */
5492 } 5515 }
5493 rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd); 5516 rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
5494 5517
5495 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 5518 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5496 per_cpu(sd_llc_size, cpu) = size; 5519 per_cpu(sd_llc_size, cpu) = size;
5497 per_cpu(sd_llc_id, cpu) = id; 5520 per_cpu(sd_llc_id, cpu) = id;
5498 5521
5499 sd = lowest_flag_domain(cpu, SD_NUMA); 5522 sd = lowest_flag_domain(cpu, SD_NUMA);
5500 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); 5523 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
5501 5524
5502 sd = highest_flag_domain(cpu, SD_ASYM_PACKING); 5525 sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
5503 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); 5526 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
5504 } 5527 }
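/*
 * [Editor's note -- illustrative sketch, not part of the kernel source
 * shown in this diff]
 * The cached per-cpu sd_llc_id written above is what makes the "are these
 * two cpus in the same cache domain?" test cheap: instead of walking the
 * domain tree, a helper along the lines of cpus_share_cache() (defined
 * elsewhere in this file) only has to compare two integers:
 *
 *	bool cpus_share_cache(int this_cpu, int that_cpu)
 *	{
 *		return per_cpu(sd_llc_id, this_cpu) ==
 *		       per_cpu(sd_llc_id, that_cpu);
 *	}
 */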
5505 5528
5506 /* 5529 /*
5507 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5530 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5508 * hold the hotplug lock. 5531 * hold the hotplug lock.
5509 */ 5532 */
5510 static void 5533 static void
5511 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 5534 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5512 { 5535 {
5513 struct rq *rq = cpu_rq(cpu); 5536 struct rq *rq = cpu_rq(cpu);
5514 struct sched_domain *tmp; 5537 struct sched_domain *tmp;
5515 5538
5516 /* Remove the sched domains which do not contribute to scheduling. */ 5539 /* Remove the sched domains which do not contribute to scheduling. */
5517 for (tmp = sd; tmp; ) { 5540 for (tmp = sd; tmp; ) {
5518 struct sched_domain *parent = tmp->parent; 5541 struct sched_domain *parent = tmp->parent;
5519 if (!parent) 5542 if (!parent)
5520 break; 5543 break;
5521 5544
5522 if (sd_parent_degenerate(tmp, parent)) { 5545 if (sd_parent_degenerate(tmp, parent)) {
5523 tmp->parent = parent->parent; 5546 tmp->parent = parent->parent;
5524 if (parent->parent) 5547 if (parent->parent)
5525 parent->parent->child = tmp; 5548 parent->parent->child = tmp;
5526 /* 5549 /*
5527 * Transfer SD_PREFER_SIBLING down in case of a 5550 * Transfer SD_PREFER_SIBLING down in case of a
5528 * degenerate parent; the spans match for this 5551 * degenerate parent; the spans match for this
5529 * so the property transfers. 5552 * so the property transfers.
5530 */ 5553 */
5531 if (parent->flags & SD_PREFER_SIBLING) 5554 if (parent->flags & SD_PREFER_SIBLING)
5532 tmp->flags |= SD_PREFER_SIBLING; 5555 tmp->flags |= SD_PREFER_SIBLING;
5533 destroy_sched_domain(parent, cpu); 5556 destroy_sched_domain(parent, cpu);
5534 } else 5557 } else
5535 tmp = tmp->parent; 5558 tmp = tmp->parent;
5536 } 5559 }
5537 5560
5538 if (sd && sd_degenerate(sd)) { 5561 if (sd && sd_degenerate(sd)) {
5539 tmp = sd; 5562 tmp = sd;
5540 sd = sd->parent; 5563 sd = sd->parent;
5541 destroy_sched_domain(tmp, cpu); 5564 destroy_sched_domain(tmp, cpu);
5542 if (sd) 5565 if (sd)
5543 sd->child = NULL; 5566 sd->child = NULL;
5544 } 5567 }
5545 5568
5546 sched_domain_debug(sd, cpu); 5569 sched_domain_debug(sd, cpu);
5547 5570
5548 rq_attach_root(rq, rd); 5571 rq_attach_root(rq, rd);
5549 tmp = rq->sd; 5572 tmp = rq->sd;
5550 rcu_assign_pointer(rq->sd, sd); 5573 rcu_assign_pointer(rq->sd, sd);
5551 destroy_sched_domains(tmp, cpu); 5574 destroy_sched_domains(tmp, cpu);
5552 5575
5553 update_top_cache_domain(cpu); 5576 update_top_cache_domain(cpu);
5554 } 5577 }
5555 5578
5556 /* cpus with isolated domains */ 5579 /* cpus with isolated domains */
5557 static cpumask_var_t cpu_isolated_map; 5580 static cpumask_var_t cpu_isolated_map;
5558 5581
5559 /* Setup the mask of cpus configured for isolated domains */ 5582 /* Setup the mask of cpus configured for isolated domains */
5560 static int __init isolated_cpu_setup(char *str) 5583 static int __init isolated_cpu_setup(char *str)
5561 { 5584 {
5562 alloc_bootmem_cpumask_var(&cpu_isolated_map); 5585 alloc_bootmem_cpumask_var(&cpu_isolated_map);
5563 cpulist_parse(str, cpu_isolated_map); 5586 cpulist_parse(str, cpu_isolated_map);
5564 return 1; 5587 return 1;
5565 } 5588 }
5566 5589
5567 __setup("isolcpus=", isolated_cpu_setup); 5590 __setup("isolcpus=", isolated_cpu_setup);
5568 5591
5569 static const struct cpumask *cpu_cpu_mask(int cpu) 5592 static const struct cpumask *cpu_cpu_mask(int cpu)
5570 { 5593 {
5571 return cpumask_of_node(cpu_to_node(cpu)); 5594 return cpumask_of_node(cpu_to_node(cpu));
5572 } 5595 }
5573 5596
5574 struct sd_data { 5597 struct sd_data {
5575 struct sched_domain **__percpu sd; 5598 struct sched_domain **__percpu sd;
5576 struct sched_group **__percpu sg; 5599 struct sched_group **__percpu sg;
5577 struct sched_group_power **__percpu sgp; 5600 struct sched_group_power **__percpu sgp;
5578 }; 5601 };
5579 5602
5580 struct s_data { 5603 struct s_data {
5581 struct sched_domain ** __percpu sd; 5604 struct sched_domain ** __percpu sd;
5582 struct root_domain *rd; 5605 struct root_domain *rd;
5583 }; 5606 };
5584 5607
5585 enum s_alloc { 5608 enum s_alloc {
5586 sa_rootdomain, 5609 sa_rootdomain,
5587 sa_sd, 5610 sa_sd,
5588 sa_sd_storage, 5611 sa_sd_storage,
5589 sa_none, 5612 sa_none,
5590 }; 5613 };
5591 5614
5592 struct sched_domain_topology_level; 5615 struct sched_domain_topology_level;
5593 5616
5594 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); 5617 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
5595 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); 5618 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
5596 5619
5597 #define SDTL_OVERLAP 0x01 5620 #define SDTL_OVERLAP 0x01
5598 5621
5599 struct sched_domain_topology_level { 5622 struct sched_domain_topology_level {
5600 sched_domain_init_f init; 5623 sched_domain_init_f init;
5601 sched_domain_mask_f mask; 5624 sched_domain_mask_f mask;
5602 int flags; 5625 int flags;
5603 int numa_level; 5626 int numa_level;
5604 struct sd_data data; 5627 struct sd_data data;
5605 }; 5628 };
5606 5629
5607 /* 5630 /*
5608 * Build an iteration mask that can exclude certain CPUs from the upwards 5631 * Build an iteration mask that can exclude certain CPUs from the upwards
5609 * domain traversal. 5632 * domain traversal.
5610 * 5633 *
5611 * Asymmetric node setups can result in situations where the domain tree is of 5634 * Asymmetric node setups can result in situations where the domain tree is of
5612 * unequal depth; make sure to skip domains that already cover the entire 5635 * unequal depth; make sure to skip domains that already cover the entire
5613 * range. 5636 * range.
5614 * 5637 *
5615 * In that case build_sched_domains() will have terminated the iteration early 5638 * In that case build_sched_domains() will have terminated the iteration early
5616 * and our sibling sd spans will be empty. Domains should always include the 5639 * and our sibling sd spans will be empty. Domains should always include the
5617 * cpu they're built on, so check that. 5640 * cpu they're built on, so check that.
5618 * 5641 *
5619 */ 5642 */
5620 static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) 5643 static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
5621 { 5644 {
5622 const struct cpumask *span = sched_domain_span(sd); 5645 const struct cpumask *span = sched_domain_span(sd);
5623 struct sd_data *sdd = sd->private; 5646 struct sd_data *sdd = sd->private;
5624 struct sched_domain *sibling; 5647 struct sched_domain *sibling;
5625 int i; 5648 int i;
5626 5649
5627 for_each_cpu(i, span) { 5650 for_each_cpu(i, span) {
5628 sibling = *per_cpu_ptr(sdd->sd, i); 5651 sibling = *per_cpu_ptr(sdd->sd, i);
5629 if (!cpumask_test_cpu(i, sched_domain_span(sibling))) 5652 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
5630 continue; 5653 continue;
5631 5654
5632 cpumask_set_cpu(i, sched_group_mask(sg)); 5655 cpumask_set_cpu(i, sched_group_mask(sg));
5633 } 5656 }
5634 } 5657 }
5635 5658
5636 /* 5659 /*
5637 * Return the canonical balance cpu for this group; this is the first cpu 5660 * Return the canonical balance cpu for this group; this is the first cpu
5638 * of this group that's also in the iteration mask. 5661 * of this group that's also in the iteration mask.
5639 */ 5662 */
5640 int group_balance_cpu(struct sched_group *sg) 5663 int group_balance_cpu(struct sched_group *sg)
5641 { 5664 {
5642 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); 5665 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
5643 } 5666 }
5644 5667
5645 static int 5668 static int
5646 build_overlap_sched_groups(struct sched_domain *sd, int cpu) 5669 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5647 { 5670 {
5648 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; 5671 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
5649 const struct cpumask *span = sched_domain_span(sd); 5672 const struct cpumask *span = sched_domain_span(sd);
5650 struct cpumask *covered = sched_domains_tmpmask; 5673 struct cpumask *covered = sched_domains_tmpmask;
5651 struct sd_data *sdd = sd->private; 5674 struct sd_data *sdd = sd->private;
5652 struct sched_domain *child; 5675 struct sched_domain *child;
5653 int i; 5676 int i;
5654 5677
5655 cpumask_clear(covered); 5678 cpumask_clear(covered);
5656 5679
5657 for_each_cpu(i, span) { 5680 for_each_cpu(i, span) {
5658 struct cpumask *sg_span; 5681 struct cpumask *sg_span;
5659 5682
5660 if (cpumask_test_cpu(i, covered)) 5683 if (cpumask_test_cpu(i, covered))
5661 continue; 5684 continue;
5662 5685
5663 child = *per_cpu_ptr(sdd->sd, i); 5686 child = *per_cpu_ptr(sdd->sd, i);
5664 5687
5665 /* See the comment near build_group_mask(). */ 5688 /* See the comment near build_group_mask(). */
5666 if (!cpumask_test_cpu(i, sched_domain_span(child))) 5689 if (!cpumask_test_cpu(i, sched_domain_span(child)))
5667 continue; 5690 continue;
5668 5691
5669 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5692 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
5670 GFP_KERNEL, cpu_to_node(cpu)); 5693 GFP_KERNEL, cpu_to_node(cpu));
5671 5694
5672 if (!sg) 5695 if (!sg)
5673 goto fail; 5696 goto fail;
5674 5697
5675 sg_span = sched_group_cpus(sg); 5698 sg_span = sched_group_cpus(sg);
5676 if (child->child) { 5699 if (child->child) {
5677 child = child->child; 5700 child = child->child;
5678 cpumask_copy(sg_span, sched_domain_span(child)); 5701 cpumask_copy(sg_span, sched_domain_span(child));
5679 } else 5702 } else
5680 cpumask_set_cpu(i, sg_span); 5703 cpumask_set_cpu(i, sg_span);
5681 5704
5682 cpumask_or(covered, covered, sg_span); 5705 cpumask_or(covered, covered, sg_span);
5683 5706
5684 sg->sgp = *per_cpu_ptr(sdd->sgp, i); 5707 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
5685 if (atomic_inc_return(&sg->sgp->ref) == 1) 5708 if (atomic_inc_return(&sg->sgp->ref) == 1)
5686 build_group_mask(sd, sg); 5709 build_group_mask(sd, sg);
5687 5710
5688 /* 5711 /*
5689 * Initialize sgp->power such that even if we mess up the 5712 * Initialize sgp->power such that even if we mess up the
5690 * domains and no possible iteration will get us here, we won't 5713 * domains and no possible iteration will get us here, we won't
5691 * die on a /0 trap. 5714 * die on a /0 trap.
5692 */ 5715 */
5693 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); 5716 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
5694 sg->sgp->power_orig = sg->sgp->power; 5717 sg->sgp->power_orig = sg->sgp->power;
5695 5718
5696 /* 5719 /*
5697 * Make sure the first group of this domain contains the 5720 * Make sure the first group of this domain contains the
5698 * canonical balance cpu. Otherwise the sched_domain iteration 5721 * canonical balance cpu. Otherwise the sched_domain iteration
5699 * breaks. See update_sg_lb_stats(). 5722 * breaks. See update_sg_lb_stats().
5700 */ 5723 */
5701 if ((!groups && cpumask_test_cpu(cpu, sg_span)) || 5724 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
5702 group_balance_cpu(sg) == cpu) 5725 group_balance_cpu(sg) == cpu)
5703 groups = sg; 5726 groups = sg;
5704 5727
5705 if (!first) 5728 if (!first)
5706 first = sg; 5729 first = sg;
5707 if (last) 5730 if (last)
5708 last->next = sg; 5731 last->next = sg;
5709 last = sg; 5732 last = sg;
5710 last->next = first; 5733 last->next = first;
5711 } 5734 }
5712 sd->groups = groups; 5735 sd->groups = groups;
5713 5736
5714 return 0; 5737 return 0;
5715 5738
5716 fail: 5739 fail:
5717 free_sched_groups(first, 0); 5740 free_sched_groups(first, 0);
5718 5741
5719 return -ENOMEM; 5742 return -ENOMEM;
5720 } 5743 }
5721 5744
5722 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) 5745 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
5723 { 5746 {
5724 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 5747 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
5725 struct sched_domain *child = sd->child; 5748 struct sched_domain *child = sd->child;
5726 5749
5727 if (child) 5750 if (child)
5728 cpu = cpumask_first(sched_domain_span(child)); 5751 cpu = cpumask_first(sched_domain_span(child));
5729 5752
5730 if (sg) { 5753 if (sg) {
5731 *sg = *per_cpu_ptr(sdd->sg, cpu); 5754 *sg = *per_cpu_ptr(sdd->sg, cpu);
5732 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); 5755 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
5733 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ 5756 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
5734 } 5757 }
5735 5758
5736 return cpu; 5759 return cpu;
5737 } 5760 }
5738 5761
5739 /* 5762 /*
5740 * build_sched_groups will build a circular linked list of the groups 5763 * build_sched_groups will build a circular linked list of the groups
5741 * covered by the given span, and will set each group's ->cpumask correctly, 5764 * covered by the given span, and will set each group's ->cpumask correctly,
5742 * and ->cpu_power to 0. 5765 * and ->cpu_power to 0.
5743 * 5766 *
5744 * Assumes the sched_domain tree is fully constructed 5767 * Assumes the sched_domain tree is fully constructed
5745 */ 5768 */
5746 static int 5769 static int
5747 build_sched_groups(struct sched_domain *sd, int cpu) 5770 build_sched_groups(struct sched_domain *sd, int cpu)
5748 { 5771 {
5749 struct sched_group *first = NULL, *last = NULL; 5772 struct sched_group *first = NULL, *last = NULL;
5750 struct sd_data *sdd = sd->private; 5773 struct sd_data *sdd = sd->private;
5751 const struct cpumask *span = sched_domain_span(sd); 5774 const struct cpumask *span = sched_domain_span(sd);
5752 struct cpumask *covered; 5775 struct cpumask *covered;
5753 int i; 5776 int i;
5754 5777
5755 get_group(cpu, sdd, &sd->groups); 5778 get_group(cpu, sdd, &sd->groups);
5756 atomic_inc(&sd->groups->ref); 5779 atomic_inc(&sd->groups->ref);
5757 5780
5758 if (cpu != cpumask_first(span)) 5781 if (cpu != cpumask_first(span))
5759 return 0; 5782 return 0;
5760 5783
5761 lockdep_assert_held(&sched_domains_mutex); 5784 lockdep_assert_held(&sched_domains_mutex);
5762 covered = sched_domains_tmpmask; 5785 covered = sched_domains_tmpmask;
5763 5786
5764 cpumask_clear(covered); 5787 cpumask_clear(covered);
5765 5788
5766 for_each_cpu(i, span) { 5789 for_each_cpu(i, span) {
5767 struct sched_group *sg; 5790 struct sched_group *sg;
5768 int group, j; 5791 int group, j;
5769 5792
5770 if (cpumask_test_cpu(i, covered)) 5793 if (cpumask_test_cpu(i, covered))
5771 continue; 5794 continue;
5772 5795
5773 group = get_group(i, sdd, &sg); 5796 group = get_group(i, sdd, &sg);
5774 cpumask_clear(sched_group_cpus(sg)); 5797 cpumask_clear(sched_group_cpus(sg));
5775 sg->sgp->power = 0; 5798 sg->sgp->power = 0;
5776 cpumask_setall(sched_group_mask(sg)); 5799 cpumask_setall(sched_group_mask(sg));
5777 5800
5778 for_each_cpu(j, span) { 5801 for_each_cpu(j, span) {
5779 if (get_group(j, sdd, NULL) != group) 5802 if (get_group(j, sdd, NULL) != group)
5780 continue; 5803 continue;
5781 5804
5782 cpumask_set_cpu(j, covered); 5805 cpumask_set_cpu(j, covered);
5783 cpumask_set_cpu(j, sched_group_cpus(sg)); 5806 cpumask_set_cpu(j, sched_group_cpus(sg));
5784 } 5807 }
5785 5808
5786 if (!first) 5809 if (!first)
5787 first = sg; 5810 first = sg;
5788 if (last) 5811 if (last)
5789 last->next = sg; 5812 last->next = sg;
5790 last = sg; 5813 last = sg;
5791 } 5814 }
5792 last->next = first; 5815 last->next = first;
5793 5816
5794 return 0; 5817 return 0;
5795 } 5818 }
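/*
 * [Editor's note -- worked example, not part of the kernel source shown
 * in this diff]
 * For an MC-level domain spanning a quad-core (non-SMT) package this
 * produces four single-cpu groups linked into a ring,
 *
 *	{0} -> {1} -> {2} -> {3} -> {0}
 *
 * with each group's power left at 0 until init_sched_groups_power()
 * below fills it in; with SMT, each group would instead cover one
 * core's pair of hardware threads.
 */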
5796 5819
5797 /* 5820 /*
5798 * Initialize sched groups cpu_power. 5821 * Initialize sched groups cpu_power.
5799 * 5822 *
5800 * cpu_power indicates the capacity of a sched group, which is used while 5823 * cpu_power indicates the capacity of a sched group, which is used while
5801 * distributing the load between different sched groups in a sched domain. 5824 * distributing the load between different sched groups in a sched domain.
5802 * Typically cpu_power for all the groups in a sched domain will be the same unless 5825 * Typically cpu_power for all the groups in a sched domain will be the same unless
5803 * there are asymmetries in the topology. If there are asymmetries, the group 5826 * there are asymmetries in the topology. If there are asymmetries, the group
5804 * having more cpu_power will pick up more load compared to the group having 5827 * having more cpu_power will pick up more load compared to the group having
5805 * less cpu_power. 5828 * less cpu_power.
5806 */ 5829 */
5807 static void init_sched_groups_power(int cpu, struct sched_domain *sd) 5830 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5808 { 5831 {
5809 struct sched_group *sg = sd->groups; 5832 struct sched_group *sg = sd->groups;
5810 5833
5811 WARN_ON(!sg); 5834 WARN_ON(!sg);
5812 5835
5813 do { 5836 do {
5814 sg->group_weight = cpumask_weight(sched_group_cpus(sg)); 5837 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
5815 sg = sg->next; 5838 sg = sg->next;
5816 } while (sg != sd->groups); 5839 } while (sg != sd->groups);
5817 5840
5818 if (cpu != group_balance_cpu(sg)) 5841 if (cpu != group_balance_cpu(sg))
5819 return; 5842 return;
5820 5843
5821 update_group_power(sd, cpu); 5844 update_group_power(sd, cpu);
5822 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); 5845 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
5823 } 5846 }
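/*
 * [Editor's note -- worked example, not part of the kernel source shown
 * in this diff]
 * With the default SCHED_POWER_SCALE of 1024 ("one full cpu"), a
 * symmetric four-cpu group starts out at 4 * 1024 = 4096, which is the
 * same bootstrap value build_overlap_sched_groups() uses above.
 * update_group_power() then refines the number, e.g. scaling the two
 * hardware threads of an SMT core down so that together they count as
 * only slightly more than one full cpu.
 */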
5824 5847
5825 int __weak arch_sd_sibling_asym_packing(void) 5848 int __weak arch_sd_sibling_asym_packing(void)
5826 { 5849 {
5827 return 0*SD_ASYM_PACKING; 5850 return 0*SD_ASYM_PACKING;
5828 } 5851 }
5829 5852
5830 /* 5853 /*
5831 * Initializers for schedule domains 5854 * Initializers for schedule domains
5832 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 5855 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
5833 */ 5856 */
5834 5857
5835 #ifdef CONFIG_SCHED_DEBUG 5858 #ifdef CONFIG_SCHED_DEBUG
5836 # define SD_INIT_NAME(sd, type) sd->name = #type 5859 # define SD_INIT_NAME(sd, type) sd->name = #type
5837 #else 5860 #else
5838 # define SD_INIT_NAME(sd, type) do { } while (0) 5861 # define SD_INIT_NAME(sd, type) do { } while (0)
5839 #endif 5862 #endif
5840 5863
5841 #define SD_INIT_FUNC(type) \ 5864 #define SD_INIT_FUNC(type) \
5842 static noinline struct sched_domain * \ 5865 static noinline struct sched_domain * \
5843 sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ 5866 sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
5844 { \ 5867 { \
5845 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ 5868 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
5846 *sd = SD_##type##_INIT; \ 5869 *sd = SD_##type##_INIT; \
5847 SD_INIT_NAME(sd, type); \ 5870 SD_INIT_NAME(sd, type); \
5848 sd->private = &tl->data; \ 5871 sd->private = &tl->data; \
5849 return sd; \ 5872 return sd; \
5850 } 5873 }
5851 5874
5852 SD_INIT_FUNC(CPU) 5875 SD_INIT_FUNC(CPU)
5853 #ifdef CONFIG_SCHED_SMT 5876 #ifdef CONFIG_SCHED_SMT
5854 SD_INIT_FUNC(SIBLING) 5877 SD_INIT_FUNC(SIBLING)
5855 #endif 5878 #endif
5856 #ifdef CONFIG_SCHED_MC 5879 #ifdef CONFIG_SCHED_MC
5857 SD_INIT_FUNC(MC) 5880 SD_INIT_FUNC(MC)
5858 #endif 5881 #endif
5859 #ifdef CONFIG_SCHED_BOOK 5882 #ifdef CONFIG_SCHED_BOOK
5860 SD_INIT_FUNC(BOOK) 5883 SD_INIT_FUNC(BOOK)
5861 #endif 5884 #endif
5862 5885
5863 static int default_relax_domain_level = -1; 5886 static int default_relax_domain_level = -1;
5864 int sched_domain_level_max; 5887 int sched_domain_level_max;
5865 5888
5866 static int __init setup_relax_domain_level(char *str) 5889 static int __init setup_relax_domain_level(char *str)
5867 { 5890 {
5868 if (kstrtoint(str, 0, &default_relax_domain_level)) 5891 if (kstrtoint(str, 0, &default_relax_domain_level))
5869 pr_warn("Unable to set relax_domain_level\n"); 5892 pr_warn("Unable to set relax_domain_level\n");
5870 5893
5871 return 1; 5894 return 1;
5872 } 5895 }
5873 __setup("relax_domain_level=", setup_relax_domain_level); 5896 __setup("relax_domain_level=", setup_relax_domain_level);
5874 5897
5875 static void set_domain_attribute(struct sched_domain *sd, 5898 static void set_domain_attribute(struct sched_domain *sd,
5876 struct sched_domain_attr *attr) 5899 struct sched_domain_attr *attr)
5877 { 5900 {
5878 int request; 5901 int request;
5879 5902
5880 if (!attr || attr->relax_domain_level < 0) { 5903 if (!attr || attr->relax_domain_level < 0) {
5881 if (default_relax_domain_level < 0) 5904 if (default_relax_domain_level < 0)
5882 return; 5905 return;
5883 else 5906 else
5884 request = default_relax_domain_level; 5907 request = default_relax_domain_level;
5885 } else 5908 } else
5886 request = attr->relax_domain_level; 5909 request = attr->relax_domain_level;
5887 if (request < sd->level) { 5910 if (request < sd->level) {
5888 /* turn off idle balance on this domain */ 5911 /* turn off idle balance on this domain */
5889 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 5912 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
5890 } else { 5913 } else {
5891 /* turn on idle balance on this domain */ 5914 /* turn on idle balance on this domain */
5892 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 5915 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
5893 } 5916 }
5894 } 5917 }
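/*
 * [Editor's note -- usage example, not part of the kernel source shown
 * in this diff]
 * Booting with, say:
 *
 *	relax_domain_level=1
 *
 * makes the comparison above clear SD_BALANCE_WAKE and SD_BALANCE_NEWIDLE
 * on every domain whose level is greater than 1, so wake-up and new-idle
 * balancing is only attempted in domain levels 0 and 1 (e.g. SMT and MC);
 * cpusets can request the same per partition through the attr argument.
 */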
5895 5918
5896 static void __sdt_free(const struct cpumask *cpu_map); 5919 static void __sdt_free(const struct cpumask *cpu_map);
5897 static int __sdt_alloc(const struct cpumask *cpu_map); 5920 static int __sdt_alloc(const struct cpumask *cpu_map);
5898 5921
5899 static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 5922 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
5900 const struct cpumask *cpu_map) 5923 const struct cpumask *cpu_map)
5901 { 5924 {
5902 switch (what) { 5925 switch (what) {
5903 case sa_rootdomain: 5926 case sa_rootdomain:
5904 if (!atomic_read(&d->rd->refcount)) 5927 if (!atomic_read(&d->rd->refcount))
5905 free_rootdomain(&d->rd->rcu); /* fall through */ 5928 free_rootdomain(&d->rd->rcu); /* fall through */
5906 case sa_sd: 5929 case sa_sd:
5907 free_percpu(d->sd); /* fall through */ 5930 free_percpu(d->sd); /* fall through */
5908 case sa_sd_storage: 5931 case sa_sd_storage:
5909 __sdt_free(cpu_map); /* fall through */ 5932 __sdt_free(cpu_map); /* fall through */
5910 case sa_none: 5933 case sa_none:
5911 break; 5934 break;
5912 } 5935 }
5913 } 5936 }
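/*
 * [Editor's note -- worked example, not part of the kernel source shown
 * in this diff]
 * enum s_alloc encodes how far __visit_domain_allocation_hell() got, and
 * the intentional fall-through above unwinds exactly that much.  If, for
 * instance, alloc_rootdomain() fails, the caller gets sa_sd back and the
 * cleanup frees the per-cpu d->sd array and the sd_data storage, but
 * never touches the root domain that was never allocated.
 */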
5914 5937
5915 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 5938 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
5916 const struct cpumask *cpu_map) 5939 const struct cpumask *cpu_map)
5917 { 5940 {
5918 memset(d, 0, sizeof(*d)); 5941 memset(d, 0, sizeof(*d));
5919 5942
5920 if (__sdt_alloc(cpu_map)) 5943 if (__sdt_alloc(cpu_map))
5921 return sa_sd_storage; 5944 return sa_sd_storage;
5922 d->sd = alloc_percpu(struct sched_domain *); 5945 d->sd = alloc_percpu(struct sched_domain *);
5923 if (!d->sd) 5946 if (!d->sd)
5924 return sa_sd_storage; 5947 return sa_sd_storage;
5925 d->rd = alloc_rootdomain(); 5948 d->rd = alloc_rootdomain();
5926 if (!d->rd) 5949 if (!d->rd)
5927 return sa_sd; 5950 return sa_sd;
5928 return sa_rootdomain; 5951 return sa_rootdomain;
5929 } 5952 }
5930 5953
5931 /* 5954 /*
5932 * NULL the sd_data elements we've used to build the sched_domain and 5955 * NULL the sd_data elements we've used to build the sched_domain and
5933 * sched_group structure so that the subsequent __free_domain_allocs() 5956 * sched_group structure so that the subsequent __free_domain_allocs()
5934 * will not free the data we're using. 5957 * will not free the data we're using.
5935 */ 5958 */
5936 static void claim_allocations(int cpu, struct sched_domain *sd) 5959 static void claim_allocations(int cpu, struct sched_domain *sd)
5937 { 5960 {
5938 struct sd_data *sdd = sd->private; 5961 struct sd_data *sdd = sd->private;
5939 5962
5940 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 5963 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
5941 *per_cpu_ptr(sdd->sd, cpu) = NULL; 5964 *per_cpu_ptr(sdd->sd, cpu) = NULL;
5942 5965
5943 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 5966 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
5944 *per_cpu_ptr(sdd->sg, cpu) = NULL; 5967 *per_cpu_ptr(sdd->sg, cpu) = NULL;
5945 5968
5946 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) 5969 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
5947 *per_cpu_ptr(sdd->sgp, cpu) = NULL; 5970 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
5948 } 5971 }
5949 5972
5950 #ifdef CONFIG_SCHED_SMT 5973 #ifdef CONFIG_SCHED_SMT
5951 static const struct cpumask *cpu_smt_mask(int cpu) 5974 static const struct cpumask *cpu_smt_mask(int cpu)
5952 { 5975 {
5953 return topology_thread_cpumask(cpu); 5976 return topology_thread_cpumask(cpu);
5954 } 5977 }
5955 #endif 5978 #endif
5956 5979
5957 /* 5980 /*
5958 * Topology list, bottom-up. 5981 * Topology list, bottom-up.
5959 */ 5982 */
5960 static struct sched_domain_topology_level default_topology[] = { 5983 static struct sched_domain_topology_level default_topology[] = {
5961 #ifdef CONFIG_SCHED_SMT 5984 #ifdef CONFIG_SCHED_SMT
5962 { sd_init_SIBLING, cpu_smt_mask, }, 5985 { sd_init_SIBLING, cpu_smt_mask, },
5963 #endif 5986 #endif
5964 #ifdef CONFIG_SCHED_MC 5987 #ifdef CONFIG_SCHED_MC
5965 { sd_init_MC, cpu_coregroup_mask, }, 5988 { sd_init_MC, cpu_coregroup_mask, },
5966 #endif 5989 #endif
5967 #ifdef CONFIG_SCHED_BOOK 5990 #ifdef CONFIG_SCHED_BOOK
5968 { sd_init_BOOK, cpu_book_mask, }, 5991 { sd_init_BOOK, cpu_book_mask, },
5969 #endif 5992 #endif
5970 { sd_init_CPU, cpu_cpu_mask, }, 5993 { sd_init_CPU, cpu_cpu_mask, },
5971 { NULL, }, 5994 { NULL, },
5972 }; 5995 };
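/*
 * [Editor's note -- illustrative sketch, not part of the kernel source
 * shown in this diff]
 * On NUMA hardware, sched_init_numa() further down swaps this table for
 * an extended copy, so conceptually the list becomes something like:
 *
 *	{ sd_init_SIBLING, cpu_smt_mask },
 *	{ sd_init_MC,      cpu_coregroup_mask },
 *	{ sd_init_CPU,     cpu_cpu_mask },
 *	{ sd_numa_init,    sd_numa_mask, SDTL_OVERLAP, .numa_level = 0 },
 *	{ sd_numa_init,    sd_numa_mask, SDTL_OVERLAP, .numa_level = 1 },
 *	{ NULL, },
 *
 * with one sd_numa_init entry per unique inter-node distance.
 */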
5973 5996
5974 static struct sched_domain_topology_level *sched_domain_topology = default_topology; 5997 static struct sched_domain_topology_level *sched_domain_topology = default_topology;
5975 5998
5976 #define for_each_sd_topology(tl) \ 5999 #define for_each_sd_topology(tl) \
5977 for (tl = sched_domain_topology; tl->init; tl++) 6000 for (tl = sched_domain_topology; tl->init; tl++)
5978 6001
5979 #ifdef CONFIG_NUMA 6002 #ifdef CONFIG_NUMA
5980 6003
5981 static int sched_domains_numa_levels; 6004 static int sched_domains_numa_levels;
5982 static int *sched_domains_numa_distance; 6005 static int *sched_domains_numa_distance;
5983 static struct cpumask ***sched_domains_numa_masks; 6006 static struct cpumask ***sched_domains_numa_masks;
5984 static int sched_domains_curr_level; 6007 static int sched_domains_curr_level;
5985 6008
5986 static inline int sd_local_flags(int level) 6009 static inline int sd_local_flags(int level)
5987 { 6010 {
5988 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) 6011 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
5989 return 0; 6012 return 0;
5990 6013
5991 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; 6014 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
5992 } 6015 }
5993 6016
5994 static struct sched_domain * 6017 static struct sched_domain *
5995 sd_numa_init(struct sched_domain_topology_level *tl, int cpu) 6018 sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
5996 { 6019 {
5997 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 6020 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
5998 int level = tl->numa_level; 6021 int level = tl->numa_level;
5999 int sd_weight = cpumask_weight( 6022 int sd_weight = cpumask_weight(
6000 sched_domains_numa_masks[level][cpu_to_node(cpu)]); 6023 sched_domains_numa_masks[level][cpu_to_node(cpu)]);
6001 6024
6002 *sd = (struct sched_domain){ 6025 *sd = (struct sched_domain){
6003 .min_interval = sd_weight, 6026 .min_interval = sd_weight,
6004 .max_interval = 2*sd_weight, 6027 .max_interval = 2*sd_weight,
6005 .busy_factor = 32, 6028 .busy_factor = 32,
6006 .imbalance_pct = 125, 6029 .imbalance_pct = 125,
6007 .cache_nice_tries = 2, 6030 .cache_nice_tries = 2,
6008 .busy_idx = 3, 6031 .busy_idx = 3,
6009 .idle_idx = 2, 6032 .idle_idx = 2,
6010 .newidle_idx = 0, 6033 .newidle_idx = 0,
6011 .wake_idx = 0, 6034 .wake_idx = 0,
6012 .forkexec_idx = 0, 6035 .forkexec_idx = 0,
6013 6036
6014 .flags = 1*SD_LOAD_BALANCE 6037 .flags = 1*SD_LOAD_BALANCE
6015 | 1*SD_BALANCE_NEWIDLE 6038 | 1*SD_BALANCE_NEWIDLE
6016 | 0*SD_BALANCE_EXEC 6039 | 0*SD_BALANCE_EXEC
6017 | 0*SD_BALANCE_FORK 6040 | 0*SD_BALANCE_FORK
6018 | 0*SD_BALANCE_WAKE 6041 | 0*SD_BALANCE_WAKE
6019 | 0*SD_WAKE_AFFINE 6042 | 0*SD_WAKE_AFFINE
6020 | 0*SD_SHARE_CPUPOWER 6043 | 0*SD_SHARE_CPUPOWER
6021 | 0*SD_SHARE_PKG_RESOURCES 6044 | 0*SD_SHARE_PKG_RESOURCES
6022 | 1*SD_SERIALIZE 6045 | 1*SD_SERIALIZE
6023 | 0*SD_PREFER_SIBLING 6046 | 0*SD_PREFER_SIBLING
6024 | 1*SD_NUMA 6047 | 1*SD_NUMA
6025 | sd_local_flags(level) 6048 | sd_local_flags(level)
6026 , 6049 ,
6027 .last_balance = jiffies, 6050 .last_balance = jiffies,
6028 .balance_interval = sd_weight, 6051 .balance_interval = sd_weight,
6029 .max_newidle_lb_cost = 0, 6052 .max_newidle_lb_cost = 0,
6030 .next_decay_max_lb_cost = jiffies, 6053 .next_decay_max_lb_cost = jiffies,
6031 }; 6054 };
6032 SD_INIT_NAME(sd, NUMA); 6055 SD_INIT_NAME(sd, NUMA);
6033 sd->private = &tl->data; 6056 sd->private = &tl->data;
6034 6057
6035 /* 6058 /*
6036 * Ugly hack to pass state to sd_numa_mask()... 6059 * Ugly hack to pass state to sd_numa_mask()...
6037 */ 6060 */
6038 sched_domains_curr_level = tl->numa_level; 6061 sched_domains_curr_level = tl->numa_level;
6039 6062
6040 return sd; 6063 return sd;
6041 } 6064 }
6042 6065
6043 static const struct cpumask *sd_numa_mask(int cpu) 6066 static const struct cpumask *sd_numa_mask(int cpu)
6044 { 6067 {
6045 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 6068 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6046 } 6069 }
6047 6070
6048 static void sched_numa_warn(const char *str) 6071 static void sched_numa_warn(const char *str)
6049 { 6072 {
6050 static int done = false; 6073 static int done = false;
6051 int i,j; 6074 int i,j;
6052 6075
6053 if (done) 6076 if (done)
6054 return; 6077 return;
6055 6078
6056 done = true; 6079 done = true;
6057 6080
6058 printk(KERN_WARNING "ERROR: %s\n\n", str); 6081 printk(KERN_WARNING "ERROR: %s\n\n", str);
6059 6082
6060 for (i = 0; i < nr_node_ids; i++) { 6083 for (i = 0; i < nr_node_ids; i++) {
6061 printk(KERN_WARNING " "); 6084 printk(KERN_WARNING " ");
6062 for (j = 0; j < nr_node_ids; j++) 6085 for (j = 0; j < nr_node_ids; j++)
6063 printk(KERN_CONT "%02d ", node_distance(i,j)); 6086 printk(KERN_CONT "%02d ", node_distance(i,j));
6064 printk(KERN_CONT "\n"); 6087 printk(KERN_CONT "\n");
6065 } 6088 }
6066 printk(KERN_WARNING "\n"); 6089 printk(KERN_WARNING "\n");
6067 } 6090 }
6068 6091
6069 static bool find_numa_distance(int distance) 6092 static bool find_numa_distance(int distance)
6070 { 6093 {
6071 int i; 6094 int i;
6072 6095
6073 if (distance == node_distance(0, 0)) 6096 if (distance == node_distance(0, 0))
6074 return true; 6097 return true;
6075 6098
6076 for (i = 0; i < sched_domains_numa_levels; i++) { 6099 for (i = 0; i < sched_domains_numa_levels; i++) {
6077 if (sched_domains_numa_distance[i] == distance) 6100 if (sched_domains_numa_distance[i] == distance)
6078 return true; 6101 return true;
6079 } 6102 }
6080 6103
6081 return false; 6104 return false;
6082 } 6105 }
6083 6106
6084 static void sched_init_numa(void) 6107 static void sched_init_numa(void)
6085 { 6108 {
6086 int next_distance, curr_distance = node_distance(0, 0); 6109 int next_distance, curr_distance = node_distance(0, 0);
6087 struct sched_domain_topology_level *tl; 6110 struct sched_domain_topology_level *tl;
6088 int level = 0; 6111 int level = 0;
6089 int i, j, k; 6112 int i, j, k;
6090 6113
6091 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); 6114 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6092 if (!sched_domains_numa_distance) 6115 if (!sched_domains_numa_distance)
6093 return; 6116 return;
6094 6117
6095 /* 6118 /*
6096 * O(nr_nodes^2) deduplicating selection sort -- in order to find the 6119 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
6097 * unique distances in the node_distance() table. 6120 * unique distances in the node_distance() table.
6098 * 6121 *
6099 * Assumes node_distance(0,j) includes all distances in 6122 * Assumes node_distance(0,j) includes all distances in
6100 * node_distance(i,j) in order to avoid cubic time. 6123 * node_distance(i,j) in order to avoid cubic time.
6101 */ 6124 */
6102 next_distance = curr_distance; 6125 next_distance = curr_distance;
6103 for (i = 0; i < nr_node_ids; i++) { 6126 for (i = 0; i < nr_node_ids; i++) {
6104 for (j = 0; j < nr_node_ids; j++) { 6127 for (j = 0; j < nr_node_ids; j++) {
6105 for (k = 0; k < nr_node_ids; k++) { 6128 for (k = 0; k < nr_node_ids; k++) {
6106 int distance = node_distance(i, k); 6129 int distance = node_distance(i, k);
6107 6130
6108 if (distance > curr_distance && 6131 if (distance > curr_distance &&
6109 (distance < next_distance || 6132 (distance < next_distance ||
6110 next_distance == curr_distance)) 6133 next_distance == curr_distance))
6111 next_distance = distance; 6134 next_distance = distance;
6112 6135
6113 /* 6136 /*
6114 * While not a strong assumption, it would be nice to know 6137 * While not a strong assumption, it would be nice to know
6115 * about cases where node A is connected to B but B is not 6138 * about cases where node A is connected to B but B is not
6116 * equally connected to A. 6139 * equally connected to A.
6117 */ 6140 */
6118 if (sched_debug() && node_distance(k, i) != distance) 6141 if (sched_debug() && node_distance(k, i) != distance)
6119 sched_numa_warn("Node-distance not symmetric"); 6142 sched_numa_warn("Node-distance not symmetric");
6120 6143
6121 if (sched_debug() && i && !find_numa_distance(distance)) 6144 if (sched_debug() && i && !find_numa_distance(distance))
6122 sched_numa_warn("Node-0 not representative"); 6145 sched_numa_warn("Node-0 not representative");
6123 } 6146 }
6124 if (next_distance != curr_distance) { 6147 if (next_distance != curr_distance) {
6125 sched_domains_numa_distance[level++] = next_distance; 6148 sched_domains_numa_distance[level++] = next_distance;
6126 sched_domains_numa_levels = level; 6149 sched_domains_numa_levels = level;
6127 curr_distance = next_distance; 6150 curr_distance = next_distance;
6128 } else break; 6151 } else break;
6129 } 6152 }
6130 6153
6131 /* 6154 /*
6132 * In case of sched_debug() we verify the above assumption. 6155 * In case of sched_debug() we verify the above assumption.
6133 */ 6156 */
6134 if (!sched_debug()) 6157 if (!sched_debug())
6135 break; 6158 break;
6136 } 6159 }
6137 /* 6160 /*
6138 * 'level' contains the number of unique distances, excluding the 6161 * 'level' contains the number of unique distances, excluding the
6139 * identity distance node_distance(i,i). 6162 * identity distance node_distance(i,i).
6140 * 6163 *
6141 * The sched_domains_numa_distance[] array includes the actual distance 6164 * The sched_domains_numa_distance[] array includes the actual distance
6142 * numbers. 6165 * numbers.
6143 */ 6166 */
6144 6167
6145 /* 6168 /*
6146 * Here, we should temporarily reset sched_domains_numa_levels to 0. 6169 * Here, we should temporarily reset sched_domains_numa_levels to 0.
6147 * If it fails to allocate memory for array sched_domains_numa_masks[][], 6170 * If it fails to allocate memory for array sched_domains_numa_masks[][],
6148 * the array will contain less than 'level' members. This could be 6171 * the array will contain less than 'level' members. This could be
6149 * dangerous when we use it to iterate array sched_domains_numa_masks[][] 6172 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
6150 * in other functions. 6173 * in other functions.
6151 * 6174 *
6152 * We reset it to 'level' at the end of this function. 6175 * We reset it to 'level' at the end of this function.
6153 */ 6176 */
6154 sched_domains_numa_levels = 0; 6177 sched_domains_numa_levels = 0;
6155 6178
6156 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); 6179 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6157 if (!sched_domains_numa_masks) 6180 if (!sched_domains_numa_masks)
6158 return; 6181 return;
6159 6182
6160 /* 6183 /*
6161 * Now for each level, construct a mask per node which contains all 6184 * Now for each level, construct a mask per node which contains all
6162 * cpus of nodes that are that many hops away from us. 6185 * cpus of nodes that are that many hops away from us.
6163 */ 6186 */
6164 for (i = 0; i < level; i++) { 6187 for (i = 0; i < level; i++) {
6165 sched_domains_numa_masks[i] = 6188 sched_domains_numa_masks[i] =
6166 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); 6189 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6167 if (!sched_domains_numa_masks[i]) 6190 if (!sched_domains_numa_masks[i])
6168 return; 6191 return;
6169 6192
6170 for (j = 0; j < nr_node_ids; j++) { 6193 for (j = 0; j < nr_node_ids; j++) {
6171 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); 6194 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6172 if (!mask) 6195 if (!mask)
6173 return; 6196 return;
6174 6197
6175 sched_domains_numa_masks[i][j] = mask; 6198 sched_domains_numa_masks[i][j] = mask;
6176 6199
6177 for (k = 0; k < nr_node_ids; k++) { 6200 for (k = 0; k < nr_node_ids; k++) {
6178 if (node_distance(j, k) > sched_domains_numa_distance[i]) 6201 if (node_distance(j, k) > sched_domains_numa_distance[i])
6179 continue; 6202 continue;
6180 6203
6181 cpumask_or(mask, mask, cpumask_of_node(k)); 6204 cpumask_or(mask, mask, cpumask_of_node(k));
6182 } 6205 }
6183 } 6206 }
6184 } 6207 }
6185 6208
6186 tl = kzalloc((ARRAY_SIZE(default_topology) + level) * 6209 tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
6187 sizeof(struct sched_domain_topology_level), GFP_KERNEL); 6210 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6188 if (!tl) 6211 if (!tl)
6189 return; 6212 return;
6190 6213
6191 /* 6214 /*
6192 * Copy the default topology bits.. 6215 * Copy the default topology bits..
6193 */ 6216 */
6194 for (i = 0; default_topology[i].init; i++) 6217 for (i = 0; default_topology[i].init; i++)
6195 tl[i] = default_topology[i]; 6218 tl[i] = default_topology[i];
6196 6219
6197 /* 6220 /*
6198 * .. and append 'j' levels of NUMA goodness. 6221 * .. and append 'j' levels of NUMA goodness.
6199 */ 6222 */
6200 for (j = 0; j < level; i++, j++) { 6223 for (j = 0; j < level; i++, j++) {
6201 tl[i] = (struct sched_domain_topology_level){ 6224 tl[i] = (struct sched_domain_topology_level){
6202 .init = sd_numa_init, 6225 .init = sd_numa_init,
6203 .mask = sd_numa_mask, 6226 .mask = sd_numa_mask,
6204 .flags = SDTL_OVERLAP, 6227 .flags = SDTL_OVERLAP,
6205 .numa_level = j, 6228 .numa_level = j,
6206 }; 6229 };
6207 } 6230 }
6208 6231
6209 sched_domain_topology = tl; 6232 sched_domain_topology = tl;
6210 6233
6211 sched_domains_numa_levels = level; 6234 sched_domains_numa_levels = level;
6212 } 6235 }
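/*
 * [Editor's note -- worked example, not part of the kernel source shown
 * in this diff]
 * For a hypothetical four-node box whose node_distance() table only
 * contains the values 10 (local), 20 and 30:
 *
 *	10 20 20 30
 *	20 10 20 30
 *	20 20 10 30
 *	30 30 30 10
 *
 * the deduplicating pass ends with level = 2 and
 * sched_domains_numa_distance[] = { 20, 30 }, so two NUMA topology
 * levels are appended: at level 0 each node's mask covers the cpus of
 * all nodes within distance 20 of it, and at level 1 the mask covers
 * the whole machine.
 */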
6213 6236
6214 static void sched_domains_numa_masks_set(int cpu) 6237 static void sched_domains_numa_masks_set(int cpu)
6215 { 6238 {
6216 int i, j; 6239 int i, j;
6217 int node = cpu_to_node(cpu); 6240 int node = cpu_to_node(cpu);
6218 6241
6219 for (i = 0; i < sched_domains_numa_levels; i++) { 6242 for (i = 0; i < sched_domains_numa_levels; i++) {
6220 for (j = 0; j < nr_node_ids; j++) { 6243 for (j = 0; j < nr_node_ids; j++) {
6221 if (node_distance(j, node) <= sched_domains_numa_distance[i]) 6244 if (node_distance(j, node) <= sched_domains_numa_distance[i])
6222 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); 6245 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
6223 } 6246 }
6224 } 6247 }
6225 } 6248 }
6226 6249
6227 static void sched_domains_numa_masks_clear(int cpu) 6250 static void sched_domains_numa_masks_clear(int cpu)
6228 { 6251 {
6229 int i, j; 6252 int i, j;
6230 for (i = 0; i < sched_domains_numa_levels; i++) { 6253 for (i = 0; i < sched_domains_numa_levels; i++) {
6231 for (j = 0; j < nr_node_ids; j++) 6254 for (j = 0; j < nr_node_ids; j++)
6232 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]); 6255 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
6233 } 6256 }
6234 } 6257 }
6235 6258
6236 /* 6259 /*
6237 * Update sched_domains_numa_masks[level][node] array when new cpus 6260 * Update sched_domains_numa_masks[level][node] array when new cpus
6238 * are onlined. 6261 * are onlined.
6239 */ 6262 */
6240 static int sched_domains_numa_masks_update(struct notifier_block *nfb, 6263 static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6241 unsigned long action, 6264 unsigned long action,
6242 void *hcpu) 6265 void *hcpu)
6243 { 6266 {
6244 int cpu = (long)hcpu; 6267 int cpu = (long)hcpu;
6245 6268
6246 switch (action & ~CPU_TASKS_FROZEN) { 6269 switch (action & ~CPU_TASKS_FROZEN) {
6247 case CPU_ONLINE: 6270 case CPU_ONLINE:
6248 sched_domains_numa_masks_set(cpu); 6271 sched_domains_numa_masks_set(cpu);
6249 break; 6272 break;
6250 6273
6251 case CPU_DEAD: 6274 case CPU_DEAD:
6252 sched_domains_numa_masks_clear(cpu); 6275 sched_domains_numa_masks_clear(cpu);
6253 break; 6276 break;
6254 6277
6255 default: 6278 default:
6256 return NOTIFY_DONE; 6279 return NOTIFY_DONE;
6257 } 6280 }
6258 6281
6259 return NOTIFY_OK; 6282 return NOTIFY_OK;
6260 } 6283 }
6261 #else 6284 #else
6262 static inline void sched_init_numa(void) 6285 static inline void sched_init_numa(void)
6263 { 6286 {
6264 } 6287 }
6265 6288
6266 static int sched_domains_numa_masks_update(struct notifier_block *nfb, 6289 static int sched_domains_numa_masks_update(struct notifier_block *nfb,
6267 unsigned long action, 6290 unsigned long action,
6268 void *hcpu) 6291 void *hcpu)
6269 { 6292 {
6270 return 0; 6293 return 0;
6271 } 6294 }
6272 #endif /* CONFIG_NUMA */ 6295 #endif /* CONFIG_NUMA */
6273 6296
6274 static int __sdt_alloc(const struct cpumask *cpu_map) 6297 static int __sdt_alloc(const struct cpumask *cpu_map)
6275 { 6298 {
6276 struct sched_domain_topology_level *tl; 6299 struct sched_domain_topology_level *tl;
6277 int j; 6300 int j;
6278 6301
6279 for_each_sd_topology(tl) { 6302 for_each_sd_topology(tl) {
6280 struct sd_data *sdd = &tl->data; 6303 struct sd_data *sdd = &tl->data;
6281 6304
6282 sdd->sd = alloc_percpu(struct sched_domain *); 6305 sdd->sd = alloc_percpu(struct sched_domain *);
6283 if (!sdd->sd) 6306 if (!sdd->sd)
6284 return -ENOMEM; 6307 return -ENOMEM;
6285 6308
6286 sdd->sg = alloc_percpu(struct sched_group *); 6309 sdd->sg = alloc_percpu(struct sched_group *);
6287 if (!sdd->sg) 6310 if (!sdd->sg)
6288 return -ENOMEM; 6311 return -ENOMEM;
6289 6312
6290 sdd->sgp = alloc_percpu(struct sched_group_power *); 6313 sdd->sgp = alloc_percpu(struct sched_group_power *);
6291 if (!sdd->sgp) 6314 if (!sdd->sgp)
6292 return -ENOMEM; 6315 return -ENOMEM;
6293 6316
6294 for_each_cpu(j, cpu_map) { 6317 for_each_cpu(j, cpu_map) {
6295 struct sched_domain *sd; 6318 struct sched_domain *sd;
6296 struct sched_group *sg; 6319 struct sched_group *sg;
6297 struct sched_group_power *sgp; 6320 struct sched_group_power *sgp;
6298 6321
6299 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), 6322 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
6300 GFP_KERNEL, cpu_to_node(j)); 6323 GFP_KERNEL, cpu_to_node(j));
6301 if (!sd) 6324 if (!sd)
6302 return -ENOMEM; 6325 return -ENOMEM;
6303 6326
6304 *per_cpu_ptr(sdd->sd, j) = sd; 6327 *per_cpu_ptr(sdd->sd, j) = sd;
6305 6328
6306 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6329 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6307 GFP_KERNEL, cpu_to_node(j)); 6330 GFP_KERNEL, cpu_to_node(j));
6308 if (!sg) 6331 if (!sg)
6309 return -ENOMEM; 6332 return -ENOMEM;
6310 6333
6311 sg->next = sg; 6334 sg->next = sg;
6312 6335
6313 *per_cpu_ptr(sdd->sg, j) = sg; 6336 *per_cpu_ptr(sdd->sg, j) = sg;
6314 6337
6315 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), 6338 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
6316 GFP_KERNEL, cpu_to_node(j)); 6339 GFP_KERNEL, cpu_to_node(j));
6317 if (!sgp) 6340 if (!sgp)
6318 return -ENOMEM; 6341 return -ENOMEM;
6319 6342
6320 *per_cpu_ptr(sdd->sgp, j) = sgp; 6343 *per_cpu_ptr(sdd->sgp, j) = sgp;
6321 } 6344 }
6322 } 6345 }
6323 6346
6324 return 0; 6347 return 0;
6325 } 6348 }
6326 6349
6327 static void __sdt_free(const struct cpumask *cpu_map) 6350 static void __sdt_free(const struct cpumask *cpu_map)
6328 { 6351 {
6329 struct sched_domain_topology_level *tl; 6352 struct sched_domain_topology_level *tl;
6330 int j; 6353 int j;
6331 6354
6332 for_each_sd_topology(tl) { 6355 for_each_sd_topology(tl) {
6333 struct sd_data *sdd = &tl->data; 6356 struct sd_data *sdd = &tl->data;
6334 6357
6335 for_each_cpu(j, cpu_map) { 6358 for_each_cpu(j, cpu_map) {
6336 struct sched_domain *sd; 6359 struct sched_domain *sd;
6337 6360
6338 if (sdd->sd) { 6361 if (sdd->sd) {
6339 sd = *per_cpu_ptr(sdd->sd, j); 6362 sd = *per_cpu_ptr(sdd->sd, j);
6340 if (sd && (sd->flags & SD_OVERLAP)) 6363 if (sd && (sd->flags & SD_OVERLAP))
6341 free_sched_groups(sd->groups, 0); 6364 free_sched_groups(sd->groups, 0);
6342 kfree(*per_cpu_ptr(sdd->sd, j)); 6365 kfree(*per_cpu_ptr(sdd->sd, j));
6343 } 6366 }
6344 6367
6345 if (sdd->sg) 6368 if (sdd->sg)
6346 kfree(*per_cpu_ptr(sdd->sg, j)); 6369 kfree(*per_cpu_ptr(sdd->sg, j));
6347 if (sdd->sgp) 6370 if (sdd->sgp)
6348 kfree(*per_cpu_ptr(sdd->sgp, j)); 6371 kfree(*per_cpu_ptr(sdd->sgp, j));
6349 } 6372 }
6350 free_percpu(sdd->sd); 6373 free_percpu(sdd->sd);
6351 sdd->sd = NULL; 6374 sdd->sd = NULL;
6352 free_percpu(sdd->sg); 6375 free_percpu(sdd->sg);
6353 sdd->sg = NULL; 6376 sdd->sg = NULL;
6354 free_percpu(sdd->sgp); 6377 free_percpu(sdd->sgp);
6355 sdd->sgp = NULL; 6378 sdd->sgp = NULL;
6356 } 6379 }
6357 } 6380 }
6358 6381
6359 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, 6382 struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6360 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6383 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6361 struct sched_domain *child, int cpu) 6384 struct sched_domain *child, int cpu)
6362 { 6385 {
6363 struct sched_domain *sd = tl->init(tl, cpu); 6386 struct sched_domain *sd = tl->init(tl, cpu);
6364 if (!sd) 6387 if (!sd)
6365 return child; 6388 return child;
6366 6389
6367 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 6390 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6368 if (child) { 6391 if (child) {
6369 sd->level = child->level + 1; 6392 sd->level = child->level + 1;
6370 sched_domain_level_max = max(sched_domain_level_max, sd->level); 6393 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6371 child->parent = sd; 6394 child->parent = sd;
6372 sd->child = child; 6395 sd->child = child;
6373 } 6396 }
6374 set_domain_attribute(sd, attr); 6397 set_domain_attribute(sd, attr);
6375 6398
6376 return sd; 6399 return sd;
6377 } 6400 }
6378 6401
6379 /* 6402 /*
6380 * Build sched domains for a given set of cpus and attach the sched domains 6403 * Build sched domains for a given set of cpus and attach the sched domains
6381 * to the individual cpus 6404 * to the individual cpus
6382 */ 6405 */
6383 static int build_sched_domains(const struct cpumask *cpu_map, 6406 static int build_sched_domains(const struct cpumask *cpu_map,
6384 struct sched_domain_attr *attr) 6407 struct sched_domain_attr *attr)
6385 { 6408 {
6386 enum s_alloc alloc_state; 6409 enum s_alloc alloc_state;
6387 struct sched_domain *sd; 6410 struct sched_domain *sd;
6388 struct s_data d; 6411 struct s_data d;
6389 int i, ret = -ENOMEM; 6412 int i, ret = -ENOMEM;
6390 6413
6391 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 6414 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
6392 if (alloc_state != sa_rootdomain) 6415 if (alloc_state != sa_rootdomain)
6393 goto error; 6416 goto error;
6394 6417
6395 /* Set up domains for cpus specified by the cpu_map. */ 6418 /* Set up domains for cpus specified by the cpu_map. */
6396 for_each_cpu(i, cpu_map) { 6419 for_each_cpu(i, cpu_map) {
6397 struct sched_domain_topology_level *tl; 6420 struct sched_domain_topology_level *tl;
6398 6421
6399 sd = NULL; 6422 sd = NULL;
6400 for_each_sd_topology(tl) { 6423 for_each_sd_topology(tl) {
6401 sd = build_sched_domain(tl, cpu_map, attr, sd, i); 6424 sd = build_sched_domain(tl, cpu_map, attr, sd, i);
6402 if (tl == sched_domain_topology) 6425 if (tl == sched_domain_topology)
6403 *per_cpu_ptr(d.sd, i) = sd; 6426 *per_cpu_ptr(d.sd, i) = sd;
6404 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) 6427 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
6405 sd->flags |= SD_OVERLAP; 6428 sd->flags |= SD_OVERLAP;
6406 if (cpumask_equal(cpu_map, sched_domain_span(sd))) 6429 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
6407 break; 6430 break;
6408 } 6431 }
6409 } 6432 }
6410 6433
6411 /* Build the groups for the domains */ 6434 /* Build the groups for the domains */
6412 for_each_cpu(i, cpu_map) { 6435 for_each_cpu(i, cpu_map) {
6413 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6436 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6414 sd->span_weight = cpumask_weight(sched_domain_span(sd)); 6437 sd->span_weight = cpumask_weight(sched_domain_span(sd));
6415 if (sd->flags & SD_OVERLAP) { 6438 if (sd->flags & SD_OVERLAP) {
6416 if (build_overlap_sched_groups(sd, i)) 6439 if (build_overlap_sched_groups(sd, i))
6417 goto error; 6440 goto error;
6418 } else { 6441 } else {
6419 if (build_sched_groups(sd, i)) 6442 if (build_sched_groups(sd, i))
6420 goto error; 6443 goto error;
6421 } 6444 }
6422 } 6445 }
6423 } 6446 }
6424 6447
6425 /* Calculate CPU power for physical packages and nodes */ 6448 /* Calculate CPU power for physical packages and nodes */
6426 for (i = nr_cpumask_bits-1; i >= 0; i--) { 6449 for (i = nr_cpumask_bits-1; i >= 0; i--) {
6427 if (!cpumask_test_cpu(i, cpu_map)) 6450 if (!cpumask_test_cpu(i, cpu_map))
6428 continue; 6451 continue;
6429 6452
6430 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { 6453 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
6431 claim_allocations(i, sd); 6454 claim_allocations(i, sd);
6432 init_sched_groups_power(i, sd); 6455 init_sched_groups_power(i, sd);
6433 } 6456 }
6434 } 6457 }
6435 6458
6436 /* Attach the domains */ 6459 /* Attach the domains */
6437 rcu_read_lock(); 6460 rcu_read_lock();
6438 for_each_cpu(i, cpu_map) { 6461 for_each_cpu(i, cpu_map) {
6439 sd = *per_cpu_ptr(d.sd, i); 6462 sd = *per_cpu_ptr(d.sd, i);
6440 cpu_attach_domain(sd, d.rd, i); 6463 cpu_attach_domain(sd, d.rd, i);
6441 } 6464 }
6442 rcu_read_unlock(); 6465 rcu_read_unlock();
6443 6466
6444 ret = 0; 6467 ret = 0;
6445 error: 6468 error:
6446 __free_domain_allocs(&d, alloc_state, cpu_map); 6469 __free_domain_allocs(&d, alloc_state, cpu_map);
6447 return ret; 6470 return ret;
6448 } 6471 }
6449 6472
6450 static cpumask_var_t *doms_cur; /* current sched domains */ 6473 static cpumask_var_t *doms_cur; /* current sched domains */
6451 static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 6474 static int ndoms_cur; /* number of sched domains in 'doms_cur' */
6452 static struct sched_domain_attr *dattr_cur; 6475 static struct sched_domain_attr *dattr_cur;
6453 /* attributes of custom domains in 'doms_cur' */ 6476 /* attributes of custom domains in 'doms_cur' */
6454 6477
6455 /* 6478 /*
6456 * Special case: If a kmalloc of a doms_cur partition (array of 6479 * Special case: If a kmalloc of a doms_cur partition (array of
6457 * cpumask) fails, then fallback to a single sched domain, 6480 * cpumask) fails, then fallback to a single sched domain,
6458 * as determined by the single cpumask fallback_doms. 6481 * as determined by the single cpumask fallback_doms.
6459 */ 6482 */
6460 static cpumask_var_t fallback_doms; 6483 static cpumask_var_t fallback_doms;
6461 6484
6462 /* 6485 /*
6463 * arch_update_cpu_topology lets virtualized architectures update the 6486 * arch_update_cpu_topology lets virtualized architectures update the
6464 * cpu core maps. It is supposed to return 1 if the topology changed 6487 * cpu core maps. It is supposed to return 1 if the topology changed
6465 * or 0 if it stayed the same. 6488 * or 0 if it stayed the same.
6466 */ 6489 */
6467 int __weak arch_update_cpu_topology(void) 6490 int __weak arch_update_cpu_topology(void)
6468 { 6491 {
6469 return 0; 6492 return 0;
6470 } 6493 }
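/*
 * Editor's sketch (not part of the kernel source): how an architecture might
 * override the __weak default above. The my_topology_changed flag is purely
 * illustrative; a real implementation derives this from its own
 * topology-change notifications.
 */
static bool my_topology_changed;	/* assumed to be set by arch topology code */

int arch_update_cpu_topology(void)
{
	bool changed = my_topology_changed;

	my_topology_changed = false;
	return changed ? 1 : 0;		/* 1: core maps changed, 0: unchanged */
}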
6471 6494
6472 cpumask_var_t *alloc_sched_domains(unsigned int ndoms) 6495 cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
6473 { 6496 {
6474 int i; 6497 int i;
6475 cpumask_var_t *doms; 6498 cpumask_var_t *doms;
6476 6499
6477 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); 6500 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
6478 if (!doms) 6501 if (!doms)
6479 return NULL; 6502 return NULL;
6480 for (i = 0; i < ndoms; i++) { 6503 for (i = 0; i < ndoms; i++) {
6481 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { 6504 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
6482 free_sched_domains(doms, i); 6505 free_sched_domains(doms, i);
6483 return NULL; 6506 return NULL;
6484 } 6507 }
6485 } 6508 }
6486 return doms; 6509 return doms;
6487 } 6510 }
6488 6511
6489 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) 6512 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
6490 { 6513 {
6491 unsigned int i; 6514 unsigned int i;
6492 for (i = 0; i < ndoms; i++) 6515 for (i = 0; i < ndoms; i++)
6493 free_cpumask_var(doms[i]); 6516 free_cpumask_var(doms[i]);
6494 kfree(doms); 6517 kfree(doms);
6495 } 6518 }
6496 6519
6497 /* 6520 /*
6498 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6521 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6499 * For now this just excludes isolated cpus, but could be used to 6522 * For now this just excludes isolated cpus, but could be used to
6500 * exclude other special cases in the future. 6523 * exclude other special cases in the future.
6501 */ 6524 */
6502 static int init_sched_domains(const struct cpumask *cpu_map) 6525 static int init_sched_domains(const struct cpumask *cpu_map)
6503 { 6526 {
6504 int err; 6527 int err;
6505 6528
6506 arch_update_cpu_topology(); 6529 arch_update_cpu_topology();
6507 ndoms_cur = 1; 6530 ndoms_cur = 1;
6508 doms_cur = alloc_sched_domains(ndoms_cur); 6531 doms_cur = alloc_sched_domains(ndoms_cur);
6509 if (!doms_cur) 6532 if (!doms_cur)
6510 doms_cur = &fallback_doms; 6533 doms_cur = &fallback_doms;
6511 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 6534 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6512 err = build_sched_domains(doms_cur[0], NULL); 6535 err = build_sched_domains(doms_cur[0], NULL);
6513 register_sched_domain_sysctl(); 6536 register_sched_domain_sysctl();
6514 6537
6515 return err; 6538 return err;
6516 } 6539 }
6517 6540
6518 /* 6541 /*
6519 * Detach sched domains from a group of cpus specified in cpu_map 6542 * Detach sched domains from a group of cpus specified in cpu_map
6520 * These cpus will now be attached to the NULL domain 6543 * These cpus will now be attached to the NULL domain
6521 */ 6544 */
6522 static void detach_destroy_domains(const struct cpumask *cpu_map) 6545 static void detach_destroy_domains(const struct cpumask *cpu_map)
6523 { 6546 {
6524 int i; 6547 int i;
6525 6548
6526 rcu_read_lock(); 6549 rcu_read_lock();
6527 for_each_cpu(i, cpu_map) 6550 for_each_cpu(i, cpu_map)
6528 cpu_attach_domain(NULL, &def_root_domain, i); 6551 cpu_attach_domain(NULL, &def_root_domain, i);
6529 rcu_read_unlock(); 6552 rcu_read_unlock();
6530 } 6553 }
6531 6554
6532 /* handle null as "default" */ 6555 /* handle null as "default" */
6533 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 6556 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
6534 struct sched_domain_attr *new, int idx_new) 6557 struct sched_domain_attr *new, int idx_new)
6535 { 6558 {
6536 struct sched_domain_attr tmp; 6559 struct sched_domain_attr tmp;
6537 6560
6538 /* fast path */ 6561 /* fast path */
6539 if (!new && !cur) 6562 if (!new && !cur)
6540 return 1; 6563 return 1;
6541 6564
6542 tmp = SD_ATTR_INIT; 6565 tmp = SD_ATTR_INIT;
6543 return !memcmp(cur ? (cur + idx_cur) : &tmp, 6566 return !memcmp(cur ? (cur + idx_cur) : &tmp,
6544 new ? (new + idx_new) : &tmp, 6567 new ? (new + idx_new) : &tmp,
6545 sizeof(struct sched_domain_attr)); 6568 sizeof(struct sched_domain_attr));
6546 } 6569 }
6547 6570
6548 /* 6571 /*
6549 * Partition sched domains as specified by the 'ndoms_new' 6572 * Partition sched domains as specified by the 'ndoms_new'
6550 * cpumasks in the array doms_new[]. This compares 6573 * cpumasks in the array doms_new[]. This compares
6551 * doms_new[] to the current sched domain partitioning, doms_cur[]. 6574 * doms_new[] to the current sched domain partitioning, doms_cur[].
6552 * It destroys each deleted domain and builds each new domain. 6575 * It destroys each deleted domain and builds each new domain.
6553 * 6576 *
6554 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. 6577 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
6555 * The masks don't intersect (don't overlap). We should set up one 6578 * The masks don't intersect (don't overlap). We should set up one
6556 * sched domain for each mask. CPUs not in any of the cpumasks will 6579 * sched domain for each mask. CPUs not in any of the cpumasks will
6557 * not be load balanced. If the same cpumask appears both in the 6580 * not be load balanced. If the same cpumask appears both in the
6558 * current 'doms_cur' domains and in the new 'doms_new', we can leave 6581 * current 'doms_cur' domains and in the new 'doms_new', we can leave
6559 * it as it is. 6582 * it as it is.
6560 * 6583 *
6561 * The passed in 'doms_new' should be allocated using 6584 * The passed in 'doms_new' should be allocated using
6562 * alloc_sched_domains. This routine takes ownership of it and will 6585 * alloc_sched_domains. This routine takes ownership of it and will
6563 * free_sched_domains it when done with it. If the caller failed the 6586 * free_sched_domains it when done with it. If the caller failed the
6564 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, 6587 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
6565 * and partition_sched_domains() will fall back to the single partition 6588 * and partition_sched_domains() will fall back to the single partition
6566 * 'fallback_doms'; it also forces the domains to be rebuilt. 6589 * 'fallback_doms'; it also forces the domains to be rebuilt.
6567 * 6590 *
6568 * If doms_new == NULL it will be replaced with cpu_online_mask. 6591 * If doms_new == NULL it will be replaced with cpu_online_mask.
6569 * ndoms_new == 0 is a special case for destroying existing domains, 6592 * ndoms_new == 0 is a special case for destroying existing domains,
6570 * and it will not create the default domain. 6593 * and it will not create the default domain.
6571 * 6594 *
6572 * Call with hotplug lock held 6595 * Call with hotplug lock held
6573 */ 6596 */
6574 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 6597 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
6575 struct sched_domain_attr *dattr_new) 6598 struct sched_domain_attr *dattr_new)
6576 { 6599 {
6577 int i, j, n; 6600 int i, j, n;
6578 int new_topology; 6601 int new_topology;
6579 6602
6580 mutex_lock(&sched_domains_mutex); 6603 mutex_lock(&sched_domains_mutex);
6581 6604
6582 /* always unregister in case we don't destroy any domains */ 6605 /* always unregister in case we don't destroy any domains */
6583 unregister_sched_domain_sysctl(); 6606 unregister_sched_domain_sysctl();
6584 6607
6585 /* Let architecture update cpu core mappings. */ 6608 /* Let architecture update cpu core mappings. */
6586 new_topology = arch_update_cpu_topology(); 6609 new_topology = arch_update_cpu_topology();
6587 6610
6588 n = doms_new ? ndoms_new : 0; 6611 n = doms_new ? ndoms_new : 0;
6589 6612
6590 /* Destroy deleted domains */ 6613 /* Destroy deleted domains */
6591 for (i = 0; i < ndoms_cur; i++) { 6614 for (i = 0; i < ndoms_cur; i++) {
6592 for (j = 0; j < n && !new_topology; j++) { 6615 for (j = 0; j < n && !new_topology; j++) {
6593 if (cpumask_equal(doms_cur[i], doms_new[j]) 6616 if (cpumask_equal(doms_cur[i], doms_new[j])
6594 && dattrs_equal(dattr_cur, i, dattr_new, j)) 6617 && dattrs_equal(dattr_cur, i, dattr_new, j))
6595 goto match1; 6618 goto match1;
6596 } 6619 }
6597 /* no match - a current sched domain not in new doms_new[] */ 6620 /* no match - a current sched domain not in new doms_new[] */
6598 detach_destroy_domains(doms_cur[i]); 6621 detach_destroy_domains(doms_cur[i]);
6599 match1: 6622 match1:
6600 ; 6623 ;
6601 } 6624 }
6602 6625
6603 n = ndoms_cur; 6626 n = ndoms_cur;
6604 if (doms_new == NULL) { 6627 if (doms_new == NULL) {
6605 n = 0; 6628 n = 0;
6606 doms_new = &fallback_doms; 6629 doms_new = &fallback_doms;
6607 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 6630 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
6608 WARN_ON_ONCE(dattr_new); 6631 WARN_ON_ONCE(dattr_new);
6609 } 6632 }
6610 6633
6611 /* Build new domains */ 6634 /* Build new domains */
6612 for (i = 0; i < ndoms_new; i++) { 6635 for (i = 0; i < ndoms_new; i++) {
6613 for (j = 0; j < n && !new_topology; j++) { 6636 for (j = 0; j < n && !new_topology; j++) {
6614 if (cpumask_equal(doms_new[i], doms_cur[j]) 6637 if (cpumask_equal(doms_new[i], doms_cur[j])
6615 && dattrs_equal(dattr_new, i, dattr_cur, j)) 6638 && dattrs_equal(dattr_new, i, dattr_cur, j))
6616 goto match2; 6639 goto match2;
6617 } 6640 }
6618 /* no match - add a new doms_new */ 6641 /* no match - add a new doms_new */
6619 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); 6642 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
6620 match2: 6643 match2:
6621 ; 6644 ;
6622 } 6645 }
6623 6646
6624 /* Remember the new sched domains */ 6647 /* Remember the new sched domains */
6625 if (doms_cur != &fallback_doms) 6648 if (doms_cur != &fallback_doms)
6626 free_sched_domains(doms_cur, ndoms_cur); 6649 free_sched_domains(doms_cur, ndoms_cur);
6627 kfree(dattr_cur); /* kfree(NULL) is safe */ 6650 kfree(dattr_cur); /* kfree(NULL) is safe */
6628 doms_cur = doms_new; 6651 doms_cur = doms_new;
6629 dattr_cur = dattr_new; 6652 dattr_cur = dattr_new;
6630 ndoms_cur = ndoms_new; 6653 ndoms_cur = ndoms_new;
6631 6654
6632 register_sched_domain_sysctl(); 6655 register_sched_domain_sysctl();
6633 6656
6634 mutex_unlock(&sched_domains_mutex); 6657 mutex_unlock(&sched_domains_mutex);
6635 } 6658 }
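/*
 * Editor's sketch (not part of the kernel source): a caller following the
 * ownership and fallback rules documented above. my_rebuild_domains(),
 * mask_a and mask_b are hypothetical; the real caller (the cpuset code)
 * derives its masks from the cpuset configuration. The hotplug lock must be
 * held around the call, as required above.
 */
static void my_rebuild_domains(const struct cpumask *mask_a,
			       const struct cpumask *mask_b)
{
	cpumask_var_t *doms = alloc_sched_domains(2);

	if (!doms) {
		/* Documented fallback: one partition, forced rebuild. */
		partition_sched_domains(1, NULL, NULL);
		return;
	}

	cpumask_copy(doms[0], mask_a);
	cpumask_copy(doms[1], mask_b);

	/* partition_sched_domains() takes ownership of 'doms'; don't free it. */
	partition_sched_domains(2, doms, NULL);
}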
6636 6659
6637 static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */ 6660 static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
6638 6661
6639 /* 6662 /*
6640 * Update cpusets according to cpu_active mask. If cpusets are 6663 * Update cpusets according to cpu_active mask. If cpusets are
6641 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 6664 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
6642 * around partition_sched_domains(). 6665 * around partition_sched_domains().
6643 * 6666 *
6644 * If we come here as part of a suspend/resume, don't touch cpusets because we 6667 * If we come here as part of a suspend/resume, don't touch cpusets because we
6645 * want to restore them to their original state upon resume anyway. 6668 * want to restore them to their original state upon resume anyway.
6646 */ 6669 */
6647 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, 6670 static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
6648 void *hcpu) 6671 void *hcpu)
6649 { 6672 {
6650 switch (action) { 6673 switch (action) {
6651 case CPU_ONLINE_FROZEN: 6674 case CPU_ONLINE_FROZEN:
6652 case CPU_DOWN_FAILED_FROZEN: 6675 case CPU_DOWN_FAILED_FROZEN:
6653 6676
6654 /* 6677 /*
6655 * num_cpus_frozen tracks how many CPUs are involved in the 6678 * num_cpus_frozen tracks how many CPUs are involved in the
6656 * suspend/resume sequence. As long as this is not the last online 6679 * suspend/resume sequence. As long as this is not the last online
6657 * operation in the resume sequence, just build a single sched 6680 * operation in the resume sequence, just build a single sched
6658 * domain, ignoring cpusets. 6681 * domain, ignoring cpusets.
6659 */ 6682 */
6660 num_cpus_frozen--; 6683 num_cpus_frozen--;
6661 if (likely(num_cpus_frozen)) { 6684 if (likely(num_cpus_frozen)) {
6662 partition_sched_domains(1, NULL, NULL); 6685 partition_sched_domains(1, NULL, NULL);
6663 break; 6686 break;
6664 } 6687 }
6665 6688
6666 /* 6689 /*
6667 * This is the last CPU online operation. So fall through and 6690 * This is the last CPU online operation. So fall through and
6668 * restore the original sched domains by considering the 6691 * restore the original sched domains by considering the
6669 * cpuset configurations. 6692 * cpuset configurations.
6670 */ 6693 */
6671 6694
6672 case CPU_ONLINE: 6695 case CPU_ONLINE:
6673 case CPU_DOWN_FAILED: 6696 case CPU_DOWN_FAILED:
6674 cpuset_update_active_cpus(true); 6697 cpuset_update_active_cpus(true);
6675 break; 6698 break;
6676 default: 6699 default:
6677 return NOTIFY_DONE; 6700 return NOTIFY_DONE;
6678 } 6701 }
6679 return NOTIFY_OK; 6702 return NOTIFY_OK;
6680 } 6703 }
6681 6704
6682 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, 6705 static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
6683 void *hcpu) 6706 void *hcpu)
6684 { 6707 {
6685 switch (action) { 6708 switch (action) {
6686 case CPU_DOWN_PREPARE: 6709 case CPU_DOWN_PREPARE:
6687 cpuset_update_active_cpus(false); 6710 cpuset_update_active_cpus(false);
6688 break; 6711 break;
6689 case CPU_DOWN_PREPARE_FROZEN: 6712 case CPU_DOWN_PREPARE_FROZEN:
6690 num_cpus_frozen++; 6713 num_cpus_frozen++;
6691 partition_sched_domains(1, NULL, NULL); 6714 partition_sched_domains(1, NULL, NULL);
6692 break; 6715 break;
6693 default: 6716 default:
6694 return NOTIFY_DONE; 6717 return NOTIFY_DONE;
6695 } 6718 }
6696 return NOTIFY_OK; 6719 return NOTIFY_OK;
6697 } 6720 }
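/*
 * Editor's note: a worked trace of the num_cpus_frozen accounting in the two
 * notifiers above, assuming a 4-CPU machine whose boot CPU stays online
 * across suspend (the usual case):
 *
 *   Suspend:  CPUs 1..3 go down, each raising CPU_DOWN_PREPARE_FROZEN:
 *             num_cpus_frozen: 0 -> 1 -> 2 -> 3, each step collapsing to a
 *             single sched domain via partition_sched_domains(1, NULL, NULL).
 *   Resume:   CPUs 1..3 come back, each raising CPU_ONLINE_FROZEN:
 *             num_cpus_frozen: 3 -> 2 -> 1  (still non-zero: keep the single
 *                                            domain, cpusets untouched)
 *             num_cpus_frozen: 1 -> 0       (last online operation: fall
 *                                            through to
 *                                            cpuset_update_active_cpus(true)
 *                                            and restore the cpuset-defined
 *                                            domains)
 */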
6698 6721
6699 void __init sched_init_smp(void) 6722 void __init sched_init_smp(void)
6700 { 6723 {
6701 cpumask_var_t non_isolated_cpus; 6724 cpumask_var_t non_isolated_cpus;
6702 6725
6703 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 6726 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6704 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 6727 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6705 6728
6706 sched_init_numa(); 6729 sched_init_numa();
6707 6730
6708 /* 6731 /*
6709 * There's no userspace yet to cause hotplug operations; hence all the 6732 * There's no userspace yet to cause hotplug operations; hence all the
6710 * cpu masks are stable and all blatant races in the code below cannot 6733 * cpu masks are stable and all blatant races in the code below cannot
6711 * happen. 6734 * happen.
6712 */ 6735 */
6713 mutex_lock(&sched_domains_mutex); 6736 mutex_lock(&sched_domains_mutex);
6714 init_sched_domains(cpu_active_mask); 6737 init_sched_domains(cpu_active_mask);
6715 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 6738 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6716 if (cpumask_empty(non_isolated_cpus)) 6739 if (cpumask_empty(non_isolated_cpus))
6717 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 6740 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6718 mutex_unlock(&sched_domains_mutex); 6741 mutex_unlock(&sched_domains_mutex);
6719 6742
6720 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); 6743 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
6721 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6744 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
6722 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); 6745 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
6723 6746
6724 init_hrtick(); 6747 init_hrtick();
6725 6748
6726 /* Move init over to a non-isolated CPU */ 6749 /* Move init over to a non-isolated CPU */
6727 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 6750 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
6728 BUG(); 6751 BUG();
6729 sched_init_granularity(); 6752 sched_init_granularity();
6730 free_cpumask_var(non_isolated_cpus); 6753 free_cpumask_var(non_isolated_cpus);
6731 6754
6732 init_sched_rt_class(); 6755 init_sched_rt_class();
6733 init_sched_dl_class(); 6756 init_sched_dl_class();
6734 } 6757 }
6735 #else 6758 #else
6736 void __init sched_init_smp(void) 6759 void __init sched_init_smp(void)
6737 { 6760 {
6738 sched_init_granularity(); 6761 sched_init_granularity();
6739 } 6762 }
6740 #endif /* CONFIG_SMP */ 6763 #endif /* CONFIG_SMP */
6741 6764
6742 const_debug unsigned int sysctl_timer_migration = 1; 6765 const_debug unsigned int sysctl_timer_migration = 1;
6743 6766
6744 int in_sched_functions(unsigned long addr) 6767 int in_sched_functions(unsigned long addr)
6745 { 6768 {
6746 return in_lock_functions(addr) || 6769 return in_lock_functions(addr) ||
6747 (addr >= (unsigned long)__sched_text_start 6770 (addr >= (unsigned long)__sched_text_start
6748 && addr < (unsigned long)__sched_text_end); 6771 && addr < (unsigned long)__sched_text_end);
6749 } 6772 }
6750 6773
6751 #ifdef CONFIG_CGROUP_SCHED 6774 #ifdef CONFIG_CGROUP_SCHED
6752 /* 6775 /*
6753 * Default task group. 6776 * Default task group.
6754 * Every task in system belongs to this group at bootup. 6777 * Every task in system belongs to this group at bootup.
6755 */ 6778 */
6756 struct task_group root_task_group; 6779 struct task_group root_task_group;
6757 LIST_HEAD(task_groups); 6780 LIST_HEAD(task_groups);
6758 #endif 6781 #endif
6759 6782
6760 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); 6783 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
6761 6784
6762 void __init sched_init(void) 6785 void __init sched_init(void)
6763 { 6786 {
6764 int i, j; 6787 int i, j;
6765 unsigned long alloc_size = 0, ptr; 6788 unsigned long alloc_size = 0, ptr;
6766 6789
6767 #ifdef CONFIG_FAIR_GROUP_SCHED 6790 #ifdef CONFIG_FAIR_GROUP_SCHED
6768 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 6791 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6769 #endif 6792 #endif
6770 #ifdef CONFIG_RT_GROUP_SCHED 6793 #ifdef CONFIG_RT_GROUP_SCHED
6771 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 6794 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
6772 #endif 6795 #endif
6773 #ifdef CONFIG_CPUMASK_OFFSTACK 6796 #ifdef CONFIG_CPUMASK_OFFSTACK
6774 alloc_size += num_possible_cpus() * cpumask_size(); 6797 alloc_size += num_possible_cpus() * cpumask_size();
6775 #endif 6798 #endif
6776 if (alloc_size) { 6799 if (alloc_size) {
6777 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 6800 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
6778 6801
6779 #ifdef CONFIG_FAIR_GROUP_SCHED 6802 #ifdef CONFIG_FAIR_GROUP_SCHED
6780 root_task_group.se = (struct sched_entity **)ptr; 6803 root_task_group.se = (struct sched_entity **)ptr;
6781 ptr += nr_cpu_ids * sizeof(void **); 6804 ptr += nr_cpu_ids * sizeof(void **);
6782 6805
6783 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 6806 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
6784 ptr += nr_cpu_ids * sizeof(void **); 6807 ptr += nr_cpu_ids * sizeof(void **);
6785 6808
6786 #endif /* CONFIG_FAIR_GROUP_SCHED */ 6809 #endif /* CONFIG_FAIR_GROUP_SCHED */
6787 #ifdef CONFIG_RT_GROUP_SCHED 6810 #ifdef CONFIG_RT_GROUP_SCHED
6788 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 6811 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
6789 ptr += nr_cpu_ids * sizeof(void **); 6812 ptr += nr_cpu_ids * sizeof(void **);
6790 6813
6791 root_task_group.rt_rq = (struct rt_rq **)ptr; 6814 root_task_group.rt_rq = (struct rt_rq **)ptr;
6792 ptr += nr_cpu_ids * sizeof(void **); 6815 ptr += nr_cpu_ids * sizeof(void **);
6793 6816
6794 #endif /* CONFIG_RT_GROUP_SCHED */ 6817 #endif /* CONFIG_RT_GROUP_SCHED */
6795 #ifdef CONFIG_CPUMASK_OFFSTACK 6818 #ifdef CONFIG_CPUMASK_OFFSTACK
6796 for_each_possible_cpu(i) { 6819 for_each_possible_cpu(i) {
6797 per_cpu(load_balance_mask, i) = (void *)ptr; 6820 per_cpu(load_balance_mask, i) = (void *)ptr;
6798 ptr += cpumask_size(); 6821 ptr += cpumask_size();
6799 } 6822 }
6800 #endif /* CONFIG_CPUMASK_OFFSTACK */ 6823 #endif /* CONFIG_CPUMASK_OFFSTACK */
6801 } 6824 }
6802 6825
6803 init_rt_bandwidth(&def_rt_bandwidth, 6826 init_rt_bandwidth(&def_rt_bandwidth,
6804 global_rt_period(), global_rt_runtime()); 6827 global_rt_period(), global_rt_runtime());
6805 init_dl_bandwidth(&def_dl_bandwidth, 6828 init_dl_bandwidth(&def_dl_bandwidth,
6806 global_rt_period(), global_rt_runtime()); 6829 global_rt_period(), global_rt_runtime());
6807 6830
6808 #ifdef CONFIG_SMP 6831 #ifdef CONFIG_SMP
6809 init_defrootdomain(); 6832 init_defrootdomain();
6810 #endif 6833 #endif
6811 6834
6812 #ifdef CONFIG_RT_GROUP_SCHED 6835 #ifdef CONFIG_RT_GROUP_SCHED
6813 init_rt_bandwidth(&root_task_group.rt_bandwidth, 6836 init_rt_bandwidth(&root_task_group.rt_bandwidth,
6814 global_rt_period(), global_rt_runtime()); 6837 global_rt_period(), global_rt_runtime());
6815 #endif /* CONFIG_RT_GROUP_SCHED */ 6838 #endif /* CONFIG_RT_GROUP_SCHED */
6816 6839
6817 #ifdef CONFIG_CGROUP_SCHED 6840 #ifdef CONFIG_CGROUP_SCHED
6818 list_add(&root_task_group.list, &task_groups); 6841 list_add(&root_task_group.list, &task_groups);
6819 INIT_LIST_HEAD(&root_task_group.children); 6842 INIT_LIST_HEAD(&root_task_group.children);
6820 INIT_LIST_HEAD(&root_task_group.siblings); 6843 INIT_LIST_HEAD(&root_task_group.siblings);
6821 autogroup_init(&init_task); 6844 autogroup_init(&init_task);
6822 6845
6823 #endif /* CONFIG_CGROUP_SCHED */ 6846 #endif /* CONFIG_CGROUP_SCHED */
6824 6847
6825 for_each_possible_cpu(i) { 6848 for_each_possible_cpu(i) {
6826 struct rq *rq; 6849 struct rq *rq;
6827 6850
6828 rq = cpu_rq(i); 6851 rq = cpu_rq(i);
6829 raw_spin_lock_init(&rq->lock); 6852 raw_spin_lock_init(&rq->lock);
6830 rq->nr_running = 0; 6853 rq->nr_running = 0;
6831 rq->calc_load_active = 0; 6854 rq->calc_load_active = 0;
6832 rq->calc_load_update = jiffies + LOAD_FREQ; 6855 rq->calc_load_update = jiffies + LOAD_FREQ;
6833 init_cfs_rq(&rq->cfs); 6856 init_cfs_rq(&rq->cfs);
6834 init_rt_rq(&rq->rt, rq); 6857 init_rt_rq(&rq->rt, rq);
6835 init_dl_rq(&rq->dl, rq); 6858 init_dl_rq(&rq->dl, rq);
6836 #ifdef CONFIG_FAIR_GROUP_SCHED 6859 #ifdef CONFIG_FAIR_GROUP_SCHED
6837 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 6860 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
6838 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6861 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6839 /* 6862 /*
6840 * How much cpu bandwidth does root_task_group get? 6863 * How much cpu bandwidth does root_task_group get?
6841 * 6864 *
6842 * In case of task-groups formed through the cgroup filesystem, it 6865 * In case of task-groups formed through the cgroup filesystem, it
6843 * gets 100% of the cpu resources in the system. This overall 6866 * gets 100% of the cpu resources in the system. This overall
6844 * system cpu resource is divided among the tasks of 6867 * system cpu resource is divided among the tasks of
6845 * root_task_group and its child task-groups in a fair manner, 6868 * root_task_group and its child task-groups in a fair manner,
6846 * based on each entity's (task or task-group's) weight 6869 * based on each entity's (task or task-group's) weight
6847 * (se->load.weight). 6870 * (se->load.weight).
6848 * 6871 *
6849 * In other words, if root_task_group has 10 tasks (of weight 6872 * In other words, if root_task_group has 10 tasks (of weight
6850 * 1024) and two child groups A0 and A1 (of weight 1024 each), 6873 * 1024) and two child groups A0 and A1 (of weight 1024 each),
6851 * then A0's share of the cpu resource is: 6874 * then A0's share of the cpu resource is:
6852 * 6875 *
6853 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 6876 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
6854 * 6877 *
6855 * We achieve this by letting root_task_group's tasks sit 6878 * We achieve this by letting root_task_group's tasks sit
6856 * directly in rq->cfs (i.e. root_task_group->se[] = NULL). 6879 * directly in rq->cfs (i.e. root_task_group->se[] = NULL).
6857 */ 6880 */
6858 init_cfs_bandwidth(&root_task_group.cfs_bandwidth); 6881 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
6859 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 6882 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
6860 #endif /* CONFIG_FAIR_GROUP_SCHED */ 6883 #endif /* CONFIG_FAIR_GROUP_SCHED */
6861 6884
6862 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 6885 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
6863 #ifdef CONFIG_RT_GROUP_SCHED 6886 #ifdef CONFIG_RT_GROUP_SCHED
6864 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 6887 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6865 #endif 6888 #endif
6866 6889
6867 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 6890 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6868 rq->cpu_load[j] = 0; 6891 rq->cpu_load[j] = 0;
6869 6892
6870 rq->last_load_update_tick = jiffies; 6893 rq->last_load_update_tick = jiffies;
6871 6894
6872 #ifdef CONFIG_SMP 6895 #ifdef CONFIG_SMP
6873 rq->sd = NULL; 6896 rq->sd = NULL;
6874 rq->rd = NULL; 6897 rq->rd = NULL;
6875 rq->cpu_power = SCHED_POWER_SCALE; 6898 rq->cpu_power = SCHED_POWER_SCALE;
6876 rq->post_schedule = 0; 6899 rq->post_schedule = 0;
6877 rq->active_balance = 0; 6900 rq->active_balance = 0;
6878 rq->next_balance = jiffies; 6901 rq->next_balance = jiffies;
6879 rq->push_cpu = 0; 6902 rq->push_cpu = 0;
6880 rq->cpu = i; 6903 rq->cpu = i;
6881 rq->online = 0; 6904 rq->online = 0;
6882 rq->idle_stamp = 0; 6905 rq->idle_stamp = 0;
6883 rq->avg_idle = 2*sysctl_sched_migration_cost; 6906 rq->avg_idle = 2*sysctl_sched_migration_cost;
6884 rq->max_idle_balance_cost = sysctl_sched_migration_cost; 6907 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
6885 6908
6886 INIT_LIST_HEAD(&rq->cfs_tasks); 6909 INIT_LIST_HEAD(&rq->cfs_tasks);
6887 6910
6888 rq_attach_root(rq, &def_root_domain); 6911 rq_attach_root(rq, &def_root_domain);
6889 #ifdef CONFIG_NO_HZ_COMMON 6912 #ifdef CONFIG_NO_HZ_COMMON
6890 rq->nohz_flags = 0; 6913 rq->nohz_flags = 0;
6891 #endif 6914 #endif
6892 #ifdef CONFIG_NO_HZ_FULL 6915 #ifdef CONFIG_NO_HZ_FULL
6893 rq->last_sched_tick = 0; 6916 rq->last_sched_tick = 0;
6894 #endif 6917 #endif
6895 #endif 6918 #endif
6896 init_rq_hrtick(rq); 6919 init_rq_hrtick(rq);
6897 atomic_set(&rq->nr_iowait, 0); 6920 atomic_set(&rq->nr_iowait, 0);
6898 } 6921 }
6899 6922
6900 set_load_weight(&init_task); 6923 set_load_weight(&init_task);
6901 6924
6902 #ifdef CONFIG_PREEMPT_NOTIFIERS 6925 #ifdef CONFIG_PREEMPT_NOTIFIERS
6903 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6926 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6904 #endif 6927 #endif
6905 6928
6906 /* 6929 /*
6907 * The boot idle thread does lazy MMU switching as well: 6930 * The boot idle thread does lazy MMU switching as well:
6908 */ 6931 */
6909 atomic_inc(&init_mm.mm_count); 6932 atomic_inc(&init_mm.mm_count);
6910 enter_lazy_tlb(&init_mm, current); 6933 enter_lazy_tlb(&init_mm, current);
6911 6934
6912 /* 6935 /*
6913 * Make us the idle thread. Technically, schedule() should not be 6936 * Make us the idle thread. Technically, schedule() should not be
6914 * called from this thread; however, somewhere below it might be, 6937 * called from this thread; however, somewhere below it might be,
6915 * but because we are the idle thread, we just pick up running again 6938 * but because we are the idle thread, we just pick up running again
6916 * when this runqueue becomes "idle". 6939 * when this runqueue becomes "idle".
6917 */ 6940 */
6918 init_idle(current, smp_processor_id()); 6941 init_idle(current, smp_processor_id());
6919 6942
6920 calc_load_update = jiffies + LOAD_FREQ; 6943 calc_load_update = jiffies + LOAD_FREQ;
6921 6944
6922 /* 6945 /*
6923 * During early bootup we pretend to be a normal task: 6946 * During early bootup we pretend to be a normal task:
6924 */ 6947 */
6925 current->sched_class = &fair_sched_class; 6948 current->sched_class = &fair_sched_class;
6926 6949
6927 #ifdef CONFIG_SMP 6950 #ifdef CONFIG_SMP
6928 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 6951 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
6929 /* May be allocated at isolcpus cmdline parse time */ 6952 /* May be allocated at isolcpus cmdline parse time */
6930 if (cpu_isolated_map == NULL) 6953 if (cpu_isolated_map == NULL)
6931 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 6954 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
6932 idle_thread_set_boot_cpu(); 6955 idle_thread_set_boot_cpu();
6933 #endif 6956 #endif
6934 init_sched_fair_class(); 6957 init_sched_fair_class();
6935 6958
6936 scheduler_running = 1; 6959 scheduler_running = 1;
6937 } 6960 }
6938 6961
6939 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP 6962 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
6940 static inline int preempt_count_equals(int preempt_offset) 6963 static inline int preempt_count_equals(int preempt_offset)
6941 { 6964 {
6942 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 6965 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
6943 6966
6944 return (nested == preempt_offset); 6967 return (nested == preempt_offset);
6945 } 6968 }
6946 6969
6947 void __might_sleep(const char *file, int line, int preempt_offset) 6970 void __might_sleep(const char *file, int line, int preempt_offset)
6948 { 6971 {
6949 static unsigned long prev_jiffy; /* ratelimiting */ 6972 static unsigned long prev_jiffy; /* ratelimiting */
6950 6973
6951 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ 6974 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
6952 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && 6975 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
6953 !is_idle_task(current)) || 6976 !is_idle_task(current)) ||
6954 system_state != SYSTEM_RUNNING || oops_in_progress) 6977 system_state != SYSTEM_RUNNING || oops_in_progress)
6955 return; 6978 return;
6956 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6979 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6957 return; 6980 return;
6958 prev_jiffy = jiffies; 6981 prev_jiffy = jiffies;
6959 6982
6960 printk(KERN_ERR 6983 printk(KERN_ERR
6961 "BUG: sleeping function called from invalid context at %s:%d\n", 6984 "BUG: sleeping function called from invalid context at %s:%d\n",
6962 file, line); 6985 file, line);
6963 printk(KERN_ERR 6986 printk(KERN_ERR
6964 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 6987 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
6965 in_atomic(), irqs_disabled(), 6988 in_atomic(), irqs_disabled(),
6966 current->pid, current->comm); 6989 current->pid, current->comm);
6967 6990
6968 debug_show_held_locks(current); 6991 debug_show_held_locks(current);
6969 if (irqs_disabled()) 6992 if (irqs_disabled())
6970 print_irqtrace_events(current); 6993 print_irqtrace_events(current);
6971 #ifdef CONFIG_DEBUG_PREEMPT 6994 #ifdef CONFIG_DEBUG_PREEMPT
6972 if (!preempt_count_equals(preempt_offset)) { 6995 if (!preempt_count_equals(preempt_offset)) {
6973 pr_err("Preemption disabled at:"); 6996 pr_err("Preemption disabled at:");
6974 print_ip_sym(current->preempt_disable_ip); 6997 print_ip_sym(current->preempt_disable_ip);
6975 pr_cont("\n"); 6998 pr_cont("\n");
6976 } 6999 }
6977 #endif 7000 #endif
6978 dump_stack(); 7001 dump_stack();
6979 } 7002 }
6980 EXPORT_SYMBOL(__might_sleep); 7003 EXPORT_SYMBOL(__might_sleep);
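/*
 * Editor's sketch (not part of the kernel source): how the check above is
 * typically reached. might_sleep() from <linux/kernel.h> roughly expands to
 * __might_sleep(__FILE__, __LINE__, 0); calling it with preemption disabled
 * makes preempt_count_equals() fail and prints the "BUG: sleeping function
 * called from invalid context" report. my_debug_example() is illustrative.
 */
static void my_debug_example(void)
{
	preempt_disable();
	might_sleep();		/* non-zero preempt count -> splat above */
	preempt_enable();
}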
6981 #endif 7004 #endif
6982 7005
6983 #ifdef CONFIG_MAGIC_SYSRQ 7006 #ifdef CONFIG_MAGIC_SYSRQ
6984 static void normalize_task(struct rq *rq, struct task_struct *p) 7007 static void normalize_task(struct rq *rq, struct task_struct *p)
6985 { 7008 {
6986 const struct sched_class *prev_class = p->sched_class; 7009 const struct sched_class *prev_class = p->sched_class;
6987 struct sched_attr attr = { 7010 struct sched_attr attr = {
6988 .sched_policy = SCHED_NORMAL, 7011 .sched_policy = SCHED_NORMAL,
6989 }; 7012 };
6990 int old_prio = p->prio; 7013 int old_prio = p->prio;
6991 int on_rq; 7014 int on_rq;
6992 7015
6993 on_rq = p->on_rq; 7016 on_rq = p->on_rq;
6994 if (on_rq) 7017 if (on_rq)
6995 dequeue_task(rq, p, 0); 7018 dequeue_task(rq, p, 0);
6996 __setscheduler(rq, p, &attr); 7019 __setscheduler(rq, p, &attr);
6997 if (on_rq) { 7020 if (on_rq) {
6998 enqueue_task(rq, p, 0); 7021 enqueue_task(rq, p, 0);
6999 resched_task(rq->curr); 7022 resched_task(rq->curr);
7000 } 7023 }
7001 7024
7002 check_class_changed(rq, p, prev_class, old_prio); 7025 check_class_changed(rq, p, prev_class, old_prio);
7003 } 7026 }
7004 7027
7005 void normalize_rt_tasks(void) 7028 void normalize_rt_tasks(void)
7006 { 7029 {
7007 struct task_struct *g, *p; 7030 struct task_struct *g, *p;
7008 unsigned long flags; 7031 unsigned long flags;
7009 struct rq *rq; 7032 struct rq *rq;
7010 7033
7011 read_lock_irqsave(&tasklist_lock, flags); 7034 read_lock_irqsave(&tasklist_lock, flags);
7012 do_each_thread(g, p) { 7035 do_each_thread(g, p) {
7013 /* 7036 /*
7014 * Only normalize user tasks: 7037 * Only normalize user tasks:
7015 */ 7038 */
7016 if (!p->mm) 7039 if (!p->mm)
7017 continue; 7040 continue;
7018 7041
7019 p->se.exec_start = 0; 7042 p->se.exec_start = 0;
7020 #ifdef CONFIG_SCHEDSTATS 7043 #ifdef CONFIG_SCHEDSTATS
7021 p->se.statistics.wait_start = 0; 7044 p->se.statistics.wait_start = 0;
7022 p->se.statistics.sleep_start = 0; 7045 p->se.statistics.sleep_start = 0;
7023 p->se.statistics.block_start = 0; 7046 p->se.statistics.block_start = 0;
7024 #endif 7047 #endif
7025 7048
7026 if (!dl_task(p) && !rt_task(p)) { 7049 if (!dl_task(p) && !rt_task(p)) {
7027 /* 7050 /*
7028 * Renice negative nice level userspace 7051 * Renice negative nice level userspace
7029 * tasks back to 0: 7052 * tasks back to 0:
7030 */ 7053 */
7031 if (task_nice(p) < 0 && p->mm) 7054 if (task_nice(p) < 0 && p->mm)
7032 set_user_nice(p, 0); 7055 set_user_nice(p, 0);
7033 continue; 7056 continue;
7034 } 7057 }
7035 7058
7036 raw_spin_lock(&p->pi_lock); 7059 raw_spin_lock(&p->pi_lock);
7037 rq = __task_rq_lock(p); 7060 rq = __task_rq_lock(p);
7038 7061
7039 normalize_task(rq, p); 7062 normalize_task(rq, p);
7040 7063
7041 __task_rq_unlock(rq); 7064 __task_rq_unlock(rq);
7042 raw_spin_unlock(&p->pi_lock); 7065 raw_spin_unlock(&p->pi_lock);
7043 } while_each_thread(g, p); 7066 } while_each_thread(g, p);
7044 7067
7045 read_unlock_irqrestore(&tasklist_lock, flags); 7068 read_unlock_irqrestore(&tasklist_lock, flags);
7046 } 7069 }
7047 7070
7048 #endif /* CONFIG_MAGIC_SYSRQ */ 7071 #endif /* CONFIG_MAGIC_SYSRQ */
7049 7072
7050 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) 7073 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
7051 /* 7074 /*
7052 * These functions are only useful for the IA64 MCA handling, or kdb. 7075 * These functions are only useful for the IA64 MCA handling, or kdb.
7053 * 7076 *
7054 * They can only be called when the whole system has been 7077 * They can only be called when the whole system has been
7055 * stopped - every CPU needs to be quiescent, and no scheduling 7078 * stopped - every CPU needs to be quiescent, and no scheduling
7056 * activity can take place. Using them for anything else would 7079 * activity can take place. Using them for anything else would
7057 * be a serious bug, and as a result, they aren't even visible 7080 * be a serious bug, and as a result, they aren't even visible
7058 * under any other configuration. 7081 * under any other configuration.
7059 */ 7082 */
7060 7083
7061 /** 7084 /**
7062 * curr_task - return the current task for a given cpu. 7085 * curr_task - return the current task for a given cpu.
7063 * @cpu: the processor in question. 7086 * @cpu: the processor in question.
7064 * 7087 *
7065 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7088 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7066 * 7089 *
7067 * Return: The current task for @cpu. 7090 * Return: The current task for @cpu.
7068 */ 7091 */
7069 struct task_struct *curr_task(int cpu) 7092 struct task_struct *curr_task(int cpu)
7070 { 7093 {
7071 return cpu_curr(cpu); 7094 return cpu_curr(cpu);
7072 } 7095 }
7073 7096
7074 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ 7097 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
7075 7098
7076 #ifdef CONFIG_IA64 7099 #ifdef CONFIG_IA64
7077 /** 7100 /**
7078 * set_curr_task - set the current task for a given cpu. 7101 * set_curr_task - set the current task for a given cpu.
7079 * @cpu: the processor in question. 7102 * @cpu: the processor in question.
7080 * @p: the task pointer to set. 7103 * @p: the task pointer to set.
7081 * 7104 *
7082 * Description: This function must only be used when non-maskable interrupts 7105 * Description: This function must only be used when non-maskable interrupts
7083 * are serviced on a separate stack. It allows the architecture to switch the 7106 * are serviced on a separate stack. It allows the architecture to switch the
7084 * notion of the current task on a cpu in a non-blocking manner. This function 7107 * notion of the current task on a cpu in a non-blocking manner. This function
7085 * must be called with all CPUs synchronized and interrupts disabled, and the 7108 * must be called with all CPUs synchronized and interrupts disabled, and the
7086 * caller must save the original value of the current task (see 7109 * caller must save the original value of the current task (see
7087 * curr_task() above) and restore that value before reenabling interrupts and 7110 * curr_task() above) and restore that value before reenabling interrupts and
7088 * re-starting the system. 7111 * re-starting the system.
7089 * 7112 *
7090 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7113 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7091 */ 7114 */
7092 void set_curr_task(int cpu, struct task_struct *p) 7115 void set_curr_task(int cpu, struct task_struct *p)
7093 { 7116 {
7094 cpu_curr(cpu) = p; 7117 cpu_curr(cpu) = p;
7095 } 7118 }
7096 7119
7097 #endif 7120 #endif
7098 7121
7099 #ifdef CONFIG_CGROUP_SCHED 7122 #ifdef CONFIG_CGROUP_SCHED
7100 /* task_group_lock serializes the addition/removal of task groups */ 7123 /* task_group_lock serializes the addition/removal of task groups */
7101 static DEFINE_SPINLOCK(task_group_lock); 7124 static DEFINE_SPINLOCK(task_group_lock);
7102 7125
7103 static void free_sched_group(struct task_group *tg) 7126 static void free_sched_group(struct task_group *tg)
7104 { 7127 {
7105 free_fair_sched_group(tg); 7128 free_fair_sched_group(tg);
7106 free_rt_sched_group(tg); 7129 free_rt_sched_group(tg);
7107 autogroup_free(tg); 7130 autogroup_free(tg);
7108 kfree(tg); 7131 kfree(tg);
7109 } 7132 }
7110 7133
7111 /* allocate runqueue etc for a new task group */ 7134 /* allocate runqueue etc for a new task group */
7112 struct task_group *sched_create_group(struct task_group *parent) 7135 struct task_group *sched_create_group(struct task_group *parent)
7113 { 7136 {
7114 struct task_group *tg; 7137 struct task_group *tg;
7115 7138
7116 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 7139 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
7117 if (!tg) 7140 if (!tg)
7118 return ERR_PTR(-ENOMEM); 7141 return ERR_PTR(-ENOMEM);
7119 7142
7120 if (!alloc_fair_sched_group(tg, parent)) 7143 if (!alloc_fair_sched_group(tg, parent))
7121 goto err; 7144 goto err;
7122 7145
7123 if (!alloc_rt_sched_group(tg, parent)) 7146 if (!alloc_rt_sched_group(tg, parent))
7124 goto err; 7147 goto err;
7125 7148
7126 return tg; 7149 return tg;
7127 7150
7128 err: 7151 err:
7129 free_sched_group(tg); 7152 free_sched_group(tg);
7130 return ERR_PTR(-ENOMEM); 7153 return ERR_PTR(-ENOMEM);
7131 } 7154 }
7132 7155
7133 void sched_online_group(struct task_group *tg, struct task_group *parent) 7156 void sched_online_group(struct task_group *tg, struct task_group *parent)
7134 { 7157 {
7135 unsigned long flags; 7158 unsigned long flags;
7136 7159
7137 spin_lock_irqsave(&task_group_lock, flags); 7160 spin_lock_irqsave(&task_group_lock, flags);
7138 list_add_rcu(&tg->list, &task_groups); 7161 list_add_rcu(&tg->list, &task_groups);
7139 7162
7140 WARN_ON(!parent); /* root should already exist */ 7163 WARN_ON(!parent); /* root should already exist */
7141 7164
7142 tg->parent = parent; 7165 tg->parent = parent;
7143 INIT_LIST_HEAD(&tg->children); 7166 INIT_LIST_HEAD(&tg->children);
7144 list_add_rcu(&tg->siblings, &parent->children); 7167 list_add_rcu(&tg->siblings, &parent->children);
7145 spin_unlock_irqrestore(&task_group_lock, flags); 7168 spin_unlock_irqrestore(&task_group_lock, flags);
7146 } 7169 }
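/*
 * Editor's sketch (not part of the kernel source): the expected pairing of the
 * helpers above, as driven by the cpu cgroup controller (css_alloc creates the
 * group, css_online publishes it). Teardown mirrors this with
 * sched_offline_group() followed by sched_destroy_group(), defined below.
 * my_make_group() and my_parent are illustrative names.
 */
static struct task_group *my_make_group(struct task_group *my_parent)
{
	struct task_group *tg = sched_create_group(my_parent);

	if (IS_ERR(tg))
		return tg;			/* ERR_PTR(-ENOMEM) */

	sched_online_group(tg, my_parent);	/* add to the task_groups list */
	return tg;
}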
7147 7170
7148 /* rcu callback to free various structures associated with a task group */ 7171 /* rcu callback to free various structures associated with a task group */
7149 static void free_sched_group_rcu(struct rcu_head *rhp) 7172 static void free_sched_group_rcu(struct rcu_head *rhp)
7150 { 7173 {
7151 /* now it should be safe to free those cfs_rqs */ 7174 /* now it should be safe to free those cfs_rqs */
7152 free_sched_group(container_of(rhp, struct task_group, rcu)); 7175 free_sched_group(container_of(rhp, struct task_group, rcu));
7153 } 7176 }
7154 7177
7155 /* Destroy runqueue etc associated with a task group */ 7178 /* Destroy runqueue etc associated with a task group */
7156 void sched_destroy_group(struct task_group *tg) 7179 void sched_destroy_group(struct task_group *tg)
7157 { 7180 {
7158 /* wait for possible concurrent references to cfs_rqs to complete */ 7181 /* wait for possible concurrent references to cfs_rqs to complete */
7159 call_rcu(&tg->rcu, free_sched_group_rcu); 7182 call_rcu(&tg->rcu, free_sched_group_rcu);
7160 } 7183 }
7161 7184
7162 void sched_offline_group(struct task_group *tg) 7185 void sched_offline_group(struct task_group *tg)
7163 { 7186 {
7164 unsigned long flags; 7187 unsigned long flags;
7165 int i; 7188 int i;
7166 7189
7167 /* end participation in shares distribution */ 7190 /* end participation in shares distribution */
7168 for_each_possible_cpu(i) 7191 for_each_possible_cpu(i)
7169 unregister_fair_sched_group(tg, i); 7192 unregister_fair_sched_group(tg, i);
7170 7193
7171 spin_lock_irqsave(&task_group_lock, flags); 7194 spin_lock_irqsave(&task_group_lock, flags);
7172 list_del_rcu(&tg->list); 7195 list_del_rcu(&tg->list);
7173 list_del_rcu(&tg->siblings); 7196 list_del_rcu(&tg->siblings);
7174 spin_unlock_irqrestore(&task_group_lock, flags); 7197 spin_unlock_irqrestore(&task_group_lock, flags);
7175 } 7198 }
7176 7199
7177 /* change task's runqueue when it moves between groups. 7200 /* change task's runqueue when it moves between groups.
7178 * The caller of this function should have put the task in its new group 7201 * The caller of this function should have put the task in its new group
7179 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 7202 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
7180 * reflect its new group. 7203 * reflect its new group.
7181 */ 7204 */
7182 void sched_move_task(struct task_struct *tsk) 7205 void sched_move_task(struct task_struct *tsk)
7183 { 7206 {
7184 struct task_group *tg; 7207 struct task_group *tg;
7185 int on_rq, running; 7208 int on_rq, running;
7186 unsigned long flags; 7209 unsigned long flags;
7187 struct rq *rq; 7210 struct rq *rq;
7188 7211
7189 rq = task_rq_lock(tsk, &flags); 7212 rq = task_rq_lock(tsk, &flags);
7190 7213
7191 running = task_current(rq, tsk); 7214 running = task_current(rq, tsk);
7192 on_rq = tsk->on_rq; 7215 on_rq = tsk->on_rq;
7193 7216
7194 if (on_rq) 7217 if (on_rq)
7195 dequeue_task(rq, tsk, 0); 7218 dequeue_task(rq, tsk, 0);
7196 if (unlikely(running)) 7219 if (unlikely(running))
7197 tsk->sched_class->put_prev_task(rq, tsk); 7220 tsk->sched_class->put_prev_task(rq, tsk);
7198 7221
7199 tg = container_of(task_css_check(tsk, cpu_cgrp_id, 7222 tg = container_of(task_css_check(tsk, cpu_cgrp_id,
7200 lockdep_is_held(&tsk->sighand->siglock)), 7223 lockdep_is_held(&tsk->sighand->siglock)),
7201 struct task_group, css); 7224 struct task_group, css);
7202 tg = autogroup_task_group(tsk, tg); 7225 tg = autogroup_task_group(tsk, tg);
7203 tsk->sched_task_group = tg; 7226 tsk->sched_task_group = tg;
7204 7227
7205 #ifdef CONFIG_FAIR_GROUP_SCHED 7228 #ifdef CONFIG_FAIR_GROUP_SCHED
7206 if (tsk->sched_class->task_move_group) 7229 if (tsk->sched_class->task_move_group)
7207 tsk->sched_class->task_move_group(tsk, on_rq); 7230 tsk->sched_class->task_move_group(tsk, on_rq);
7208 else 7231 else
7209 #endif 7232 #endif
7210 set_task_rq(tsk, task_cpu(tsk)); 7233 set_task_rq(tsk, task_cpu(tsk));
7211 7234
7212 if (unlikely(running)) 7235 if (unlikely(running))
7213 tsk->sched_class->set_curr_task(rq); 7236 tsk->sched_class->set_curr_task(rq);
7214 if (on_rq) 7237 if (on_rq)
7215 enqueue_task(rq, tsk, 0); 7238 enqueue_task(rq, tsk, 0);
7216 7239
7217 task_rq_unlock(rq, tsk, &flags); 7240 task_rq_unlock(rq, tsk, &flags);
7218 } 7241 }
7219 #endif /* CONFIG_CGROUP_SCHED */ 7242 #endif /* CONFIG_CGROUP_SCHED */
7220 7243
7221 #ifdef CONFIG_RT_GROUP_SCHED 7244 #ifdef CONFIG_RT_GROUP_SCHED
7222 /* 7245 /*
7223 * Ensure that the real time constraints are schedulable. 7246 * Ensure that the real time constraints are schedulable.
7224 */ 7247 */
7225 static DEFINE_MUTEX(rt_constraints_mutex); 7248 static DEFINE_MUTEX(rt_constraints_mutex);
7226 7249
7227 /* Must be called with tasklist_lock held */ 7250 /* Must be called with tasklist_lock held */
7228 static inline int tg_has_rt_tasks(struct task_group *tg) 7251 static inline int tg_has_rt_tasks(struct task_group *tg)
7229 { 7252 {
7230 struct task_struct *g, *p; 7253 struct task_struct *g, *p;
7231 7254
7232 do_each_thread(g, p) { 7255 do_each_thread(g, p) {
7233 if (rt_task(p) && task_rq(p)->rt.tg == tg) 7256 if (rt_task(p) && task_rq(p)->rt.tg == tg)
7234 return 1; 7257 return 1;
7235 } while_each_thread(g, p); 7258 } while_each_thread(g, p);
7236 7259
7237 return 0; 7260 return 0;
7238 } 7261 }
7239 7262
7240 struct rt_schedulable_data { 7263 struct rt_schedulable_data {
7241 struct task_group *tg; 7264 struct task_group *tg;
7242 u64 rt_period; 7265 u64 rt_period;
7243 u64 rt_runtime; 7266 u64 rt_runtime;
7244 }; 7267 };
7245 7268
7246 static int tg_rt_schedulable(struct task_group *tg, void *data) 7269 static int tg_rt_schedulable(struct task_group *tg, void *data)
7247 { 7270 {
7248 struct rt_schedulable_data *d = data; 7271 struct rt_schedulable_data *d = data;
7249 struct task_group *child; 7272 struct task_group *child;
7250 unsigned long total, sum = 0; 7273 unsigned long total, sum = 0;
7251 u64 period, runtime; 7274 u64 period, runtime;
7252 7275
7253 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 7276 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7254 runtime = tg->rt_bandwidth.rt_runtime; 7277 runtime = tg->rt_bandwidth.rt_runtime;
7255 7278
7256 if (tg == d->tg) { 7279 if (tg == d->tg) {
7257 period = d->rt_period; 7280 period = d->rt_period;
7258 runtime = d->rt_runtime; 7281 runtime = d->rt_runtime;
7259 } 7282 }
7260 7283
7261 /* 7284 /*
7262 * Cannot have more runtime than the period. 7285 * Cannot have more runtime than the period.
7263 */ 7286 */
7264 if (runtime > period && runtime != RUNTIME_INF) 7287 if (runtime > period && runtime != RUNTIME_INF)
7265 return -EINVAL; 7288 return -EINVAL;
7266 7289
7267 /* 7290 /*
7268 * Ensure we don't starve existing RT tasks. 7291 * Ensure we don't starve existing RT tasks.
7269 */ 7292 */
7270 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 7293 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
7271 return -EBUSY; 7294 return -EBUSY;
7272 7295
7273 total = to_ratio(period, runtime); 7296 total = to_ratio(period, runtime);
7274 7297
7275 /* 7298 /*
7276 * Nobody can have more than the global setting allows. 7299 * Nobody can have more than the global setting allows.
7277 */ 7300 */
7278 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 7301 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
7279 return -EINVAL; 7302 return -EINVAL;
7280 7303
7281 /* 7304 /*
7282 * The sum of our children's runtime should not exceed our own. 7305 * The sum of our children's runtime should not exceed our own.
7283 */ 7306 */
7284 list_for_each_entry_rcu(child, &tg->children, siblings) { 7307 list_for_each_entry_rcu(child, &tg->children, siblings) {
7285 period = ktime_to_ns(child->rt_bandwidth.rt_period); 7308 period = ktime_to_ns(child->rt_bandwidth.rt_period);
7286 runtime = child->rt_bandwidth.rt_runtime; 7309 runtime = child->rt_bandwidth.rt_runtime;
7287 7310
7288 if (child == d->tg) { 7311 if (child == d->tg) {
7289 period = d->rt_period; 7312 period = d->rt_period;
7290 runtime = d->rt_runtime; 7313 runtime = d->rt_runtime;
7291 } 7314 }
7292 7315
7293 sum += to_ratio(period, runtime); 7316 sum += to_ratio(period, runtime);
7294 } 7317 }
7295 7318
7296 if (sum > total) 7319 if (sum > total)
7297 return -EINVAL; 7320 return -EINVAL;
7298 7321
7299 return 0; 7322 return 0;
7300 } 7323 }
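/*
 * Editor's note: a worked example of the checks above, assuming the default
 * global values (rt_period = 1s, rt_runtime = 0.95s) and to_ratio() scaling
 * runtime/period by 2^20 as defined earlier in this file:
 *
 *   global cap:  to_ratio(1s, 0.95s) = 996147
 *
 *   A group requesting runtime = 0.5s with period = 1s:
 *     total = to_ratio(1s, 0.5s) = 524288  <= 996147            -> allowed
 *
 *   Two children of that group each requesting 0.3s per 1s:
 *     sum = 2 * to_ratio(1s, 0.3s) = 2 * 314572 = 629144 > 524288 -> -EINVAL
 */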
7301 7324
7302 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 7325 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
7303 { 7326 {
7304 int ret; 7327 int ret;
7305 7328
7306 struct rt_schedulable_data data = { 7329 struct rt_schedulable_data data = {
7307 .tg = tg, 7330 .tg = tg,
7308 .rt_period = period, 7331 .rt_period = period,
7309 .rt_runtime = runtime, 7332 .rt_runtime = runtime,
7310 }; 7333 };
7311 7334
7312 rcu_read_lock(); 7335 rcu_read_lock();
7313 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data); 7336 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
7314 rcu_read_unlock(); 7337 rcu_read_unlock();
7315 7338
7316 return ret; 7339 return ret;
7317 } 7340 }
7318 7341
7319 static int tg_set_rt_bandwidth(struct task_group *tg, 7342 static int tg_set_rt_bandwidth(struct task_group *tg,
7320 u64 rt_period, u64 rt_runtime) 7343 u64 rt_period, u64 rt_runtime)
7321 { 7344 {
7322 int i, err = 0; 7345 int i, err = 0;
7323 7346
7324 mutex_lock(&rt_constraints_mutex); 7347 mutex_lock(&rt_constraints_mutex);
7325 read_lock(&tasklist_lock); 7348 read_lock(&tasklist_lock);
7326 err = __rt_schedulable(tg, rt_period, rt_runtime); 7349 err = __rt_schedulable(tg, rt_period, rt_runtime);
7327 if (err) 7350 if (err)
7328 goto unlock; 7351 goto unlock;
7329 7352
7330 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 7353 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7331 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 7354 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
7332 tg->rt_bandwidth.rt_runtime = rt_runtime; 7355 tg->rt_bandwidth.rt_runtime = rt_runtime;
7333 7356
7334 for_each_possible_cpu(i) { 7357 for_each_possible_cpu(i) {
7335 struct rt_rq *rt_rq = tg->rt_rq[i]; 7358 struct rt_rq *rt_rq = tg->rt_rq[i];
7336 7359
7337 raw_spin_lock(&rt_rq->rt_runtime_lock); 7360 raw_spin_lock(&rt_rq->rt_runtime_lock);
7338 rt_rq->rt_runtime = rt_runtime; 7361 rt_rq->rt_runtime = rt_runtime;
7339 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7362 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7340 } 7363 }
7341 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 7364 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
7342 unlock: 7365 unlock:
7343 read_unlock(&tasklist_lock); 7366 read_unlock(&tasklist_lock);
7344 mutex_unlock(&rt_constraints_mutex); 7367 mutex_unlock(&rt_constraints_mutex);
7345 7368
7346 return err; 7369 return err;
7347 } 7370 }
7348 7371
7349 static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 7372 static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
7350 { 7373 {
7351 u64 rt_runtime, rt_period; 7374 u64 rt_runtime, rt_period;
7352 7375
7353 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 7376 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
7354 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 7377 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
7355 if (rt_runtime_us < 0) 7378 if (rt_runtime_us < 0)
7356 rt_runtime = RUNTIME_INF; 7379 rt_runtime = RUNTIME_INF;
7357 7380
7358 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7381 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7359 } 7382 }
7360 7383
7361 static long sched_group_rt_runtime(struct task_group *tg) 7384 static long sched_group_rt_runtime(struct task_group *tg)
7362 { 7385 {
7363 u64 rt_runtime_us; 7386 u64 rt_runtime_us;
7364 7387
7365 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 7388 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
7366 return -1; 7389 return -1;
7367 7390
7368 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 7391 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
7369 do_div(rt_runtime_us, NSEC_PER_USEC); 7392 do_div(rt_runtime_us, NSEC_PER_USEC);
7370 return rt_runtime_us; 7393 return rt_runtime_us;
7371 } 7394 }
7372 7395
7373 static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 7396 static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
7374 { 7397 {
7375 u64 rt_runtime, rt_period; 7398 u64 rt_runtime, rt_period;
7376 7399
7377 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 7400 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
7378 rt_runtime = tg->rt_bandwidth.rt_runtime; 7401 rt_runtime = tg->rt_bandwidth.rt_runtime;
7379 7402
7380 if (rt_period == 0) 7403 if (rt_period == 0)
7381 return -EINVAL; 7404 return -EINVAL;
7382 7405
7383 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); 7406 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
7384 } 7407 }
7385 7408
7386 static long sched_group_rt_period(struct task_group *tg) 7409 static long sched_group_rt_period(struct task_group *tg)
7387 { 7410 {
7388 u64 rt_period_us; 7411 u64 rt_period_us;
7389 7412
7390 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 7413 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
7391 do_div(rt_period_us, NSEC_PER_USEC); 7414 do_div(rt_period_us, NSEC_PER_USEC);
7392 return rt_period_us; 7415 return rt_period_us;
7393 } 7416 }
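/*
 * Editor's note: these four helpers back the cpu cgroup's rt_runtime_us /
 * rt_period_us knobs (microseconds at the interface, nanoseconds and ktime
 * internally). Illustrative values:
 *
 *   write  950000 to cpu.rt_runtime_us -> rt_runtime = 950000 * NSEC_PER_USEC
 *                                                    = 950,000,000 ns
 *   write      -1 to cpu.rt_runtime_us -> rt_runtime = RUNTIME_INF (no limit)
 *   read        cpu.rt_period_us       -> ktime_to_ns(rt_period) / NSEC_PER_USEC,
 *                                         1,000,000 us by default
 */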
7394 #endif /* CONFIG_RT_GROUP_SCHED */ 7417 #endif /* CONFIG_RT_GROUP_SCHED */
7395 7418
7396 #ifdef CONFIG_RT_GROUP_SCHED 7419 #ifdef CONFIG_RT_GROUP_SCHED
7397 static int sched_rt_global_constraints(void) 7420 static int sched_rt_global_constraints(void)
7398 { 7421 {
7399 int ret = 0; 7422 int ret = 0;
7400 7423
7401 mutex_lock(&rt_constraints_mutex); 7424 mutex_lock(&rt_constraints_mutex);
7402 read_lock(&tasklist_lock); 7425 read_lock(&tasklist_lock);
7403 ret = __rt_schedulable(NULL, 0, 0); 7426 ret = __rt_schedulable(NULL, 0, 0);
7404 read_unlock(&tasklist_lock); 7427 read_unlock(&tasklist_lock);
7405 mutex_unlock(&rt_constraints_mutex); 7428 mutex_unlock(&rt_constraints_mutex);
7406 7429
7407 return ret; 7430 return ret;
7408 } 7431 }
7409 7432
7410 static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 7433 static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
7411 { 7434 {
7412 /* Don't accept realtime tasks when there is no way for them to run */ 7435 /* Don't accept realtime tasks when there is no way for them to run */
7413 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 7436 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
7414 return 0; 7437 return 0;
7415 7438
7416 return 1; 7439 return 1;
7417 } 7440 }
7418 7441
7419 #else /* !CONFIG_RT_GROUP_SCHED */ 7442 #else /* !CONFIG_RT_GROUP_SCHED */
7420 static int sched_rt_global_constraints(void) 7443 static int sched_rt_global_constraints(void)
7421 { 7444 {
7422 unsigned long flags; 7445 unsigned long flags;
7423 int i, ret = 0; 7446 int i, ret = 0;
7424 7447
7425 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 7448 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
7426 for_each_possible_cpu(i) { 7449 for_each_possible_cpu(i) {
7427 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 7450 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
7428 7451
7429 raw_spin_lock(&rt_rq->rt_runtime_lock); 7452 raw_spin_lock(&rt_rq->rt_runtime_lock);
7430 rt_rq->rt_runtime = global_rt_runtime(); 7453 rt_rq->rt_runtime = global_rt_runtime();
7431 raw_spin_unlock(&rt_rq->rt_runtime_lock); 7454 raw_spin_unlock(&rt_rq->rt_runtime_lock);
7432 } 7455 }
7433 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 7456 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
7434 7457
7435 return ret; 7458 return ret;
7436 } 7459 }
7437 #endif /* CONFIG_RT_GROUP_SCHED */ 7460 #endif /* CONFIG_RT_GROUP_SCHED */
7438 7461
7439 static int sched_dl_global_constraints(void) 7462 static int sched_dl_global_constraints(void)
7440 { 7463 {
7441 u64 runtime = global_rt_runtime(); 7464 u64 runtime = global_rt_runtime();
7442 u64 period = global_rt_period(); 7465 u64 period = global_rt_period();
7443 u64 new_bw = to_ratio(period, runtime); 7466 u64 new_bw = to_ratio(period, runtime);
7444 int cpu, ret = 0; 7467 int cpu, ret = 0;
7445 unsigned long flags; 7468 unsigned long flags;
7446 7469
7447 /* 7470 /*
7448 * Here we want to check the bandwidth not being set to some 7471 * Here we want to check the bandwidth not being set to some
7449 * value smaller than the currently allocated bandwidth in 7472 * value smaller than the currently allocated bandwidth in
7450 * any of the root_domains. 7473 * any of the root_domains.
7451 * 7474 *
7452 * FIXME: Cycling on all the CPUs is overdoing it, but simpler than 7475 * FIXME: Cycling on all the CPUs is overdoing it, but simpler than
7453 * cycling on root_domains... Discussion on different/better 7476 * cycling on root_domains... Discussion on different/better
7454 * solutions is welcome! 7477 * solutions is welcome!
7455 */ 7478 */
7456 for_each_possible_cpu(cpu) { 7479 for_each_possible_cpu(cpu) {
7457 struct dl_bw *dl_b = dl_bw_of(cpu); 7480 struct dl_bw *dl_b = dl_bw_of(cpu);
7458 7481
7459 raw_spin_lock_irqsave(&dl_b->lock, flags); 7482 raw_spin_lock_irqsave(&dl_b->lock, flags);
7460 if (new_bw < dl_b->total_bw) 7483 if (new_bw < dl_b->total_bw)
7461 ret = -EBUSY; 7484 ret = -EBUSY;
7462 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7485 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7463 7486
7464 if (ret) 7487 if (ret)
7465 break; 7488 break;
7466 } 7489 }
7467 7490
7468 return ret; 7491 return ret;
7469 } 7492 }
7470 7493
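sched_dl_global_constraints() rejects a new global runtime/period pair whenever the bandwidth it encodes would drop below what -deadline tasks have already reserved in any root domain; as the FIXME above notes, it does so by visiting every possible CPU rather than each root domain once. The comparison works on a fixed-point ratio produced by to_ratio(), which is not shown in this hunk. The stand-alone sketch below assumes the usual Q20 scaling (runtime << 20 / period) and uses made-up reservation numbers purely for illustration:

    #include <stdint.h>
    #include <stdio.h>

    #define BW_SHIFT 20 /* assumed fixed-point shift, as in to_ratio() */

    /* runtime/period as a Q20 fixed-point fraction of one CPU */
    static uint64_t to_ratio(uint64_t period_ns, uint64_t runtime_ns)
    {
            if (period_ns == 0)
                    return 0;
            return (runtime_ns << BW_SHIFT) / period_ns;
    }

    int main(void)
    {
            /* hypothetical per-root-domain reservations already granted */
            uint64_t total_bw[2] = { to_ratio(100000000, 30000000),   /* 30% */
                                     to_ratio(100000000, 60000000) }; /* 60% */

            /* proposed new global limit: 50% of each CPU */
            uint64_t new_bw = to_ratio(1000000000, 500000000);

            for (int rd = 0; rd < 2; rd++) {
                    if (new_bw < total_bw[rd]) {
                            printf("root domain %d: -EBUSY (%.2f%% already reserved)\n",
                                   rd, 100.0 * total_bw[rd] / (1 << BW_SHIFT));
                            return 1;
                    }
            }
            printf("new limit admitted\n");
            return 0;
    }

Here the second "root domain" already holds 60% of a CPU, so shrinking the global limit to 50% is refused, which is exactly the -EBUSY case in the loop above.
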
7471 static void sched_dl_do_global(void) 7494 static void sched_dl_do_global(void)
7472 { 7495 {
7473 u64 new_bw = -1; 7496 u64 new_bw = -1;
7474 int cpu; 7497 int cpu;
7475 unsigned long flags; 7498 unsigned long flags;
7476 7499
7477 def_dl_bandwidth.dl_period = global_rt_period(); 7500 def_dl_bandwidth.dl_period = global_rt_period();
7478 def_dl_bandwidth.dl_runtime = global_rt_runtime(); 7501 def_dl_bandwidth.dl_runtime = global_rt_runtime();
7479 7502
7480 if (global_rt_runtime() != RUNTIME_INF) 7503 if (global_rt_runtime() != RUNTIME_INF)
7481 new_bw = to_ratio(global_rt_period(), global_rt_runtime()); 7504 new_bw = to_ratio(global_rt_period(), global_rt_runtime());
7482 7505
7483 /* 7506 /*
7484 * FIXME: As above... 7507 * FIXME: As above...
7485 */ 7508 */
7486 for_each_possible_cpu(cpu) { 7509 for_each_possible_cpu(cpu) {
7487 struct dl_bw *dl_b = dl_bw_of(cpu); 7510 struct dl_bw *dl_b = dl_bw_of(cpu);
7488 7511
7489 raw_spin_lock_irqsave(&dl_b->lock, flags); 7512 raw_spin_lock_irqsave(&dl_b->lock, flags);
7490 dl_b->bw = new_bw; 7513 dl_b->bw = new_bw;
7491 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7514 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7492 } 7515 }
7493 } 7516 }
7494 7517
7495 static int sched_rt_global_validate(void) 7518 static int sched_rt_global_validate(void)
7496 { 7519 {
7497 if (sysctl_sched_rt_period <= 0) 7520 if (sysctl_sched_rt_period <= 0)
7498 return -EINVAL; 7521 return -EINVAL;
7499 7522
7500 if ((sysctl_sched_rt_runtime != RUNTIME_INF) && 7523 if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
7501 (sysctl_sched_rt_runtime > sysctl_sched_rt_period)) 7524 (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
7502 return -EINVAL; 7525 return -EINVAL;
7503 7526
7504 return 0; 7527 return 0;
7505 } 7528 }
7506 7529
7507 static void sched_rt_do_global(void) 7530 static void sched_rt_do_global(void)
7508 { 7531 {
7509 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 7532 def_rt_bandwidth.rt_runtime = global_rt_runtime();
7510 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); 7533 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
7511 } 7534 }
7512 7535
7513 int sched_rt_handler(struct ctl_table *table, int write, 7536 int sched_rt_handler(struct ctl_table *table, int write,
7514 void __user *buffer, size_t *lenp, 7537 void __user *buffer, size_t *lenp,
7515 loff_t *ppos) 7538 loff_t *ppos)
7516 { 7539 {
7517 int old_period, old_runtime; 7540 int old_period, old_runtime;
7518 static DEFINE_MUTEX(mutex); 7541 static DEFINE_MUTEX(mutex);
7519 int ret; 7542 int ret;
7520 7543
7521 mutex_lock(&mutex); 7544 mutex_lock(&mutex);
7522 old_period = sysctl_sched_rt_period; 7545 old_period = sysctl_sched_rt_period;
7523 old_runtime = sysctl_sched_rt_runtime; 7546 old_runtime = sysctl_sched_rt_runtime;
7524 7547
7525 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7548 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7526 7549
7527 if (!ret && write) { 7550 if (!ret && write) {
7528 ret = sched_rt_global_validate(); 7551 ret = sched_rt_global_validate();
7529 if (ret) 7552 if (ret)
7530 goto undo; 7553 goto undo;
7531 7554
7532 ret = sched_rt_global_constraints(); 7555 ret = sched_rt_global_constraints();
7533 if (ret) 7556 if (ret)
7534 goto undo; 7557 goto undo;
7535 7558
7536 ret = sched_dl_global_constraints(); 7559 ret = sched_dl_global_constraints();
7537 if (ret) 7560 if (ret)
7538 goto undo; 7561 goto undo;
7539 7562
7540 sched_rt_do_global(); 7563 sched_rt_do_global();
7541 sched_dl_do_global(); 7564 sched_dl_do_global();
7542 } 7565 }
7543 if (0) { 7566 if (0) {
7544 undo: 7567 undo:
7545 sysctl_sched_rt_period = old_period; 7568 sysctl_sched_rt_period = old_period;
7546 sysctl_sched_rt_runtime = old_runtime; 7569 sysctl_sched_rt_runtime = old_runtime;
7547 } 7570 }
7548 mutex_unlock(&mutex); 7571 mutex_unlock(&mutex);
7549 7572
7550 return ret; 7573 return ret;
7551 } 7574 }
7552 7575
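sched_rt_handler() snapshots the old period and runtime before proc_dointvec() overwrites them, and the `if (0) { undo: ... }` construct restores that snapshot on any validation failure while keeping a single unlock-and-return path. A small user-space sketch of the same save/validate/rollback shape, with invented setting names and a stub validator, just to show the control flow:

    #include <stdio.h>

    /* stand-ins for the two sysctl-backed globals */
    static int cfg_period = 1000000;
    static int cfg_runtime = 950000;

    static int validate(void)
    {
            if (cfg_period <= 0)
                    return -1;
            if (cfg_runtime > cfg_period)
                    return -1;
            return 0;
    }

    static int update(int new_period, int new_runtime)
    {
            int old_period = cfg_period;     /* snapshot before writing */
            int old_runtime = cfg_runtime;
            int ret;

            cfg_period = new_period;         /* tentative write */
            cfg_runtime = new_runtime;

            ret = validate();
            if (ret) {                       /* roll back on failure */
                    cfg_period = old_period;
                    cfg_runtime = old_runtime;
            }
            return ret;
    }

    int main(void)
    {
            int ret;

            ret = update(1000000, 500000);
            printf("good update: %d\n", ret);

            ret = update(100, 500000);       /* runtime > period, rejected */
            printf("bad update:  %d (period=%d runtime=%d kept)\n",
                   ret, cfg_period, cfg_runtime);
            return 0;
    }
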
7553 int sched_rr_handler(struct ctl_table *table, int write, 7576 int sched_rr_handler(struct ctl_table *table, int write,
7554 void __user *buffer, size_t *lenp, 7577 void __user *buffer, size_t *lenp,
7555 loff_t *ppos) 7578 loff_t *ppos)
7556 { 7579 {
7557 int ret; 7580 int ret;
7558 static DEFINE_MUTEX(mutex); 7581 static DEFINE_MUTEX(mutex);
7559 7582
7560 mutex_lock(&mutex); 7583 mutex_lock(&mutex);
7561 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7584 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7562 /* make sure that internally we keep jiffies */ 7585 /* make sure that internally we keep jiffies */
7563 /* also, writing zero resets timeslice to default */ 7586 /* also, writing zero resets timeslice to default */
7564 if (!ret && write) { 7587 if (!ret && write) {
7565 sched_rr_timeslice = sched_rr_timeslice <= 0 ? 7588 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
7566 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice); 7589 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
7567 } 7590 }
7568 mutex_unlock(&mutex); 7591 mutex_unlock(&mutex);
7569 return ret; 7592 return ret;
7570 } 7593 }
7571 7594
7572 #ifdef CONFIG_CGROUP_SCHED 7595 #ifdef CONFIG_CGROUP_SCHED
7573 7596
7574 static inline struct task_group *css_tg(struct cgroup_subsys_state *css) 7597 static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
7575 { 7598 {
7576 return css ? container_of(css, struct task_group, css) : NULL; 7599 return css ? container_of(css, struct task_group, css) : NULL;
7577 } 7600 }
7578 7601
7579 static struct cgroup_subsys_state * 7602 static struct cgroup_subsys_state *
7580 cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 7603 cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
7581 { 7604 {
7582 struct task_group *parent = css_tg(parent_css); 7605 struct task_group *parent = css_tg(parent_css);
7583 struct task_group *tg; 7606 struct task_group *tg;
7584 7607
7585 if (!parent) { 7608 if (!parent) {
7586 /* This is early initialization for the top cgroup */ 7609 /* This is early initialization for the top cgroup */
7587 return &root_task_group.css; 7610 return &root_task_group.css;
7588 } 7611 }
7589 7612
7590 tg = sched_create_group(parent); 7613 tg = sched_create_group(parent);
7591 if (IS_ERR(tg)) 7614 if (IS_ERR(tg))
7592 return ERR_PTR(-ENOMEM); 7615 return ERR_PTR(-ENOMEM);
7593 7616
7594 return &tg->css; 7617 return &tg->css;
7595 } 7618 }
7596 7619
7597 static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) 7620 static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
7598 { 7621 {
7599 struct task_group *tg = css_tg(css); 7622 struct task_group *tg = css_tg(css);
7600 struct task_group *parent = css_tg(css_parent(css)); 7623 struct task_group *parent = css_tg(css_parent(css));
7601 7624
7602 if (parent) 7625 if (parent)
7603 sched_online_group(tg, parent); 7626 sched_online_group(tg, parent);
7604 return 0; 7627 return 0;
7605 } 7628 }
7606 7629
7607 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) 7630 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
7608 { 7631 {
7609 struct task_group *tg = css_tg(css); 7632 struct task_group *tg = css_tg(css);
7610 7633
7611 sched_destroy_group(tg); 7634 sched_destroy_group(tg);
7612 } 7635 }
7613 7636
7614 static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) 7637 static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
7615 { 7638 {
7616 struct task_group *tg = css_tg(css); 7639 struct task_group *tg = css_tg(css);
7617 7640
7618 sched_offline_group(tg); 7641 sched_offline_group(tg);
7619 } 7642 }
7620 7643
7621 static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, 7644 static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
7622 struct cgroup_taskset *tset) 7645 struct cgroup_taskset *tset)
7623 { 7646 {
7624 struct task_struct *task; 7647 struct task_struct *task;
7625 7648
7626 cgroup_taskset_for_each(task, tset) { 7649 cgroup_taskset_for_each(task, tset) {
7627 #ifdef CONFIG_RT_GROUP_SCHED 7650 #ifdef CONFIG_RT_GROUP_SCHED
7628 if (!sched_rt_can_attach(css_tg(css), task)) 7651 if (!sched_rt_can_attach(css_tg(css), task))
7629 return -EINVAL; 7652 return -EINVAL;
7630 #else 7653 #else
7631 /* We don't support RT-tasks being in separate groups */ 7654 /* We don't support RT-tasks being in separate groups */
7632 if (task->sched_class != &fair_sched_class) 7655 if (task->sched_class != &fair_sched_class)
7633 return -EINVAL; 7656 return -EINVAL;
7634 #endif 7657 #endif
7635 } 7658 }
7636 return 0; 7659 return 0;
7637 } 7660 }
7638 7661
7639 static void cpu_cgroup_attach(struct cgroup_subsys_state *css, 7662 static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
7640 struct cgroup_taskset *tset) 7663 struct cgroup_taskset *tset)
7641 { 7664 {
7642 struct task_struct *task; 7665 struct task_struct *task;
7643 7666
7644 cgroup_taskset_for_each(task, tset) 7667 cgroup_taskset_for_each(task, tset)
7645 sched_move_task(task); 7668 sched_move_task(task);
7646 } 7669 }
7647 7670
7648 static void cpu_cgroup_exit(struct cgroup_subsys_state *css, 7671 static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
7649 struct cgroup_subsys_state *old_css, 7672 struct cgroup_subsys_state *old_css,
7650 struct task_struct *task) 7673 struct task_struct *task)
7651 { 7674 {
7652 /* 7675 /*
7653 * cgroup_exit() is called in the copy_process() failure path. 7676 * cgroup_exit() is called in the copy_process() failure path.
7654 * Ignore this case since the task hasn't run yet; this avoids 7677 * Ignore this case since the task hasn't run yet; this avoids
7655 * trying to poke a half freed task state from generic code. 7678 * trying to poke a half freed task state from generic code.
7656 */ 7679 */
7657 if (!(task->flags & PF_EXITING)) 7680 if (!(task->flags & PF_EXITING))
7658 return; 7681 return;
7659 7682
7660 sched_move_task(task); 7683 sched_move_task(task);
7661 } 7684 }
7662 7685
7663 #ifdef CONFIG_FAIR_GROUP_SCHED 7686 #ifdef CONFIG_FAIR_GROUP_SCHED
7664 static int cpu_shares_write_u64(struct cgroup_subsys_state *css, 7687 static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
7665 struct cftype *cftype, u64 shareval) 7688 struct cftype *cftype, u64 shareval)
7666 { 7689 {
7667 return sched_group_set_shares(css_tg(css), scale_load(shareval)); 7690 return sched_group_set_shares(css_tg(css), scale_load(shareval));
7668 } 7691 }
7669 7692
7670 static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, 7693 static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
7671 struct cftype *cft) 7694 struct cftype *cft)
7672 { 7695 {
7673 struct task_group *tg = css_tg(css); 7696 struct task_group *tg = css_tg(css);
7674 7697
7675 return (u64) scale_load_down(tg->shares); 7698 return (u64) scale_load_down(tg->shares);
7676 } 7699 }
7677 7700
7678 #ifdef CONFIG_CFS_BANDWIDTH 7701 #ifdef CONFIG_CFS_BANDWIDTH
7679 static DEFINE_MUTEX(cfs_constraints_mutex); 7702 static DEFINE_MUTEX(cfs_constraints_mutex);
7680 7703
7681 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */ 7704 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
7682 const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */ 7705 const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
7683 7706
7684 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); 7707 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
7685 7708
7686 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 7709 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7687 { 7710 {
7688 int i, ret = 0, runtime_enabled, runtime_was_enabled; 7711 int i, ret = 0, runtime_enabled, runtime_was_enabled;
7689 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7712 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7690 7713
7691 if (tg == &root_task_group) 7714 if (tg == &root_task_group)
7692 return -EINVAL; 7715 return -EINVAL;
7693 7716
7694 /* 7717 /*
7695 * Ensure we have at least some amount of bandwidth every period. This is 7718 * Ensure we have at least some amount of bandwidth every period. This is
7696 * to prevent reaching a state of large arrears when throttled via 7719 * to prevent reaching a state of large arrears when throttled via
7697 * entity_tick() resulting in prolonged exit starvation. 7720 * entity_tick() resulting in prolonged exit starvation.
7698 */ 7721 */
7699 if (quota < min_cfs_quota_period || period < min_cfs_quota_period) 7722 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
7700 return -EINVAL; 7723 return -EINVAL;
7701 7724
7702 /* 7725 /*
7703 * Likewise, bound things on the other side by preventing insane quota 7726 * Likewise, bound things on the other side by preventing insane quota
7704 * periods. This also allows us to normalize in computing quota 7727 * periods. This also allows us to normalize in computing quota
7705 * feasibility. 7728 * feasibility.
7706 */ 7729 */
7707 if (period > max_cfs_quota_period) 7730 if (period > max_cfs_quota_period)
7708 return -EINVAL; 7731 return -EINVAL;
7709 7732
7710 mutex_lock(&cfs_constraints_mutex); 7733 mutex_lock(&cfs_constraints_mutex);
7711 ret = __cfs_schedulable(tg, period, quota); 7734 ret = __cfs_schedulable(tg, period, quota);
7712 if (ret) 7735 if (ret)
7713 goto out_unlock; 7736 goto out_unlock;
7714 7737
7715 runtime_enabled = quota != RUNTIME_INF; 7738 runtime_enabled = quota != RUNTIME_INF;
7716 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 7739 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7717 /* 7740 /*
7718 * If we need to toggle cfs_bandwidth_used, off->on must occur 7741 * If we need to toggle cfs_bandwidth_used, off->on must occur
7719 * before making related changes, and on->off must occur afterwards 7742 * before making related changes, and on->off must occur afterwards
7720 */ 7743 */
7721 if (runtime_enabled && !runtime_was_enabled) 7744 if (runtime_enabled && !runtime_was_enabled)
7722 cfs_bandwidth_usage_inc(); 7745 cfs_bandwidth_usage_inc();
7723 raw_spin_lock_irq(&cfs_b->lock); 7746 raw_spin_lock_irq(&cfs_b->lock);
7724 cfs_b->period = ns_to_ktime(period); 7747 cfs_b->period = ns_to_ktime(period);
7725 cfs_b->quota = quota; 7748 cfs_b->quota = quota;
7726 7749
7727 __refill_cfs_bandwidth_runtime(cfs_b); 7750 __refill_cfs_bandwidth_runtime(cfs_b);
7728 /* restart the period timer (if active) to handle new period expiry */ 7751 /* restart the period timer (if active) to handle new period expiry */
7729 if (runtime_enabled && cfs_b->timer_active) { 7752 if (runtime_enabled && cfs_b->timer_active) {
7730 /* force a reprogram */ 7753 /* force a reprogram */
7731 cfs_b->timer_active = 0; 7754 cfs_b->timer_active = 0;
7732 __start_cfs_bandwidth(cfs_b); 7755 __start_cfs_bandwidth(cfs_b);
7733 } 7756 }
7734 raw_spin_unlock_irq(&cfs_b->lock); 7757 raw_spin_unlock_irq(&cfs_b->lock);
7735 7758
7736 for_each_possible_cpu(i) { 7759 for_each_possible_cpu(i) {
7737 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7760 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
7738 struct rq *rq = cfs_rq->rq; 7761 struct rq *rq = cfs_rq->rq;
7739 7762
7740 raw_spin_lock_irq(&rq->lock); 7763 raw_spin_lock_irq(&rq->lock);
7741 cfs_rq->runtime_enabled = runtime_enabled; 7764 cfs_rq->runtime_enabled = runtime_enabled;
7742 cfs_rq->runtime_remaining = 0; 7765 cfs_rq->runtime_remaining = 0;
7743 7766
7744 if (cfs_rq->throttled) 7767 if (cfs_rq->throttled)
7745 unthrottle_cfs_rq(cfs_rq); 7768 unthrottle_cfs_rq(cfs_rq);
7746 raw_spin_unlock_irq(&rq->lock); 7769 raw_spin_unlock_irq(&rq->lock);
7747 } 7770 }
7748 if (runtime_was_enabled && !runtime_enabled) 7771 if (runtime_was_enabled && !runtime_enabled)
7749 cfs_bandwidth_usage_dec(); 7772 cfs_bandwidth_usage_dec();
7750 out_unlock: 7773 out_unlock:
7751 mutex_unlock(&cfs_constraints_mutex); 7774 mutex_unlock(&cfs_constraints_mutex);
7752 7775
7753 return ret; 7776 return ret;
7754 } 7777 }
7755 7778
7756 int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) 7779 int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
7757 { 7780 {
7758 u64 quota, period; 7781 u64 quota, period;
7759 7782
7760 period = ktime_to_ns(tg->cfs_bandwidth.period); 7783 period = ktime_to_ns(tg->cfs_bandwidth.period);
7761 if (cfs_quota_us < 0) 7784 if (cfs_quota_us < 0)
7762 quota = RUNTIME_INF; 7785 quota = RUNTIME_INF;
7763 else 7786 else
7764 quota = (u64)cfs_quota_us * NSEC_PER_USEC; 7787 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
7765 7788
7766 return tg_set_cfs_bandwidth(tg, period, quota); 7789 return tg_set_cfs_bandwidth(tg, period, quota);
7767 } 7790 }
7768 7791
7769 long tg_get_cfs_quota(struct task_group *tg) 7792 long tg_get_cfs_quota(struct task_group *tg)
7770 { 7793 {
7771 u64 quota_us; 7794 u64 quota_us;
7772 7795
7773 if (tg->cfs_bandwidth.quota == RUNTIME_INF) 7796 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
7774 return -1; 7797 return -1;
7775 7798
7776 quota_us = tg->cfs_bandwidth.quota; 7799 quota_us = tg->cfs_bandwidth.quota;
7777 do_div(quota_us, NSEC_PER_USEC); 7800 do_div(quota_us, NSEC_PER_USEC);
7778 7801
7779 return quota_us; 7802 return quota_us;
7780 } 7803 }
7781 7804
7782 int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) 7805 int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
7783 { 7806 {
7784 u64 quota, period; 7807 u64 quota, period;
7785 7808
7786 period = (u64)cfs_period_us * NSEC_PER_USEC; 7809 period = (u64)cfs_period_us * NSEC_PER_USEC;
7787 quota = tg->cfs_bandwidth.quota; 7810 quota = tg->cfs_bandwidth.quota;
7788 7811
7789 return tg_set_cfs_bandwidth(tg, period, quota); 7812 return tg_set_cfs_bandwidth(tg, period, quota);
7790 } 7813 }
7791 7814
7792 long tg_get_cfs_period(struct task_group *tg) 7815 long tg_get_cfs_period(struct task_group *tg)
7793 { 7816 {
7794 u64 cfs_period_us; 7817 u64 cfs_period_us;
7795 7818
7796 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); 7819 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
7797 do_div(cfs_period_us, NSEC_PER_USEC); 7820 do_div(cfs_period_us, NSEC_PER_USEC);
7798 7821
7799 return cfs_period_us; 7822 return cfs_period_us;
7800 } 7823 }
7801 7824
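The cgroup interface above keeps cfs_quota_us and cfs_period_us in microseconds and converts to nanoseconds before calling tg_set_cfs_bandwidth(); a negative quota maps to RUNTIME_INF and reads back as -1. A quick stand-alone illustration of that arithmetic, using an assumed 100ms period (example values, not defaults taken from this hunk):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define NSEC_PER_USEC 1000ULL

    int main(void)
    {
            int64_t cfs_period_us = 100000;   /* 100 ms period */
            int64_t cfs_quota_us  = 50000;    /* 50 ms quota -> half a CPU */

            uint64_t period_ns = (uint64_t)cfs_period_us * NSEC_PER_USEC;
            uint64_t quota_ns  = (uint64_t)cfs_quota_us  * NSEC_PER_USEC;

            printf("period = %" PRIu64 " ns, quota = %" PRIu64 " ns\n",
                   period_ns, quota_ns);
            printf("group may use %.1f CPUs worth of time per period\n",
                   (double)quota_ns / period_ns);

            /* a negative quota means "no limit", read back as -1 */
            cfs_quota_us = -1;
            printf("quota_us=%" PRId64 " -> unlimited\n", cfs_quota_us);
            return 0;
    }
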
7802 static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, 7825 static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
7803 struct cftype *cft) 7826 struct cftype *cft)
7804 { 7827 {
7805 return tg_get_cfs_quota(css_tg(css)); 7828 return tg_get_cfs_quota(css_tg(css));
7806 } 7829 }
7807 7830
7808 static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, 7831 static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
7809 struct cftype *cftype, s64 cfs_quota_us) 7832 struct cftype *cftype, s64 cfs_quota_us)
7810 { 7833 {
7811 return tg_set_cfs_quota(css_tg(css), cfs_quota_us); 7834 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
7812 } 7835 }
7813 7836
7814 static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, 7837 static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
7815 struct cftype *cft) 7838 struct cftype *cft)
7816 { 7839 {
7817 return tg_get_cfs_period(css_tg(css)); 7840 return tg_get_cfs_period(css_tg(css));
7818 } 7841 }
7819 7842
7820 static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, 7843 static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
7821 struct cftype *cftype, u64 cfs_period_us) 7844 struct cftype *cftype, u64 cfs_period_us)
7822 { 7845 {
7823 return tg_set_cfs_period(css_tg(css), cfs_period_us); 7846 return tg_set_cfs_period(css_tg(css), cfs_period_us);
7824 } 7847 }
7825 7848
7826 struct cfs_schedulable_data { 7849 struct cfs_schedulable_data {
7827 struct task_group *tg; 7850 struct task_group *tg;
7828 u64 period, quota; 7851 u64 period, quota;
7829 }; 7852 };
7830 7853
7831 /* 7854 /*
7832 * normalize group quota/period to be quota/max_period 7855 * normalize group quota/period to be quota/max_period
7833 * note: units are usecs 7856 * note: units are usecs
7834 */ 7857 */
7835 static u64 normalize_cfs_quota(struct task_group *tg, 7858 static u64 normalize_cfs_quota(struct task_group *tg,
7836 struct cfs_schedulable_data *d) 7859 struct cfs_schedulable_data *d)
7837 { 7860 {
7838 u64 quota, period; 7861 u64 quota, period;
7839 7862
7840 if (tg == d->tg) { 7863 if (tg == d->tg) {
7841 period = d->period; 7864 period = d->period;
7842 quota = d->quota; 7865 quota = d->quota;
7843 } else { 7866 } else {
7844 period = tg_get_cfs_period(tg); 7867 period = tg_get_cfs_period(tg);
7845 quota = tg_get_cfs_quota(tg); 7868 quota = tg_get_cfs_quota(tg);
7846 } 7869 }
7847 7870
7848 /* note: these should typically be equivalent */ 7871 /* note: these should typically be equivalent */
7849 if (quota == RUNTIME_INF || quota == -1) 7872 if (quota == RUNTIME_INF || quota == -1)
7850 return RUNTIME_INF; 7873 return RUNTIME_INF;
7851 7874
7852 return to_ratio(period, quota); 7875 return to_ratio(period, quota);
7853 } 7876 }
7854 7877
7855 static int tg_cfs_schedulable_down(struct task_group *tg, void *data) 7878 static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
7856 { 7879 {
7857 struct cfs_schedulable_data *d = data; 7880 struct cfs_schedulable_data *d = data;
7858 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7881 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7859 s64 quota = 0, parent_quota = -1; 7882 s64 quota = 0, parent_quota = -1;
7860 7883
7861 if (!tg->parent) { 7884 if (!tg->parent) {
7862 quota = RUNTIME_INF; 7885 quota = RUNTIME_INF;
7863 } else { 7886 } else {
7864 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; 7887 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
7865 7888
7866 quota = normalize_cfs_quota(tg, d); 7889 quota = normalize_cfs_quota(tg, d);
7867 parent_quota = parent_b->hierarchal_quota; 7890 parent_quota = parent_b->hierarchal_quota;
7868 7891
7869 /* 7892 /*
7870 * ensure max(child_quota) <= parent_quota, inherit when no 7893 * ensure max(child_quota) <= parent_quota, inherit when no
7871 * limit is set 7894 * limit is set
7872 */ 7895 */
7873 if (quota == RUNTIME_INF) 7896 if (quota == RUNTIME_INF)
7874 quota = parent_quota; 7897 quota = parent_quota;
7875 else if (parent_quota != RUNTIME_INF && quota > parent_quota) 7898 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
7876 return -EINVAL; 7899 return -EINVAL;
7877 } 7900 }
7878 cfs_b->hierarchal_quota = quota; 7901 cfs_b->hierarchal_quota = quota;
7879 7902
7880 return 0; 7903 return 0;
7881 } 7904 }
7882 7905
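tg_cfs_schedulable_down() walks the group tree and enforces that each child's normalized quota stays at or below its parent's, inheriting the parent value when the child sets no limit of its own. A flattened sketch of that rule over a single parent/child pair (no tree walk, sentinel and values invented for the example):

    #include <stdint.h>
    #include <stdio.h>

    #define QUOTA_INF (-1LL)   /* stands in for RUNTIME_INF / "no limit" */

    /* returns 0 if the child setting is acceptable, -1 otherwise */
    static int check_child(int64_t parent_quota, int64_t *child_quota)
    {
            if (*child_quota == QUOTA_INF) {
                    *child_quota = parent_quota;   /* inherit when unlimited */
                    return 0;
            }
            if (parent_quota != QUOTA_INF && *child_quota > parent_quota)
                    return -1;                     /* child exceeds parent */
            return 0;
    }

    int main(void)
    {
            int64_t parent = 500000;   /* normalized quota units */
            int64_t child_a = QUOTA_INF;
            int64_t child_b = 600000;

            printf("child_a: %s (now %lld)\n",
                   check_child(parent, &child_a) ? "-EINVAL" : "ok",
                   (long long)child_a);
            printf("child_b: %s\n",
                   check_child(parent, &child_b) ? "-EINVAL" : "ok");
            return 0;
    }
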
7883 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) 7906 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7884 { 7907 {
7885 int ret; 7908 int ret;
7886 struct cfs_schedulable_data data = { 7909 struct cfs_schedulable_data data = {
7887 .tg = tg, 7910 .tg = tg,
7888 .period = period, 7911 .period = period,
7889 .quota = quota, 7912 .quota = quota,
7890 }; 7913 };
7891 7914
7892 if (quota != RUNTIME_INF) { 7915 if (quota != RUNTIME_INF) {
7893 do_div(data.period, NSEC_PER_USEC); 7916 do_div(data.period, NSEC_PER_USEC);
7894 do_div(data.quota, NSEC_PER_USEC); 7917 do_div(data.quota, NSEC_PER_USEC);
7895 } 7918 }
7896 7919
7897 rcu_read_lock(); 7920 rcu_read_lock();
7898 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); 7921 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
7899 rcu_read_unlock(); 7922 rcu_read_unlock();
7900 7923
7901 return ret; 7924 return ret;
7902 } 7925 }
7903 7926
7904 static int cpu_stats_show(struct seq_file *sf, void *v) 7927 static int cpu_stats_show(struct seq_file *sf, void *v)
7905 { 7928 {
7906 struct task_group *tg = css_tg(seq_css(sf)); 7929 struct task_group *tg = css_tg(seq_css(sf));
7907 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7930 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7908 7931
7909 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); 7932 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
7910 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); 7933 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
7911 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); 7934 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
7912 7935
7913 return 0; 7936 return 0;
7914 } 7937 }
7915 #endif /* CONFIG_CFS_BANDWIDTH */ 7938 #endif /* CONFIG_CFS_BANDWIDTH */
7916 #endif /* CONFIG_FAIR_GROUP_SCHED */ 7939 #endif /* CONFIG_FAIR_GROUP_SCHED */
7917 7940
7918 #ifdef CONFIG_RT_GROUP_SCHED 7941 #ifdef CONFIG_RT_GROUP_SCHED
7919 static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, 7942 static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
7920 struct cftype *cft, s64 val) 7943 struct cftype *cft, s64 val)
7921 { 7944 {
7922 return sched_group_set_rt_runtime(css_tg(css), val); 7945 return sched_group_set_rt_runtime(css_tg(css), val);
7923 } 7946 }
7924 7947
7925 static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css, 7948 static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
7926 struct cftype *cft) 7949 struct cftype *cft)
7927 { 7950 {
7928 return sched_group_rt_runtime(css_tg(css)); 7951 return sched_group_rt_runtime(css_tg(css));
7929 } 7952 }
7930 7953
7931 static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css, 7954 static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
7932 struct cftype *cftype, u64 rt_period_us) 7955 struct cftype *cftype, u64 rt_period_us)
7933 { 7956 {
7934 return sched_group_set_rt_period(css_tg(css), rt_period_us); 7957 return sched_group_set_rt_period(css_tg(css), rt_period_us);
7935 } 7958 }
7936 7959
7937 static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, 7960 static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
7938 struct cftype *cft) 7961 struct cftype *cft)
7939 { 7962 {
7940 return sched_group_rt_period(css_tg(css)); 7963 return sched_group_rt_period(css_tg(css));
7941 } 7964 }
7942 #endif /* CONFIG_RT_GROUP_SCHED */ 7965 #endif /* CONFIG_RT_GROUP_SCHED */
7943 7966
7944 static struct cftype cpu_files[] = { 7967 static struct cftype cpu_files[] = {
7945 #ifdef CONFIG_FAIR_GROUP_SCHED 7968 #ifdef CONFIG_FAIR_GROUP_SCHED
7946 { 7969 {
7947 .name = "shares", 7970 .name = "shares",
7948 .read_u64 = cpu_shares_read_u64, 7971 .read_u64 = cpu_shares_read_u64,
7949 .write_u64 = cpu_shares_write_u64, 7972 .write_u64 = cpu_shares_write_u64,
7950 }, 7973 },
7951 #endif 7974 #endif
7952 #ifdef CONFIG_CFS_BANDWIDTH 7975 #ifdef CONFIG_CFS_BANDWIDTH
7953 { 7976 {
7954 .name = "cfs_quota_us", 7977 .name = "cfs_quota_us",
7955 .read_s64 = cpu_cfs_quota_read_s64, 7978 .read_s64 = cpu_cfs_quota_read_s64,
7956 .write_s64 = cpu_cfs_quota_write_s64, 7979 .write_s64 = cpu_cfs_quota_write_s64,
7957 }, 7980 },
7958 { 7981 {
7959 .name = "cfs_period_us", 7982 .name = "cfs_period_us",
7960 .read_u64 = cpu_cfs_period_read_u64, 7983 .read_u64 = cpu_cfs_period_read_u64,
7961 .write_u64 = cpu_cfs_period_write_u64, 7984 .write_u64 = cpu_cfs_period_write_u64,
7962 }, 7985 },
7963 { 7986 {
7964 .name = "stat", 7987 .name = "stat",
7965 .seq_show = cpu_stats_show, 7988 .seq_show = cpu_stats_show,
7966 }, 7989 },
7967 #endif 7990 #endif
7968 #ifdef CONFIG_RT_GROUP_SCHED 7991 #ifdef CONFIG_RT_GROUP_SCHED
7969 { 7992 {
7970 .name = "rt_runtime_us", 7993 .name = "rt_runtime_us",
7971 .read_s64 = cpu_rt_runtime_read, 7994 .read_s64 = cpu_rt_runtime_read,
7972 .write_s64 = cpu_rt_runtime_write, 7995 .write_s64 = cpu_rt_runtime_write,
7973 }, 7996 },
7974 { 7997 {
7975 .name = "rt_period_us", 7998 .name = "rt_period_us",
7976 .read_u64 = cpu_rt_period_read_uint, 7999 .read_u64 = cpu_rt_period_read_uint,
7977 .write_u64 = cpu_rt_period_write_uint, 8000 .write_u64 = cpu_rt_period_write_uint,
7978 }, 8001 },
7979 #endif 8002 #endif
7980 { } /* terminate */ 8003 { } /* terminate */
7981 }; 8004 };
7982 8005
7983 struct cgroup_subsys cpu_cgrp_subsys = { 8006 struct cgroup_subsys cpu_cgrp_subsys = {
7984 .css_alloc = cpu_cgroup_css_alloc, 8007 .css_alloc = cpu_cgroup_css_alloc,
7985 .css_free = cpu_cgroup_css_free, 8008 .css_free = cpu_cgroup_css_free,
7986 .css_online = cpu_cgroup_css_online, 8009 .css_online = cpu_cgroup_css_online,
7987 .css_offline = cpu_cgroup_css_offline, 8010 .css_offline = cpu_cgroup_css_offline,
7988 .can_attach = cpu_cgroup_can_attach, 8011 .can_attach = cpu_cgroup_can_attach,
7989 .attach = cpu_cgroup_attach, 8012 .attach = cpu_cgroup_attach,
7990 .exit = cpu_cgroup_exit, 8013 .exit = cpu_cgroup_exit,
7991 .base_cftypes = cpu_files, 8014 .base_cftypes = cpu_files,
7992 .early_init = 1, 8015 .early_init = 1,
7993 }; 8016 };
7994 8017
7995 #endif /* CONFIG_CGROUP_SCHED */ 8018 #endif /* CONFIG_CGROUP_SCHED */
7996 8019
7997 void dump_cpu_task(int cpu) 8020 void dump_cpu_task(int cpu)
7998 { 8021 {
kernel/sched/cpudeadline.c
1 /* 1 /*
2 * kernel/sched/cpudl.c 2 * kernel/sched/cpudl.c
3 * 3 *
4 * Global CPU deadline management 4 * Global CPU deadline management
5 * 5 *
6 * Author: Juri Lelli <j.lelli@sssup.it> 6 * Author: Juri Lelli <j.lelli@sssup.it>
7 * 7 *
8 * This program is free software; you can redistribute it and/or 8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License 9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; version 2 10 * as published by the Free Software Foundation; version 2
11 * of the License. 11 * of the License.
12 */ 12 */
13 13
14 #include <linux/gfp.h> 14 #include <linux/gfp.h>
15 #include <linux/kernel.h> 15 #include <linux/kernel.h>
16 #include <linux/slab.h>
16 #include "cpudeadline.h" 17 #include "cpudeadline.h"
17 18
18 static inline int parent(int i) 19 static inline int parent(int i)
19 { 20 {
20 return (i - 1) >> 1; 21 return (i - 1) >> 1;
21 } 22 }
22 23
23 static inline int left_child(int i) 24 static inline int left_child(int i)
24 { 25 {
25 return (i << 1) + 1; 26 return (i << 1) + 1;
26 } 27 }
27 28
28 static inline int right_child(int i) 29 static inline int right_child(int i)
29 { 30 {
30 return (i << 1) + 2; 31 return (i << 1) + 2;
31 } 32 }
32 33
33 static inline int dl_time_before(u64 a, u64 b) 34 static inline int dl_time_before(u64 a, u64 b)
34 { 35 {
35 return (s64)(a - b) < 0; 36 return (s64)(a - b) < 0;
36 } 37 }
37 38
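dl_time_before() compares two u64 deadlines by casting their difference to s64, so the ordering stays correct even across a wraparound of the absolute clock, as long as the two deadlines are within 2^63 ns of each other. A quick demonstration of why the signed-difference form works where a plain `a < b` would not:

    #include <stdint.h>
    #include <stdio.h>

    static int dl_time_before(uint64_t a, uint64_t b)
    {
            return (int64_t)(a - b) < 0;
    }

    int main(void)
    {
            uint64_t near_wrap = UINT64_MAX - 100;  /* deadline just before wrap */
            uint64_t wrapped   = 50;                /* deadline just after wrap */

            /* plain comparison gets the order wrong across the wrap point */
            printf("naive: %d\n", near_wrap < wrapped);                  /* 0 */
            /* signed-difference comparison still sees near_wrap as earlier */
            printf("dl:    %d\n", dl_time_before(near_wrap, wrapped));   /* 1 */
            return 0;
    }
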
38 static void cpudl_exchange(struct cpudl *cp, int a, int b) 39 static void cpudl_exchange(struct cpudl *cp, int a, int b)
39 { 40 {
40 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; 41 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
41 42
42 swap(cp->elements[a], cp->elements[b]); 43 swap(cp->elements[a].cpu, cp->elements[b].cpu);
43 swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); 44 swap(cp->elements[a].dl , cp->elements[b].dl );
45
46 swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
44 } 47 }
45 48
46 static void cpudl_heapify(struct cpudl *cp, int idx) 49 static void cpudl_heapify(struct cpudl *cp, int idx)
47 { 50 {
48 int l, r, largest; 51 int l, r, largest;
49 52
50 /* adapted from lib/prio_heap.c */ 53 /* adapted from lib/prio_heap.c */
51 while(1) { 54 while(1) {
52 l = left_child(idx); 55 l = left_child(idx);
53 r = right_child(idx); 56 r = right_child(idx);
54 largest = idx; 57 largest = idx;
55 58
56 if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, 59 if ((l < cp->size) && dl_time_before(cp->elements[idx].dl,
57 cp->elements[l].dl)) 60 cp->elements[l].dl))
58 largest = l; 61 largest = l;
59 if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, 62 if ((r < cp->size) && dl_time_before(cp->elements[largest].dl,
60 cp->elements[r].dl)) 63 cp->elements[r].dl))
61 largest = r; 64 largest = r;
62 if (largest == idx) 65 if (largest == idx)
63 break; 66 break;
64 67
65 /* Push idx down the heap one level and bump one up */ 68 /* Push idx down the heap one level and bump one up */
66 cpudl_exchange(cp, largest, idx); 69 cpudl_exchange(cp, largest, idx);
67 idx = largest; 70 idx = largest;
68 } 71 }
69 } 72 }
70 73
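cpudl keeps the per-CPU earliest deadlines in an array-backed max heap: parent/left_child/right_child are the usual index formulas, cpudl_heapify() sifts an entry down toward the leaves, and cpudl_change_key()/cpudl_set() sift up when a key grows or when an element is removed. Below is a minimal stand-alone heap over {deadline, cpu} pairs with the same index math, kept deliberately apart from the kernel types (no locking, no free_cpus mask, no per-CPU index table):

    #include <stdint.h>
    #include <stdio.h>

    struct item { uint64_t dl; int cpu; };

    static int parent(int i)      { return (i - 1) >> 1; }
    static int left_child(int i)  { return (i << 1) + 1; }
    static int right_child(int i) { return (i << 1) + 2; }

    static void swap_items(struct item *a, struct item *b)
    {
            struct item t = *a; *a = *b; *b = t;
    }

    /* sift the entry at idx down until the max-heap property holds again */
    static void heapify(struct item *h, int size, int idx)
    {
            for (;;) {
                    int l = left_child(idx), r = right_child(idx), largest = idx;

                    if (l < size && h[l].dl > h[largest].dl)
                            largest = l;
                    if (r < size && h[r].dl > h[largest].dl)
                            largest = r;
                    if (largest == idx)
                            break;
                    swap_items(&h[largest], &h[idx]);
                    idx = largest;
            }
    }

    /* append an entry and sift it up toward the root */
    static void push(struct item *h, int *size, uint64_t dl, int cpu)
    {
            int i = (*size)++;

            h[i].dl = dl;
            h[i].cpu = cpu;
            while (i > 0 && h[parent(i)].dl < h[i].dl) {
                    swap_items(&h[parent(i)], &h[i]);
                    i = parent(i);
            }
    }

    int main(void)
    {
            struct item heap[8];
            int size = 0;

            push(heap, &size, 300, 0);
            push(heap, &size, 900, 1);
            push(heap, &size, 500, 2);

            /* the root always holds the CPU with the latest deadline */
            printf("latest deadline on cpu %d (dl=%llu)\n",
                   heap[0].cpu, (unsigned long long)heap[0].dl);

            /* removing the root mimics cpudl_set(..., is_valid=0): move the
             * last element into the hole, then restore the heap property */
            heap[0] = heap[--size];
            heapify(heap, size, 0);
            printf("next latest on cpu %d (dl=%llu)\n",
                   heap[0].cpu, (unsigned long long)heap[0].dl);
            return 0;
    }
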
71 static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) 74 static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
72 { 75 {
73 WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); 76 WARN_ON(idx == IDX_INVALID || !cpu_present(idx));
74 77
75 if (dl_time_before(new_dl, cp->elements[idx].dl)) { 78 if (dl_time_before(new_dl, cp->elements[idx].dl)) {
76 cp->elements[idx].dl = new_dl; 79 cp->elements[idx].dl = new_dl;
77 cpudl_heapify(cp, idx); 80 cpudl_heapify(cp, idx);
78 } else { 81 } else {
79 cp->elements[idx].dl = new_dl; 82 cp->elements[idx].dl = new_dl;
80 while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, 83 while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
81 cp->elements[idx].dl)) { 84 cp->elements[idx].dl)) {
82 cpudl_exchange(cp, idx, parent(idx)); 85 cpudl_exchange(cp, idx, parent(idx));
83 idx = parent(idx); 86 idx = parent(idx);
84 } 87 }
85 } 88 }
86 } 89 }
87 90
88 static inline int cpudl_maximum(struct cpudl *cp) 91 static inline int cpudl_maximum(struct cpudl *cp)
89 { 92 {
90 return cp->elements[0].cpu; 93 return cp->elements[0].cpu;
91 } 94 }
92 95
93 /* 96 /*
94 * cpudl_find - find the best (later-dl) CPU in the system 97 * cpudl_find - find the best (later-dl) CPU in the system
95 * @cp: the cpudl max-heap context 98 * @cp: the cpudl max-heap context
96 * @p: the task 99 * @p: the task
97 * @later_mask: a mask to fill in with the selected CPUs (or NULL) 100 * @later_mask: a mask to fill in with the selected CPUs (or NULL)
98 * 101 *
99 * Returns: int - best CPU (heap maximum if suitable) 102 * Returns: int - best CPU (heap maximum if suitable)
100 */ 103 */
101 int cpudl_find(struct cpudl *cp, struct task_struct *p, 104 int cpudl_find(struct cpudl *cp, struct task_struct *p,
102 struct cpumask *later_mask) 105 struct cpumask *later_mask)
103 { 106 {
104 int best_cpu = -1; 107 int best_cpu = -1;
105 const struct sched_dl_entity *dl_se = &p->dl; 108 const struct sched_dl_entity *dl_se = &p->dl;
106 109
107 if (later_mask && cpumask_and(later_mask, cp->free_cpus, 110 if (later_mask && cpumask_and(later_mask, cp->free_cpus,
108 &p->cpus_allowed) && cpumask_and(later_mask, 111 &p->cpus_allowed) && cpumask_and(later_mask,
109 later_mask, cpu_active_mask)) { 112 later_mask, cpu_active_mask)) {
110 best_cpu = cpumask_any(later_mask); 113 best_cpu = cpumask_any(later_mask);
111 goto out; 114 goto out;
112 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && 115 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
113 dl_time_before(dl_se->deadline, cp->elements[0].dl)) { 116 dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
114 best_cpu = cpudl_maximum(cp); 117 best_cpu = cpudl_maximum(cp);
115 if (later_mask) 118 if (later_mask)
116 cpumask_set_cpu(best_cpu, later_mask); 119 cpumask_set_cpu(best_cpu, later_mask);
117 } 120 }
118 121
119 out: 122 out:
120 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); 123 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
121 124
122 return best_cpu; 125 return best_cpu;
123 } 126 }
124 127
125 /* 128 /*
126 * cpudl_set - update the cpudl max-heap 129 * cpudl_set - update the cpudl max-heap
127 * @cp: the cpudl max-heap context 130 * @cp: the cpudl max-heap context
128 * @cpu: the target cpu 131 * @cpu: the target cpu
129 * @dl: the new earliest deadline for this cpu 132 * @dl: the new earliest deadline for this cpu
130 * 133 *
131 * Notes: assumes cpu_rq(cpu)->lock is locked 134 * Notes: assumes cpu_rq(cpu)->lock is locked
132 * 135 *
133 * Returns: (void) 136 * Returns: (void)
134 */ 137 */
135 void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) 138 void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
136 { 139 {
137 int old_idx, new_cpu; 140 int old_idx, new_cpu;
138 unsigned long flags; 141 unsigned long flags;
139 142
140 WARN_ON(!cpu_present(cpu)); 143 WARN_ON(!cpu_present(cpu));
141 144
142 raw_spin_lock_irqsave(&cp->lock, flags); 145 raw_spin_lock_irqsave(&cp->lock, flags);
143 old_idx = cp->cpu_to_idx[cpu]; 146 old_idx = cp->elements[cpu].idx;
144 if (!is_valid) { 147 if (!is_valid) {
145 /* remove item */ 148 /* remove item */
146 if (old_idx == IDX_INVALID) { 149 if (old_idx == IDX_INVALID) {
147 /* 150 /*
148 * Nothing to remove if old_idx was invalid. 151 * Nothing to remove if old_idx was invalid.
149 * This could happen if a rq_offline_dl is 152 * This could happen if a rq_offline_dl is
150 * called for a CPU without -dl tasks running. 153 * called for a CPU without -dl tasks running.
151 */ 154 */
152 goto out; 155 goto out;
153 } 156 }
154 new_cpu = cp->elements[cp->size - 1].cpu; 157 new_cpu = cp->elements[cp->size - 1].cpu;
155 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; 158 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
156 cp->elements[old_idx].cpu = new_cpu; 159 cp->elements[old_idx].cpu = new_cpu;
157 cp->size--; 160 cp->size--;
158 cp->cpu_to_idx[new_cpu] = old_idx; 161 cp->elements[new_cpu].idx = old_idx;
159 cp->cpu_to_idx[cpu] = IDX_INVALID; 162 cp->elements[cpu].idx = IDX_INVALID;
160 while (old_idx > 0 && dl_time_before( 163 while (old_idx > 0 && dl_time_before(
161 cp->elements[parent(old_idx)].dl, 164 cp->elements[parent(old_idx)].dl,
162 cp->elements[old_idx].dl)) { 165 cp->elements[old_idx].dl)) {
163 cpudl_exchange(cp, old_idx, parent(old_idx)); 166 cpudl_exchange(cp, old_idx, parent(old_idx));
164 old_idx = parent(old_idx); 167 old_idx = parent(old_idx);
165 } 168 }
166 cpumask_set_cpu(cpu, cp->free_cpus); 169 cpumask_set_cpu(cpu, cp->free_cpus);
167 cpudl_heapify(cp, old_idx); 170 cpudl_heapify(cp, old_idx);
168 171
169 goto out; 172 goto out;
170 } 173 }
171 174
172 if (old_idx == IDX_INVALID) { 175 if (old_idx == IDX_INVALID) {
173 cp->size++; 176 cp->size++;
174 cp->elements[cp->size - 1].dl = 0; 177 cp->elements[cp->size - 1].dl = 0;
175 cp->elements[cp->size - 1].cpu = cpu; 178 cp->elements[cp->size - 1].cpu = cpu;
176 cp->cpu_to_idx[cpu] = cp->size - 1; 179 cp->elements[cpu].idx = cp->size - 1;
177 cpudl_change_key(cp, cp->size - 1, dl); 180 cpudl_change_key(cp, cp->size - 1, dl);
178 cpumask_clear_cpu(cpu, cp->free_cpus); 181 cpumask_clear_cpu(cpu, cp->free_cpus);
179 } else { 182 } else {
180 cpudl_change_key(cp, old_idx, dl); 183 cpudl_change_key(cp, old_idx, dl);
181 } 184 }
182 185
183 out: 186 out:
184 raw_spin_unlock_irqrestore(&cp->lock, flags); 187 raw_spin_unlock_irqrestore(&cp->lock, flags);
185 } 188 }
186 189
187 /* 190 /*
188 * cpudl_init - initialize the cpudl structure 191 * cpudl_init - initialize the cpudl structure
189 * @cp: the cpudl max-heap context 192 * @cp: the cpudl max-heap context
190 */ 193 */
191 int cpudl_init(struct cpudl *cp) 194 int cpudl_init(struct cpudl *cp)
192 { 195 {
193 int i; 196 int i;
194 197
195 memset(cp, 0, sizeof(*cp)); 198 memset(cp, 0, sizeof(*cp));
196 raw_spin_lock_init(&cp->lock); 199 raw_spin_lock_init(&cp->lock);
197 cp->size = 0; 200 cp->size = 0;
198 for (i = 0; i < NR_CPUS; i++) 201
199 cp->cpu_to_idx[i] = IDX_INVALID; 202 cp->elements = kcalloc(nr_cpu_ids,
200 if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) 203 sizeof(struct cpudl_item),
204 GFP_KERNEL);
205 if (!cp->elements)
201 return -ENOMEM; 206 return -ENOMEM;
207
208 if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
209 kfree(cp->elements);
210 return -ENOMEM;
211 }
212
213 for_each_possible_cpu(i)
214 cp->elements[i].idx = IDX_INVALID;
215
202 cpumask_setall(cp->free_cpus); 216 cpumask_setall(cp->free_cpus);
203 217
204 return 0; 218 return 0;
205 } 219 }
206 220
207 /* 221 /*
208 * cpudl_cleanup - clean up the cpudl structure 222 * cpudl_cleanup - clean up the cpudl structure
209 * @cp: the cpudl max-heap context 223 * @cp: the cpudl max-heap context
210 */ 224 */
211 void cpudl_cleanup(struct cpudl *cp) 225 void cpudl_cleanup(struct cpudl *cp)
212 { 226 {
213 free_cpumask_var(cp->free_cpus); 227 free_cpumask_var(cp->free_cpus);
228 kfree(cp->elements);
214 } 229 }
215 230
kernel/sched/cpudeadline.h
1 #ifndef _LINUX_CPUDL_H 1 #ifndef _LINUX_CPUDL_H
2 #define _LINUX_CPUDL_H 2 #define _LINUX_CPUDL_H
3 3
4 #include <linux/sched.h> 4 #include <linux/sched.h>
5 5
6 #define IDX_INVALID -1 6 #define IDX_INVALID -1
7 7
8 struct array_item { 8 struct cpudl_item {
9 u64 dl; 9 u64 dl;
10 int cpu; 10 int cpu;
11 int idx;
11 }; 12 };
12 13
13 struct cpudl { 14 struct cpudl {
14 raw_spinlock_t lock; 15 raw_spinlock_t lock;
15 int size; 16 int size;
16 int cpu_to_idx[NR_CPUS];
17 struct array_item elements[NR_CPUS];
18 cpumask_var_t free_cpus; 17 cpumask_var_t free_cpus;
18 struct cpudl_item *elements;
19 }; 19 };
20 20
21 21
22 #ifdef CONFIG_SMP 22 #ifdef CONFIG_SMP
23 int cpudl_find(struct cpudl *cp, struct task_struct *p, 23 int cpudl_find(struct cpudl *cp, struct task_struct *p,
24 struct cpumask *later_mask); 24 struct cpumask *later_mask);
25 void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); 25 void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
26 int cpudl_init(struct cpudl *cp); 26 int cpudl_init(struct cpudl *cp);
27 void cpudl_cleanup(struct cpudl *cp); 27 void cpudl_cleanup(struct cpudl *cp);
28 #else 28 #else
29 #define cpudl_set(cp, cpu, dl) do { } while (0) 29 #define cpudl_set(cp, cpu, dl) do { } while (0)
30 #define cpudl_init() do { } while (0) 30 #define cpudl_init() do { } while (0)
31 #endif /* CONFIG_SMP */ 31 #endif /* CONFIG_SMP */
32 32
kernel/sched/cpupri.c
1 /* 1 /*
2 * kernel/sched/cpupri.c 2 * kernel/sched/cpupri.c
3 * 3 *
4 * CPU priority management 4 * CPU priority management
5 * 5 *
6 * Copyright (C) 2007-2008 Novell 6 * Copyright (C) 2007-2008 Novell
7 * 7 *
8 * Author: Gregory Haskins <ghaskins@novell.com> 8 * Author: Gregory Haskins <ghaskins@novell.com>
9 * 9 *
10 * This code tracks the priority of each CPU so that global migration 10 * This code tracks the priority of each CPU so that global migration
11 * decisions are easy to calculate. Each CPU can be in a state as follows: 11 * decisions are easy to calculate. Each CPU can be in a state as follows:
12 * 12 *
13 * (INVALID), IDLE, NORMAL, RT1, ... RT99 13 * (INVALID), IDLE, NORMAL, RT1, ... RT99
14 * 14 *
15 * going from the lowest priority to the highest. CPUs in the INVALID state 15 * going from the lowest priority to the highest. CPUs in the INVALID state
16 * are not eligible for routing. The system maintains this state with 16 * are not eligible for routing. The system maintains this state with
17 * a 2 dimensional bitmap (the first for priority class, the second for cpus 17 * a 2 dimensional bitmap (the first for priority class, the second for cpus
18 * in that class). Therefore a typical application without affinity 18 * in that class). Therefore a typical application without affinity
19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit 19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
20 * searches). For tasks with affinity restrictions, the algorithm has a 20 * searches). For tasks with affinity restrictions, the algorithm has a
21 * worst case complexity of O(min(102, nr_domcpus)), though the scenario that 21 * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
22 * yields the worst case search is fairly contrived. 22 * yields the worst case search is fairly contrived.
23 * 23 *
24 * This program is free software; you can redistribute it and/or 24 * This program is free software; you can redistribute it and/or
25 * modify it under the terms of the GNU General Public License 25 * modify it under the terms of the GNU General Public License
26 * as published by the Free Software Foundation; version 2 26 * as published by the Free Software Foundation; version 2
27 * of the License. 27 * of the License.
28 */ 28 */
29 29
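The header comment above describes cpupri's core structure: one vector per priority level, each holding an occupancy count plus a mask of the CPUs currently running at that level, searched from the lowest level upward. A compact stand-alone model of that lookup, using a plain uint64_t in place of cpumask_var_t and ignoring the memory-ordering details handled later in this file:

    #include <stdint.h>
    #include <stdio.h>

    #define NR_LEVELS 102   /* IDLE, NORMAL, plus one level per RT priority */

    struct vec {
            int count;        /* how many CPUs sit at this priority */
            uint64_t mask;    /* which CPUs (one bit per CPU, up to 64 here) */
    };

    static struct vec pri_to_cpu[NR_LEVELS];

    static void set_cpu_pri(int cpu, int pri)
    {
            pri_to_cpu[pri].mask |= 1ULL << cpu;
            pri_to_cpu[pri].count++;
    }

    /*
     * Find a CPU running at a priority strictly below task_pri that also
     * intersects the task's affinity mask; mirrors the cpupri_find() scan.
     */
    static int find_lower(int task_pri, uint64_t affinity, uint64_t *lowest)
    {
            for (int idx = 0; idx < task_pri; idx++) {
                    if (!pri_to_cpu[idx].count)
                            continue;
                    uint64_t hit = pri_to_cpu[idx].mask & affinity;
                    if (!hit)
                            continue;
                    *lowest = hit;
                    return 1;
            }
            return 0;
    }

    int main(void)
    {
            uint64_t lowest = 0;

            set_cpu_pri(0, 1);   /* cpu0 running a normal task */
            set_cpu_pri(1, 50);  /* cpu1 running an RT task    */

            /* an RT task at level 60 may preempt anything below 60 */
            if (find_lower(60, 0x3, &lowest))
                    printf("candidate CPUs: 0x%llx\n", (unsigned long long)lowest);
            return 0;
    }
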
30 #include <linux/gfp.h> 30 #include <linux/gfp.h>
31 #include <linux/sched.h> 31 #include <linux/sched.h>
32 #include <linux/sched/rt.h> 32 #include <linux/sched/rt.h>
33 #include <linux/slab.h>
33 #include "cpupri.h" 34 #include "cpupri.h"
34 35
35 /* Convert between a 140-based task->prio and our 102-based cpupri */ 36 /* Convert between a 140-based task->prio and our 102-based cpupri */
36 static int convert_prio(int prio) 37 static int convert_prio(int prio)
37 { 38 {
38 int cpupri; 39 int cpupri;
39 40
40 if (prio == CPUPRI_INVALID) 41 if (prio == CPUPRI_INVALID)
41 cpupri = CPUPRI_INVALID; 42 cpupri = CPUPRI_INVALID;
42 else if (prio == MAX_PRIO) 43 else if (prio == MAX_PRIO)
43 cpupri = CPUPRI_IDLE; 44 cpupri = CPUPRI_IDLE;
44 else if (prio >= MAX_RT_PRIO) 45 else if (prio >= MAX_RT_PRIO)
45 cpupri = CPUPRI_NORMAL; 46 cpupri = CPUPRI_NORMAL;
46 else 47 else
47 cpupri = MAX_RT_PRIO - prio + 1; 48 cpupri = MAX_RT_PRIO - prio + 1;
48 49
49 return cpupri; 50 return cpupri;
50 } 51 }
51 52
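convert_prio() folds the kernel's 140-slot prio space into the 102 cpupri levels: MAX_PRIO becomes IDLE, any non-RT prio becomes NORMAL, and RT prios 0..99 map (inverted) onto levels 101..2, so a numerically smaller task->prio lands on a higher cpupri level. A few spot checks of that mapping, with MAX_PRIO/MAX_RT_PRIO hard-coded to their usual values (140 and 100) so the example stands alone:

    #include <assert.h>
    #include <stdio.h>

    #define MAX_RT_PRIO     100     /* usual kernel values, hard-coded here */
    #define MAX_PRIO        140
    #define CPUPRI_INVALID  -1
    #define CPUPRI_IDLE     0
    #define CPUPRI_NORMAL   1

    static int convert_prio(int prio)
    {
            if (prio == CPUPRI_INVALID)
                    return CPUPRI_INVALID;
            if (prio == MAX_PRIO)
                    return CPUPRI_IDLE;
            if (prio >= MAX_RT_PRIO)
                    return CPUPRI_NORMAL;
            return MAX_RT_PRIO - prio + 1;
    }

    int main(void)
    {
            assert(convert_prio(0)   == 101);           /* highest RT prio  */
            assert(convert_prio(99)  == 2);             /* lowest RT prio   */
            assert(convert_prio(120) == CPUPRI_NORMAL); /* default CFS prio */
            assert(convert_prio(MAX_PRIO) == CPUPRI_IDLE);
            printf("all mappings as expected\n");
            return 0;
    }
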
52 /** 53 /**
53 * cpupri_find - find the best (lowest-pri) CPU in the system 54 * cpupri_find - find the best (lowest-pri) CPU in the system
54 * @cp: The cpupri context 55 * @cp: The cpupri context
55 * @p: The task 56 * @p: The task
56 * @lowest_mask: A mask to fill in with selected CPUs (or NULL) 57 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
57 * 58 *
58 * Note: This function returns the recommended CPUs as calculated during the 59 * Note: This function returns the recommended CPUs as calculated during the
59 * current invocation. By the time the call returns, the CPUs may have in 60 * current invocation. By the time the call returns, the CPUs may have in
60 * fact changed priorities any number of times. While not ideal, it is not 61 * fact changed priorities any number of times. While not ideal, it is not
61 * an issue of correctness since the normal rebalancer logic will correct 62 * an issue of correctness since the normal rebalancer logic will correct
62 * any discrepancies created by racing against the uncertainty of the current 63 * any discrepancies created by racing against the uncertainty of the current
63 * priority configuration. 64 * priority configuration.
64 * 65 *
65 * Return: (int)bool - CPUs were found 66 * Return: (int)bool - CPUs were found
66 */ 67 */
67 int cpupri_find(struct cpupri *cp, struct task_struct *p, 68 int cpupri_find(struct cpupri *cp, struct task_struct *p,
68 struct cpumask *lowest_mask) 69 struct cpumask *lowest_mask)
69 { 70 {
70 int idx = 0; 71 int idx = 0;
71 int task_pri = convert_prio(p->prio); 72 int task_pri = convert_prio(p->prio);
72 73
73 BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); 74 BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
74 75
75 for (idx = 0; idx < task_pri; idx++) { 76 for (idx = 0; idx < task_pri; idx++) {
76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 77 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
77 int skip = 0; 78 int skip = 0;
78 79
79 if (!atomic_read(&(vec)->count)) 80 if (!atomic_read(&(vec)->count))
80 skip = 1; 81 skip = 1;
81 /* 82 /*
82 * When looking at the vector, we need to read the counter, 83 * When looking at the vector, we need to read the counter,
83 * do a memory barrier, then read the mask. 84 * do a memory barrier, then read the mask.
84 * 85 *
85 * Note: This is still all racy, but we can deal with it. 86 * Note: This is still all racy, but we can deal with it.
86 * Ideally, we only want to look at masks that are set. 87 * Ideally, we only want to look at masks that are set.
87 * 88 *
88 * If a mask is not set, then the only thing wrong is that we 89 * If a mask is not set, then the only thing wrong is that we
89 * did a little more work than necessary. 90 * did a little more work than necessary.
90 * 91 *
91 * If we read a zero count but the mask is set, because of the 92 * If we read a zero count but the mask is set, because of the
92 * memory barriers, that can only happen when the highest prio 93 * memory barriers, that can only happen when the highest prio
93 * task for a run queue has left the run queue, in which case, 94 * task for a run queue has left the run queue, in which case,
94 * it will be followed by a pull. If the task we are processing 95 * it will be followed by a pull. If the task we are processing
95 * fails to find a proper place to go, that pull request will 96 * fails to find a proper place to go, that pull request will
96 * pull this task if the run queue is running at a lower 97 * pull this task if the run queue is running at a lower
97 * priority. 98 * priority.
98 */ 99 */
99 smp_rmb(); 100 smp_rmb();
100 101
101 /* Need to do the rmb for every iteration */ 102 /* Need to do the rmb for every iteration */
102 if (skip) 103 if (skip)
103 continue; 104 continue;
104 105
105 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 106 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
106 continue; 107 continue;
107 108
108 if (lowest_mask) { 109 if (lowest_mask) {
109 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); 110 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
110 111
111 /* 112 /*
112 * We have to ensure that we have at least one bit 113 * We have to ensure that we have at least one bit
113 * still set in the array, since the map could have 114 * still set in the array, since the map could have
114 * been concurrently emptied between the first and 115 * been concurrently emptied between the first and
115 * second reads of vec->mask. If we hit this 116 * second reads of vec->mask. If we hit this
116 * condition, simply act as though we never hit this 117 * condition, simply act as though we never hit this
117 * priority level and continue on. 118 * priority level and continue on.
118 */ 119 */
119 if (cpumask_any(lowest_mask) >= nr_cpu_ids) 120 if (cpumask_any(lowest_mask) >= nr_cpu_ids)
120 continue; 121 continue;
121 } 122 }
122 123
123 return 1; 124 return 1;
124 } 125 }
125 126
126 return 0; 127 return 0;
127 } 128 }
128 129
129 /** 130 /**
130 * cpupri_set - update the cpu priority setting 131 * cpupri_set - update the cpu priority setting
131 * @cp: The cpupri context 132 * @cp: The cpupri context
132 * @cpu: The target cpu 133 * @cpu: The target cpu
133 * @newpri: The priority (INVALID-RT99) to assign to this CPU 134 * @newpri: The priority (INVALID-RT99) to assign to this CPU
134 * 135 *
135 * Note: Assumes cpu_rq(cpu)->lock is locked 136 * Note: Assumes cpu_rq(cpu)->lock is locked
136 * 137 *
137 * Returns: (void) 138 * Returns: (void)
138 */ 139 */
139 void cpupri_set(struct cpupri *cp, int cpu, int newpri) 140 void cpupri_set(struct cpupri *cp, int cpu, int newpri)
140 { 141 {
141 int *currpri = &cp->cpu_to_pri[cpu]; 142 int *currpri = &cp->cpu_to_pri[cpu];
142 int oldpri = *currpri; 143 int oldpri = *currpri;
143 int do_mb = 0; 144 int do_mb = 0;
144 145
145 newpri = convert_prio(newpri); 146 newpri = convert_prio(newpri);
146 147
147 BUG_ON(newpri >= CPUPRI_NR_PRIORITIES); 148 BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
148 149
149 if (newpri == oldpri) 150 if (newpri == oldpri)
150 return; 151 return;
151 152
152 /* 153 /*
153 * If the cpu was currently mapped to a different value, we 154 * If the cpu was currently mapped to a different value, we
154 * need to map it to the new value then remove the old value. 155 * need to map it to the new value then remove the old value.
155 * Note, we must add the new value first, otherwise we risk the 156 * Note, we must add the new value first, otherwise we risk the
156 * cpu being missed by the priority loop in cpupri_find. 157 * cpu being missed by the priority loop in cpupri_find.
157 */ 158 */
158 if (likely(newpri != CPUPRI_INVALID)) { 159 if (likely(newpri != CPUPRI_INVALID)) {
159 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 160 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
160 161
161 cpumask_set_cpu(cpu, vec->mask); 162 cpumask_set_cpu(cpu, vec->mask);
162 /* 163 /*
163 * When adding a new vector, we update the mask first, 164 * When adding a new vector, we update the mask first,
164 * do a write memory barrier, and then update the count, to 165 * do a write memory barrier, and then update the count, to
165 * make sure the vector is visible when count is set. 166 * make sure the vector is visible when count is set.
166 */ 167 */
167 smp_mb__before_atomic_inc(); 168 smp_mb__before_atomic_inc();
168 atomic_inc(&(vec)->count); 169 atomic_inc(&(vec)->count);
169 do_mb = 1; 170 do_mb = 1;
170 } 171 }
171 if (likely(oldpri != CPUPRI_INVALID)) { 172 if (likely(oldpri != CPUPRI_INVALID)) {
172 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; 173 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
173 174
174 /* 175 /*
175 * Because the order of modification of the vec->count 176 * Because the order of modification of the vec->count
176 * is important, we must make sure that the update 177 * is important, we must make sure that the update
177 * of the new prio is seen before we decrement the 178 * of the new prio is seen before we decrement the
178 * old prio. This makes sure that the loop sees 179 * old prio. This makes sure that the loop sees
179 * one or the other when we raise the priority of 180 * one or the other when we raise the priority of
180 * the run queue. We don't care about when we lower the 181 * the run queue. We don't care about when we lower the
181 * priority, as that will trigger an rt pull anyway. 182 * priority, as that will trigger an rt pull anyway.
182 * 183 *
183 * We only need to do a memory barrier if we updated 184 * We only need to do a memory barrier if we updated
184 * the new priority vec. 185 * the new priority vec.
185 */ 186 */
186 if (do_mb) 187 if (do_mb)
187 smp_mb__after_atomic_inc(); 188 smp_mb__after_atomic_inc();
188 189
189 /* 190 /*
190 * When removing from the vector, we decrement the counter first, 191 * When removing from the vector, we decrement the counter first,
191 * do a memory barrier and then clear the mask. 192 * do a memory barrier and then clear the mask.
192 */ 193 */
193 atomic_dec(&(vec)->count); 194 atomic_dec(&(vec)->count);
194 smp_mb__after_atomic_inc(); 195 smp_mb__after_atomic_inc();
195 cpumask_clear_cpu(cpu, vec->mask); 196 cpumask_clear_cpu(cpu, vec->mask);
196 } 197 }
197 198
198 *currpri = newpri; 199 *currpri = newpri;
199 } 200 }
200 201
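The barrier choreography in cpupri_set() is easiest to follow next to the reader it pairs with. The sketch below is not the kernel's cpupri_find() from earlier in this file, only a simplified illustration of the pairing; the name cpupri_find_sketch() and the reduced body are assumptions.

/*
 * Simplified, illustrative reader paired with the barriers in
 * cpupri_set() above.  Not the real cpupri_find(); treat the body as
 * a sketch of why the mask must be published before the count.
 */
static int cpupri_find_sketch(struct cpupri *cp, struct task_struct *p,
			      struct cpumask *lowest_mask)
{
	int idx, task_pri = convert_prio(p->prio);

	for (idx = 0; idx < task_pri; idx++) {
		struct cpupri_vec *vec = &cp->pri_to_cpu[idx];

		/* Skip empty vectors without touching the mask. */
		if (!atomic_read(&vec->count))
			continue;
		/*
		 * Pairs with smp_mb__before_atomic_inc() in cpupri_set():
		 * once a non-zero count is observed, the cpumask_set_cpu()
		 * that preceded the increment must also be visible.
		 */
		smp_rmb();

		if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
			continue;

		if (lowest_mask)
			cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
		return 1;
	}
	return 0;
}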
201 /** 202 /**
202 * cpupri_init - initialize the cpupri structure 203 * cpupri_init - initialize the cpupri structure
203 * @cp: The cpupri context 204 * @cp: The cpupri context
204 * 205 *
205 * Return: -ENOMEM on memory allocation failure. 206 * Return: -ENOMEM on memory allocation failure.
206 */ 207 */
207 int cpupri_init(struct cpupri *cp) 208 int cpupri_init(struct cpupri *cp)
208 { 209 {
209 int i; 210 int i;
210 211
211 memset(cp, 0, sizeof(*cp)); 212 memset(cp, 0, sizeof(*cp));
212 213
213 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 214 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
214 struct cpupri_vec *vec = &cp->pri_to_cpu[i]; 215 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
215 216
216 atomic_set(&vec->count, 0); 217 atomic_set(&vec->count, 0);
217 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) 218 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
218 goto cleanup; 219 goto cleanup;
219 } 220 }
220 221
222 cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
223 if (!cp->cpu_to_pri)
224 goto cleanup;
225
221 for_each_possible_cpu(i) 226 for_each_possible_cpu(i)
222 cp->cpu_to_pri[i] = CPUPRI_INVALID; 227 cp->cpu_to_pri[i] = CPUPRI_INVALID;
228
223 return 0; 229 return 0;
224 230
225 cleanup: 231 cleanup:
226 for (i--; i >= 0; i--) 232 for (i--; i >= 0; i--)
227 free_cpumask_var(cp->pri_to_cpu[i].mask); 233 free_cpumask_var(cp->pri_to_cpu[i].mask);
228 return -ENOMEM; 234 return -ENOMEM;
229 } 235 }
230 236
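The kcalloc() of nr_cpu_ids integers added above is the substance of the "sched/cpupri: Replace NR_CPUS arrays" change: the per-CPU mapping is now sized by the number of possible CPU ids detected at boot rather than by the compile-time NR_CPUS maximum. A self-contained sketch of the same pattern follows; "struct foo" and its members are invented for illustration and do not appear in this file.

/*
 * Hypothetical illustration of the nr_cpu_ids-sized-allocation pattern
 * used by cpupri_init() above.
 */
#include <linux/cpumask.h>	/* nr_cpu_ids, for_each_possible_cpu() */
#include <linux/errno.h>	/* -ENOMEM */
#include <linux/slab.h>		/* kcalloc(), kfree() */

struct foo {
	int *per_cpu_val;	/* was: int per_cpu_val[NR_CPUS]; */
};

static int foo_init(struct foo *f)
{
	int cpu;

	/* One int per possible CPU id, zero-initialized. */
	f->per_cpu_val = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
	if (!f->per_cpu_val)
		return -ENOMEM;

	/* Mark every possible CPU as "no value yet". */
	for_each_possible_cpu(cpu)
		f->per_cpu_val[cpu] = -1;

	return 0;
}

static void foo_cleanup(struct foo *f)
{
	kfree(f->per_cpu_val);
	f->per_cpu_val = NULL;
}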
231 /** 237 /**
232 * cpupri_cleanup - clean up the cpupri structure 238 * cpupri_cleanup - clean up the cpupri structure
233 * @cp: The cpupri context 239 * @cp: The cpupri context
234 */ 240 */
235 void cpupri_cleanup(struct cpupri *cp) 241 void cpupri_cleanup(struct cpupri *cp)
236 { 242 {
237 int i; 243 int i;
238 244
245 kfree(cp->cpu_to_pri);
239 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) 246 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
240 free_cpumask_var(cp->pri_to_cpu[i].mask); 247 free_cpumask_var(cp->pri_to_cpu[i].mask);
241 } 248 }
242 249
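For completeness, here is how a caller would be expected to pair the two entry points. The enclosing structure and function names are assumptions for illustration only; the scheduler's real caller is the root-domain setup code, which is not part of this hunk.

/*
 * Hypothetical caller pairing cpupri_init() and cpupri_cleanup();
 * "struct my_domain" and the function names are invented.
 */
struct my_domain {
	struct cpupri cpupri;
};

static int my_domain_init(struct my_domain *d)
{
	/* cpupri_init() returns -ENOMEM if any allocation fails. */
	if (cpupri_init(&d->cpupri))
		return -ENOMEM;
	return 0;
}

static void my_domain_free(struct my_domain *d)
{
	/* Frees cpu_to_pri and every per-priority cpumask. */
	cpupri_cleanup(&d->cpupri);
}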
kernel/sched/cpupri.h
1 #ifndef _LINUX_CPUPRI_H 1 #ifndef _LINUX_CPUPRI_H
2 #define _LINUX_CPUPRI_H 2 #define _LINUX_CPUPRI_H
3 3
4 #include <linux/sched.h> 4 #include <linux/sched.h>
5 5
6 #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) 6 #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
7 7
8 #define CPUPRI_INVALID -1 8 #define CPUPRI_INVALID -1
9 #define CPUPRI_IDLE 0 9 #define CPUPRI_IDLE 0
10 #define CPUPRI_NORMAL 1 10 #define CPUPRI_NORMAL 1
11 /* values 2-101 are RT priorities 0-99 */ 11 /* values 2-101 are RT priorities 0-99 */
12 12
13 struct cpupri_vec { 13 struct cpupri_vec {
14 atomic_t count; 14 atomic_t count;
15 cpumask_var_t mask; 15 cpumask_var_t mask;
16 }; 16 };
17 17
18 struct cpupri { 18 struct cpupri {
19 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; 19 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
20 int cpu_to_pri[NR_CPUS]; 20 int *cpu_to_pri;
21 }; 21 };
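The encoding spelled out by the constants above (CPUPRI_INVALID, CPUPRI_IDLE, CPUPRI_NORMAL, then the RT range noted in the comment) is produced by convert_prio() earlier in kernel/sched/cpupri.c, which is not part of this diff. The following is a sketch consistent with those definitions, assuming the scheduler's usual MAX_PRIO/MAX_RT_PRIO constants; it is an illustration of the mapping, not a verbatim copy.

/*
 * Sketch of the task-prio -> cpupri mapping implied by the constants
 * above; the real convert_prio() is not shown in this hunk.
 */
static int convert_prio_sketch(int prio)
{
	if (prio == CPUPRI_INVALID)
		return CPUPRI_INVALID;	/* not runnable / unknown */
	if (prio == MAX_PRIO)
		return CPUPRI_IDLE;	/* idle task */
	if (prio >= MAX_RT_PRIO)
		return CPUPRI_NORMAL;	/* any non-RT task */
	return MAX_RT_PRIO - prio + 1;	/* RT prio 99..0 -> cpupri 2..101 */
}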
22 22
23 #ifdef CONFIG_SMP 23 #ifdef CONFIG_SMP
24 int cpupri_find(struct cpupri *cp, 24 int cpupri_find(struct cpupri *cp,
25 struct task_struct *p, struct cpumask *lowest_mask); 25 struct task_struct *p, struct cpumask *lowest_mask);
26 void cpupri_set(struct cpupri *cp, int cpu, int pri); 26 void cpupri_set(struct cpupri *cp, int cpu, int pri);
27 int cpupri_init(struct cpupri *cp); 27 int cpupri_init(struct cpupri *cp);
28 void cpupri_cleanup(struct cpupri *cp); 28 void cpupri_cleanup(struct cpupri *cp);
29 #else 29 #else
30 #define cpupri_set(cp, cpu, pri) do { } while (0) 30 #define cpupri_set(cp, cpu, pri) do { } while (0)
31 #define cpupri_init() do { } while (0) 31 #define cpupri_init() do { } while (0)
32 #endif 32 #endif
33 33
34 #endif /* _LINUX_CPUPRI_H */ 34 #endif /* _LINUX_CPUPRI_H */
35 35