Commit 74f5187ac873042f502227701ed1727e7c5fbfa9

Authored by Peter Zijlstra
Committed by Ingo Molnar
1 parent 09a40af524

sched: Cure load average vs NO_HZ woes

Chase reported that due to us decrementing calc_load_task prematurely
(before the next LOAD_FREQ sample), the load average could be skewed
by as much as the number of CPUs in the machine.

This patch, based on Chase's patch, cures the problem by keeping the
delta of the CPU going into NO_HZ idle separately and folding that in
on the next LOAD_FREQ update.

This restores the balance and we get strict LOAD_FREQ period samples.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Chase Douglas <chase.douglas@canonical.com>
LKML-Reference: <1271934490.1776.343.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
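
To make the description above concrete, here is a minimal userspace sketch of the deferred-folding idea, not the patch itself: the names calc_load_tasks, calc_load_tasks_idle, calc_load_fold_active(), calc_load_account_idle() and calc_load_fold_idle() mirror the kernel's, while struct rq_sketch, calc_global_load_tick() and the use of plain longs instead of atomic_long_t are illustrative simplifications.

#include <stdio.h>

static long calc_load_tasks;		/* global sum of nr_running + nr_uninterruptible */
static long calc_load_tasks_idle;	/* deltas parked by CPUs that went NO_HZ idle */

struct rq_sketch {
	long nr_running;
	long nr_uninterruptible;
	long calc_load_active;		/* this CPU's contribution at the last fold */
};

/* Delta of this CPU's contribution since its last fold. */
static long calc_load_fold_active(struct rq_sketch *rq)
{
	long nr_active = rq->nr_running + rq->nr_uninterruptible;
	long delta = nr_active - rq->calc_load_active;

	rq->calc_load_active = nr_active;
	return delta;
}

/* CPU enters NO_HZ idle: park the delta instead of applying it right away. */
static void calc_load_account_idle(struct rq_sketch *rq)
{
	calc_load_tasks_idle += calc_load_fold_active(rq);
}

/* LOAD_FREQ tick: pick up everything parked by idle CPUs, exactly once. */
static long calc_load_fold_idle(void)
{
	long delta = calc_load_tasks_idle;

	calc_load_tasks_idle = 0;
	return delta;
}

/* LOAD_FREQ-period update performed by a busy CPU. */
static void calc_global_load_tick(struct rq_sketch *rq)
{
	calc_load_tasks += calc_load_fold_active(rq) + calc_load_fold_idle();
}

int main(void)
{
	struct rq_sketch cpu0 = { .nr_running = 1 }, cpu1 = { .nr_running = 1 };

	/* First LOAD_FREQ sample: both CPUs fold their contribution. */
	calc_global_load_tick(&cpu0);
	calc_load_tasks += calc_load_fold_active(&cpu1);

	/* cpu1 goes NO_HZ idle between samples: its -1 delta is parked. */
	cpu1.nr_running = 0;
	calc_load_account_idle(&cpu1);

	/* Next sample folds the parked delta in on the LOAD_FREQ boundary. */
	calc_global_load_tick(&cpu0);
	printf("calc_load_tasks = %ld\n", calc_load_tasks);	/* 1, not skewed low */
	return 0;
}

The decrement for the CPU that went idle is thus applied on the LOAD_FREQ boundary rather than at the moment the tick is stopped, which is what restores the strict LOAD_FREQ period samples mentioned above.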

Showing 2 changed files with 68 additions and 15 deletions

/*
 * kernel/sched.c
 *
 * Kernel scheduler and related syscalls
 *
 * Copyright (C) 1991-2002 Linus Torvalds
 *
 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
 *            make semaphores SMP safe
 * 1998-11-19 Implemented schedule_timeout() and related stuff
 *            by Andrea Arcangeli
 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
 *            hybrid priority-list and round-robin design with
 *            an array-switch method of distributing timeslices
 *            and per-CPU runqueues. Cleanups and useful suggestions
 *            by Davide Libenzi, preemptible kernel bits by Robert Love.
 * 2003-09-03 Interactivity tuning by Con Kolivas.
 * 2004-04-02 Scheduler domains code by Nick Piggin
 * 2007-04-15 Work begun on replacing all interactivity tuning with a
 *            fair scheduling design by Con Kolivas.
 * 2007-05-05 Load balancing (smp-nice) and other improvements
 *            by Peter Williams
 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
 *            Thomas Gleixner, Mike Kravetz
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>

#include <asm/tlb.h>
#include <asm/irq_regs.h>

#include "sched_cpupri.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 */
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)

/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters,
 * it's a [ 0 ... 39 ] range.
 */
#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))

/*
 * Helpers for converting nanosecond timing to jiffy resolution
 */
#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))

#define NICE_0_LOAD SCHED_LOAD_SCALE
#define NICE_0_SHIFT SCHED_LOAD_SHIFT

/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
 * Timeslices get refilled after they expire.
 */
#define DEF_TIMESLICE (100 * HZ / 1000)

/*
 * single value that denotes runtime == period, ie unlimited time.
 */
#define RUNTIME_INF ((u64)~0ULL)

static inline int rt_policy(int policy)
{
	if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
		return 1;
	return 0;
}

static inline int task_has_rt_policy(struct task_struct *p)
{
	return rt_policy(p->policy);
}

/*
 * This is the priority-queue data structure of the RT scheduling class:
 */
struct rt_prio_array {
	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
	struct list_head queue[MAX_RT_PRIO];
};

struct rt_bandwidth {
	/* nests inside the rq lock: */
	raw_spinlock_t rt_runtime_lock;
	ktime_t rt_period;
	u64 rt_runtime;
	struct hrtimer rt_period_timer;
};

static struct rt_bandwidth def_rt_bandwidth;

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
	struct rt_bandwidth *rt_b =
		container_of(timer, struct rt_bandwidth, rt_period_timer);
	ktime_t now;
	int overrun;
	int idle = 0;

	for (;;) {
		now = hrtimer_cb_get_time(timer);
		overrun = hrtimer_forward(timer, now, rt_b->rt_period);

		if (!overrun)
			break;

		idle = do_sched_rt_period_timer(rt_b, overrun);
	}

	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

static
void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
	rt_b->rt_period = ns_to_ktime(period);
	rt_b->rt_runtime = runtime;

	raw_spin_lock_init(&rt_b->rt_runtime_lock);

	hrtimer_init(&rt_b->rt_period_timer,
			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rt_b->rt_period_timer.function = sched_rt_period_timer;
}

static inline int rt_bandwidth_enabled(void)
{
	return sysctl_sched_rt_runtime >= 0;
}

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	ktime_t now;

	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
		return;

	if (hrtimer_active(&rt_b->rt_period_timer))
		return;

	raw_spin_lock(&rt_b->rt_runtime_lock);
	for (;;) {
		unsigned long delta;
		ktime_t soft, hard;

		if (hrtimer_active(&rt_b->rt_period_timer))
			break;

		now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
		hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);

		soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
		hard = hrtimer_get_expires(&rt_b->rt_period_timer);
		delta = ktime_to_ns(ktime_sub(hard, soft));
		__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
				HRTIMER_MODE_ABS_PINNED, 0);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}

#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	hrtimer_cancel(&rt_b->rt_period_timer);
}
#endif

/*
 * sched_domains_mutex serializes calls to arch_init_sched_domains,
 * detach_destroy_domains and partition_sched_domains.
 */
static DEFINE_MUTEX(sched_domains_mutex);

#ifdef CONFIG_CGROUP_SCHED

#include <linux/cgroup.h>

struct cfs_rq;

static LIST_HEAD(task_groups);

/* task group related information */
struct task_group {
	struct cgroup_subsys_state css;

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* schedulable entities of this group on each cpu */
	struct sched_entity **se;
	/* runqueue "owned" by this group on each cpu */
	struct cfs_rq **cfs_rq;
	unsigned long shares;
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	struct sched_rt_entity **rt_se;
	struct rt_rq **rt_rq;

	struct rt_bandwidth rt_bandwidth;
#endif

	struct rcu_head rcu;
	struct list_head list;

	struct task_group *parent;
	struct list_head siblings;
	struct list_head children;
};

#define root_task_group init_task_group

/* task_group_lock serializes add/remove of task groups and also changes to
 * a task group's cpu shares.
 */
static DEFINE_SPINLOCK(task_group_lock);

#ifdef CONFIG_FAIR_GROUP_SCHED

#ifdef CONFIG_SMP
static int root_task_group_empty(void)
{
	return list_empty(&root_task_group.children);
}
#endif

# define INIT_TASK_GROUP_LOAD NICE_0_LOAD

/*
 * A weight of 0 or 1 can cause arithmetics problems.
 * A weight of a cfs_rq is the sum of weights of which entities
 * are queued on this cfs_rq, so a weight of a entity should not be
 * too large, so as the shares value of a task group.
 * (The default weight is 1024 - so there's no practical
 * limitation from this.)
 */
#define MIN_SHARES 2
#define MAX_SHARES (1UL << 18)

static int init_task_group_load = INIT_TASK_GROUP_LOAD;
#endif

/* Default task group.
 * Every task in system belong to this group at bootup.
 */
struct task_group init_task_group;

/* return group to which a task belongs */
static inline struct task_group *task_group(struct task_struct *p)
{
	struct task_group *tg;

#ifdef CONFIG_CGROUP_SCHED
	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
			struct task_group, css);
#else
	tg = &init_task_group;
#endif
	return tg;
}

/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
	p->se.parent = task_group(p)->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	p->rt.rt_rq = task_group(p)->rt_rq[cpu];
	p->rt.parent = task_group(p)->rt_se[cpu];
#endif
}

#else

static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
static inline struct task_group *task_group(struct task_struct *p)
{
	return NULL;
}

#endif /* CONFIG_CGROUP_SCHED */

/* CFS-related fields in a runqueue */
struct cfs_rq {
	struct load_weight load;
	unsigned long nr_running;

	u64 exec_clock;
	u64 min_vruntime;

	struct rb_root tasks_timeline;
	struct rb_node *rb_leftmost;

	struct list_head tasks;
	struct list_head *balance_iterator;

	/*
	 * 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
	struct sched_entity *curr, *next, *last;

	unsigned int nr_spread_over;

#ifdef CONFIG_FAIR_GROUP_SCHED
	struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */

	/*
	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
	 * (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
	 * list is used during load balance.
	 */
	struct list_head leaf_cfs_rq_list;
	struct task_group *tg; /* group that "owns" this runqueue */

#ifdef CONFIG_SMP
	/*
	 * the part of load.weight contributed by tasks
	 */
	unsigned long task_weight;

	/*
	 * h_load = weight * f(tg)
	 *
	 * Where f(tg) is the recursive weight fraction assigned to
	 * this group.
	 */
	unsigned long h_load;

	/*
	 * this cpu's part of tg->shares
	 */
	unsigned long shares;

	/*
	 * load.weight at the time we set shares
	 */
	unsigned long rq_weight;
#endif
#endif
};

/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
	struct rt_prio_array active;
	unsigned long rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
	struct {
		int curr; /* highest queued rt task prio */
#ifdef CONFIG_SMP
		int next; /* next highest */
#endif
	} highest_prio;
#endif
#ifdef CONFIG_SMP
	unsigned long rt_nr_migratory;
	unsigned long rt_nr_total;
	int overloaded;
	struct plist_head pushable_tasks;
#endif
	int rt_throttled;
	u64 rt_time;
	u64 rt_runtime;
	/* Nests inside the rq lock: */
	raw_spinlock_t rt_runtime_lock;

#ifdef CONFIG_RT_GROUP_SCHED
	unsigned long rt_nr_boosted;

	struct rq *rq;
	struct list_head leaf_rt_rq_list;
	struct task_group *tg;
#endif
};

#ifdef CONFIG_SMP

/*
 * We add the notion of a root-domain which will be used to define per-domain
 * variables. Each exclusive cpuset essentially defines an island domain by
 * fully partitioning the member cpus from any other cpuset. Whenever a new
 * exclusive cpuset is created, we also create and attach a new root-domain
 * object.
 *
 */
struct root_domain {
	atomic_t refcount;
	cpumask_var_t span;
	cpumask_var_t online;

	/*
	 * The "RT overload" flag: it gets set if a CPU has more than
	 * one runnable RT task.
	 */
	cpumask_var_t rto_mask;
	atomic_t rto_count;
#ifdef CONFIG_SMP
	struct cpupri cpupri;
#endif
};

/*
 * By default the system creates a single root-domain with all cpus as
 * members (mimicking the global state we have today).
 */
static struct root_domain def_root_domain;

#endif

/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */
struct rq {
	/* runqueue lock: */
	raw_spinlock_t lock;

	/*
	 * nr_running and cpu_load should be in the same cacheline because
	 * remote CPUs use both these fields when doing load calculation.
	 */
	unsigned long nr_running;
#define CPU_LOAD_IDX_MAX 5
	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
#ifdef CONFIG_NO_HZ
	u64 nohz_stamp;
	unsigned char in_nohz_recently;
#endif
	unsigned int skip_clock_update;

	/* capture load from *all* tasks on this cpu: */
	struct load_weight load;
	unsigned long nr_load_updates;
	u64 nr_switches;

	struct cfs_rq cfs;
	struct rt_rq rt;

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* list of leaf cfs_rq on this cpu: */
	struct list_head leaf_cfs_rq_list;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	struct list_head leaf_rt_rq_list;
#endif

	/*
	 * This is part of a global counter where only the total sum
	 * over all CPUs matters. A task can increase this counter on
	 * one CPU and if it got migrated afterwards it may decrease
	 * it on another CPU. Always updated under the runqueue lock:
	 */
	unsigned long nr_uninterruptible;

	struct task_struct *curr, *idle;
	unsigned long next_balance;
	struct mm_struct *prev_mm;

	u64 clock;

	atomic_t nr_iowait;

#ifdef CONFIG_SMP
	struct root_domain *rd;
	struct sched_domain *sd;

	unsigned char idle_at_tick;
	/* For active balancing */
	int post_schedule;
	int active_balance;
	int push_cpu;
	/* cpu of this runqueue: */
	int cpu;
	int online;

	unsigned long avg_load_per_task;

	struct task_struct *migration_thread;
	struct list_head migration_queue;

	u64 rt_avg;
	u64 age_stamp;
	u64 idle_stamp;
	u64 avg_idle;
#endif

	/* calc_load related fields */
	unsigned long calc_load_update;
	long calc_load_active;

#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
	int hrtick_csd_pending;
	struct call_single_data hrtick_csd;
#endif
	struct hrtimer hrtick_timer;
#endif

#ifdef CONFIG_SCHEDSTATS
	/* latency stats */
	struct sched_info rq_sched_info;
	unsigned long long rq_cpu_time;
	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */

	/* sys_sched_yield() stats */
	unsigned int yld_count;

	/* schedule() stats */
	unsigned int sched_switch;
	unsigned int sched_count;
	unsigned int sched_goidle;

	/* try_to_wake_up() stats */
	unsigned int ttwu_count;
	unsigned int ttwu_local;

	/* BKL stats */
	unsigned int bkl_count;
#endif
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static inline
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	rq->curr->sched_class->check_preempt_curr(rq, p, flags);

	/*
	 * A queue event has occurred, and we're going to schedule. In
	 * this case, we can save a useless back to back clock update.
	 */
	if (test_tsk_need_resched(p))
		rq->skip_clock_update = 1;
}

static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
	return rq->cpu;
#else
	return 0;
#endif
}

#define rcu_dereference_check_sched_domain(p) \
	rcu_dereference_check((p), \
			rcu_read_lock_sched_held() || \
			lockdep_is_held(&sched_domains_mutex))

/*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * See detach_destroy_domains: synchronize_sched for details.
 *
 * The domain tree of any CPU may only be accessed from within
 * preempt-disabled sections.
 */
#define for_each_domain(cpu, __sd) \
	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() (&__get_cpu_var(runqueues))
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() (&__raw_get_cpu_var(runqueues))

inline void update_rq_clock(struct rq *rq)
{
	if (!rq->skip_clock_update)
		rq->clock = sched_clock_cpu(cpu_of(rq));
}

/*
 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
 */
#ifdef CONFIG_SCHED_DEBUG
# define const_debug __read_mostly
#else
# define const_debug static const
#endif

/**
 * runqueue_is_locked
 * @cpu: the processor in question.
 *
 * Returns true if the current cpu runqueue is locked.
 * This interface allows printk to be called with the runqueue lock
 * held and know whether or not it is OK to wake up the klogd.
 */
int runqueue_is_locked(int cpu)
{
	return raw_spin_is_locked(&cpu_rq(cpu)->lock);
}

/*
 * Debugging: various feature bits
 */

#define SCHED_FEAT(name, enabled) \
	__SCHED_FEAT_##name ,

enum {
#include "sched_features.h"
};

#undef SCHED_FEAT

#define SCHED_FEAT(name, enabled) \
	(1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "sched_features.h"
	0;

#undef SCHED_FEAT

#ifdef CONFIG_SCHED_DEBUG
#define SCHED_FEAT(name, enabled) \
	#name ,

static __read_mostly char *sched_feat_names[] = {
#include "sched_features.h"
	NULL
};

#undef SCHED_FEAT

static int sched_feat_show(struct seq_file *m, void *v)
{
	int i;

	for (i = 0; sched_feat_names[i]; i++) {
		if (!(sysctl_sched_features & (1UL << i)))
			seq_puts(m, "NO_");
		seq_printf(m, "%s ", sched_feat_names[i]);
	}
	seq_puts(m, "\n");

	return 0;
}

static ssize_t
sched_feat_write(struct file *filp, const char __user *ubuf,
		size_t cnt, loff_t *ppos)
{
	char buf[64];
	char *cmp = buf;
	int neg = 0;
	int i;

	if (cnt > 63)
		cnt = 63;

	if (copy_from_user(&buf, ubuf, cnt))
		return -EFAULT;

	buf[cnt] = 0;

	if (strncmp(buf, "NO_", 3) == 0) {
		neg = 1;
		cmp += 3;
	}

	for (i = 0; sched_feat_names[i]; i++) {
		int len = strlen(sched_feat_names[i]);

		if (strncmp(cmp, sched_feat_names[i], len) == 0) {
			if (neg)
				sysctl_sched_features &= ~(1UL << i);
			else
				sysctl_sched_features |= (1UL << i);
			break;
		}
	}

	if (!sched_feat_names[i])
		return -EINVAL;

	*ppos += cnt;

	return cnt;
}

static int sched_feat_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, sched_feat_show, NULL);
}

static const struct file_operations sched_feat_fops = {
	.open = sched_feat_open,
	.write = sched_feat_write,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static __init int sched_init_debug(void)
{
	debugfs_create_file("sched_features", 0644, NULL, NULL,
			&sched_feat_fops);

	return 0;
}
late_initcall(sched_init_debug);

#endif

#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * ratelimit for updating the group shares.
 * default: 0.25ms
 */
unsigned int sysctl_sched_shares_ratelimit = 250000;
unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;

/*
 * Inject some fuzzyness into changing the per-cpu group shares
 * this avoids remote rq-locks at the expense of fairness.
 * default: 4
 */
unsigned int sysctl_sched_shares_thresh = 4;

/*
 * period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

/*
 * period over which we measure -rt task cpu usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

static __read_mostly int scheduler_running;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

static inline u64 global_rt_period(void)
{
	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
}

static inline u64 global_rt_runtime(void)
{
	if (sysctl_sched_rt_runtime < 0)
		return RUNTIME_INF;

	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}

#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
#ifndef finish_arch_switch
# define finish_arch_switch(prev) do { } while (0)
#endif

static inline int task_current(struct rq *rq, struct task_struct *p)
{
	return rq->curr == p;
}

#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline int task_running(struct rq *rq, struct task_struct *p)
{
	return task_current(rq, p);
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_DEBUG_SPINLOCK
	/* this is a valid case when another task releases the spinlock */
	rq->lock.owner = current;
#endif
	/*
	 * If we are tracking spinlock dependencies then we have to
	 * fix up the runqueue lock - which gets 'carried over' from
	 * prev into current:
	 */
	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);

	raw_spin_unlock_irq(&rq->lock);
}

#else /* __ARCH_WANT_UNLOCKED_CTXSW */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
	return p->oncpu;
#else
	return task_current(rq, p);
#endif
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
	/*
	 * We can optimise this out completely for !SMP, because the
	 * SMP rebalancing from interrupt is the only thing that cares
	 * here.
	 */
	next->oncpu = 1;
#endif
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	raw_spin_unlock_irq(&rq->lock);
#else
	raw_spin_unlock(&rq->lock);
#endif
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/*
	 * After ->oncpu is cleared, the task can be moved to a different CPU.
	 * We must ensure this doesn't happen until the switch is completely
	 * finished.
	 */
	smp_wmb();
	prev->oncpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	local_irq_enable();
#endif
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */

/*
 * Check whether the task is waking, we use this to synchronize ->cpus_allowed
 * against ttwu().
 */
static inline int task_is_waking(struct task_struct *p)
{
	return unlikely(p->state == TASK_WAKING);
}

/*
 * __task_rq_lock - lock the runqueue a given task resides on.
 * Must be called interrupts disabled.
 */
static inline struct rq *__task_rq_lock(struct task_struct *p)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p)))
			return rq;
		raw_spin_unlock(&rq->lock);
	}
}

/*
 * task_rq_lock - lock the runqueue a given task resides on and disable
 * interrupts. Note the ordering: we can safely lookup the task_rq without
 * explicitly disabling preemption.
 */
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		local_irq_save(*flags);
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p)))
			return rq;
		raw_spin_unlock_irqrestore(&rq->lock, *flags);
	}
}

void task_rq_unlock_wait(struct task_struct *p)
{
	struct rq *rq = task_rq(p);

	smp_mb(); /* spin-unlock-wait is not a full memory barrier */
	raw_spin_unlock_wait(&rq->lock);
}

static void __task_rq_unlock(struct rq *rq)
	__releases(rq->lock)
{
	raw_spin_unlock(&rq->lock);
}

static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
	__releases(rq->lock)
{
	raw_spin_unlock_irqrestore(&rq->lock, *flags);
}

/*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */
static struct rq *this_rq_lock(void)
	__acquires(rq->lock)
{
	struct rq *rq;

	local_irq_disable();
	rq = this_rq();
	raw_spin_lock(&rq->lock);

	return rq;
}

#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 *
 * Its all a bit involved since we cannot program an hrt while holding the
 * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
 * reschedule event.
 *
 * When we get rescheduled we reprogram the hrtick_timer outside of the
 * rq->lock.
 */

/*
 * Use hrtick when:
 * - enabled by features
 * - hrtimer is actually high res
 */
static inline int hrtick_enabled(struct rq *rq)
{
	if (!sched_feat(HRTICK))
		return 0;
	if (!cpu_active(cpu_of(rq)))
		return 0;
	return hrtimer_is_hres_active(&rq->hrtick_timer);
}

static void hrtick_clear(struct rq *rq)
{
	if (hrtimer_active(&rq->hrtick_timer))
		hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
	struct rq *rq = container_of(timer, struct rq, hrtick_timer);

	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

	raw_spin_lock(&rq->lock);
	update_rq_clock(rq);
	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
	raw_spin_unlock(&rq->lock);

	return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP
/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
	struct rq *rq = arg;

	raw_spin_lock(&rq->lock);
	hrtimer_restart(&rq->hrtick_timer);
	rq->hrtick_csd_pending = 0;
	raw_spin_unlock(&rq->lock);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
static void hrtick_start(struct rq *rq, u64 delay)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	ktime_t time = ktime_add_ns(timer->base->get_time(), delay);

	hrtimer_set_expires(timer, time);

	if (rq == this_rq()) {
		hrtimer_restart(timer);
	} else if (!rq->hrtick_csd_pending) {
		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
		rq->hrtick_csd_pending = 1;
	}
}

static int
hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int cpu = (int)(long)hcpu;

	switch (action) {
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		hrtick_clear(cpu_rq(cpu));
		return NOTIFY_OK;
	}

	return NOTIFY_DONE;
}

static __init void init_hrtick(void)
{
	hotcpu_notifier(hotplug_hrtick, 0);
}
#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
static void hrtick_start(struct rq *rq, u64 delay)
{
	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
			HRTIMER_MODE_REL_PINNED, 0);
}

static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SMP */

static void init_rq_hrtick(struct rq *rq)
{
#ifdef CONFIG_SMP
	rq->hrtick_csd_pending = 0;

	rq->hrtick_csd.flags = 0;
	rq->hrtick_csd.func = __hrtick_start;
	rq->hrtick_csd.info = rq;
#endif

	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1134 rq->hrtick_timer.function = hrtick; 1134 rq->hrtick_timer.function = hrtick;
1135 } 1135 }
1136 #else /* CONFIG_SCHED_HRTICK */ 1136 #else /* CONFIG_SCHED_HRTICK */
1137 static inline void hrtick_clear(struct rq *rq) 1137 static inline void hrtick_clear(struct rq *rq)
1138 { 1138 {
1139 } 1139 }
1140 1140
1141 static inline void init_rq_hrtick(struct rq *rq) 1141 static inline void init_rq_hrtick(struct rq *rq)
1142 { 1142 {
1143 } 1143 }
1144 1144
1145 static inline void init_hrtick(void) 1145 static inline void init_hrtick(void)
1146 { 1146 {
1147 } 1147 }
1148 #endif /* CONFIG_SCHED_HRTICK */ 1148 #endif /* CONFIG_SCHED_HRTICK */
1149 1149
1150 /* 1150 /*
1151 * resched_task - mark a task 'to be rescheduled now'. 1151 * resched_task - mark a task 'to be rescheduled now'.
1152 * 1152 *
1153 * On UP this means the setting of the need_resched flag, on SMP it 1153 * On UP this means the setting of the need_resched flag, on SMP it
1154 * might also involve a cross-CPU call to trigger the scheduler on 1154 * might also involve a cross-CPU call to trigger the scheduler on
1155 * the target CPU. 1155 * the target CPU.
1156 */ 1156 */
1157 #ifdef CONFIG_SMP 1157 #ifdef CONFIG_SMP
1158 1158
1159 #ifndef tsk_is_polling 1159 #ifndef tsk_is_polling
1160 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 1160 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1161 #endif 1161 #endif
1162 1162
1163 static void resched_task(struct task_struct *p) 1163 static void resched_task(struct task_struct *p)
1164 { 1164 {
1165 int cpu; 1165 int cpu;
1166 1166
1167 assert_raw_spin_locked(&task_rq(p)->lock); 1167 assert_raw_spin_locked(&task_rq(p)->lock);
1168 1168
1169 if (test_tsk_need_resched(p)) 1169 if (test_tsk_need_resched(p))
1170 return; 1170 return;
1171 1171
1172 set_tsk_need_resched(p); 1172 set_tsk_need_resched(p);
1173 1173
1174 cpu = task_cpu(p); 1174 cpu = task_cpu(p);
1175 if (cpu == smp_processor_id()) 1175 if (cpu == smp_processor_id())
1176 return; 1176 return;
1177 1177
1178 /* NEED_RESCHED must be visible before we test polling */ 1178 /* NEED_RESCHED must be visible before we test polling */
1179 smp_mb(); 1179 smp_mb();
1180 if (!tsk_is_polling(p)) 1180 if (!tsk_is_polling(p))
1181 smp_send_reschedule(cpu); 1181 smp_send_reschedule(cpu);
1182 } 1182 }
1183 1183
1184 static void resched_cpu(int cpu) 1184 static void resched_cpu(int cpu)
1185 { 1185 {
1186 struct rq *rq = cpu_rq(cpu); 1186 struct rq *rq = cpu_rq(cpu);
1187 unsigned long flags; 1187 unsigned long flags;
1188 1188
1189 if (!raw_spin_trylock_irqsave(&rq->lock, flags)) 1189 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
1190 return; 1190 return;
1191 resched_task(cpu_curr(cpu)); 1191 resched_task(cpu_curr(cpu));
1192 raw_spin_unlock_irqrestore(&rq->lock, flags); 1192 raw_spin_unlock_irqrestore(&rq->lock, flags);
1193 } 1193 }
1194 1194
1195 #ifdef CONFIG_NO_HZ 1195 #ifdef CONFIG_NO_HZ
1196 /* 1196 /*
1197 * When add_timer_on() enqueues a timer into the timer wheel of an 1197 * When add_timer_on() enqueues a timer into the timer wheel of an
1198 * idle CPU then this timer might expire before the next timer event 1198 * idle CPU then this timer might expire before the next timer event
1199 * which is scheduled to wake up that CPU. In case of a completely 1199 * which is scheduled to wake up that CPU. In case of a completely
1200 * idle system the next event might even be an infinite time into the 1200 * idle system the next event might even be an infinite time into the
1201 * future. wake_up_idle_cpu() ensures that the CPU is woken up and 1201 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1202 * leaves the inner idle loop so the newly added timer is taken into 1202 * leaves the inner idle loop so the newly added timer is taken into
1203 * account when the CPU goes back to idle and evaluates the timer 1203 * account when the CPU goes back to idle and evaluates the timer
1204 * wheel for the next timer event. 1204 * wheel for the next timer event.
1205 */ 1205 */
1206 void wake_up_idle_cpu(int cpu) 1206 void wake_up_idle_cpu(int cpu)
1207 { 1207 {
1208 struct rq *rq = cpu_rq(cpu); 1208 struct rq *rq = cpu_rq(cpu);
1209 1209
1210 if (cpu == smp_processor_id()) 1210 if (cpu == smp_processor_id())
1211 return; 1211 return;
1212 1212
1213 /* 1213 /*
1214 * This is safe, as this function is called with the timer 1214 * This is safe, as this function is called with the timer
1215 * wheel base lock of (cpu) held. When the CPU is on the way 1215 * wheel base lock of (cpu) held. When the CPU is on the way
1216 * to idle and has not yet set rq->curr to idle then it will 1216 * to idle and has not yet set rq->curr to idle then it will
1217 * be serialized on the timer wheel base lock and take the new 1217 * be serialized on the timer wheel base lock and take the new
1218 * timer into account automatically. 1218 * timer into account automatically.
1219 */ 1219 */
1220 if (rq->curr != rq->idle) 1220 if (rq->curr != rq->idle)
1221 return; 1221 return;
1222 1222
1223 /* 1223 /*
1224 * We can set TIF_RESCHED on the idle task of the other CPU 1224 * We can set TIF_RESCHED on the idle task of the other CPU
1225 * lockless. The worst case is that the other CPU runs the 1225 * lockless. The worst case is that the other CPU runs the
1226 * idle task through an additional NOOP schedule() 1226 * idle task through an additional NOOP schedule()
1227 */ 1227 */
1228 set_tsk_need_resched(rq->idle); 1228 set_tsk_need_resched(rq->idle);
1229 1229
1230 /* NEED_RESCHED must be visible before we test polling */ 1230 /* NEED_RESCHED must be visible before we test polling */
1231 smp_mb(); 1231 smp_mb();
1232 if (!tsk_is_polling(rq->idle)) 1232 if (!tsk_is_polling(rq->idle))
1233 smp_send_reschedule(cpu); 1233 smp_send_reschedule(cpu);
1234 } 1234 }
1235 1235
1236 int nohz_ratelimit(int cpu) 1236 int nohz_ratelimit(int cpu)
1237 { 1237 {
1238 struct rq *rq = cpu_rq(cpu); 1238 struct rq *rq = cpu_rq(cpu);
1239 u64 diff = rq->clock - rq->nohz_stamp; 1239 u64 diff = rq->clock - rq->nohz_stamp;
1240 1240
1241 rq->nohz_stamp = rq->clock; 1241 rq->nohz_stamp = rq->clock;
1242 1242
1243 return diff < (NSEC_PER_SEC / HZ) >> 1; 1243 return diff < (NSEC_PER_SEC / HZ) >> 1;
1244 } 1244 }
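
A short userspace sketch of the arithmetic above, for clarity: nohz_ratelimit() answers "has less than half a tick passed since the last check?", which callers can use to rate-limit how often a CPU tries to enter NO_HZ idle. HZ and the timestamps here are stand-ins for the example, not the kernel's values.

#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL
#define HZ 1000 /* assumed tick rate for the example */

/* Returns nonzero when less than half a tick elapsed since the last call. */
static int nohz_ratelimit_example(unsigned long long *stamp, unsigned long long now)
{
    unsigned long long diff = now - *stamp;

    *stamp = now;
    return diff < (NSEC_PER_SEC / HZ) >> 1;
}

int main(void)
{
    unsigned long long stamp = 0;

    printf("%d\n", nohz_ratelimit_example(&stamp, 1000000));          /* 1ms elapsed: not limited */
    printf("%d\n", nohz_ratelimit_example(&stamp, 1000000 + 400000)); /* 0.4ms later: limited */
    return 0;
}
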
1245 1245
1246 #endif /* CONFIG_NO_HZ */ 1246 #endif /* CONFIG_NO_HZ */
1247 1247
1248 static u64 sched_avg_period(void) 1248 static u64 sched_avg_period(void)
1249 { 1249 {
1250 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; 1250 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1251 } 1251 }
1252 1252
1253 static void sched_avg_update(struct rq *rq) 1253 static void sched_avg_update(struct rq *rq)
1254 { 1254 {
1255 s64 period = sched_avg_period(); 1255 s64 period = sched_avg_period();
1256 1256
1257 while ((s64)(rq->clock - rq->age_stamp) > period) { 1257 while ((s64)(rq->clock - rq->age_stamp) > period) {
1258 rq->age_stamp += period; 1258 rq->age_stamp += period;
1259 rq->rt_avg /= 2; 1259 rq->rt_avg /= 2;
1260 } 1260 }
1261 } 1261 }
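
sched_avg_update() ages rq->rt_avg by halving it once for every elapsed sched_avg_period(), so old RT runtime decays geometrically. A hedged userspace sketch of that decay; the sysctl value below is an example, not necessarily the kernel default.

#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL

static unsigned long long sysctl_sched_time_avg = 1000; /* ms, example value */

static unsigned long long sched_avg_period_example(void)
{
    return sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
}

/* Halve rt_avg once for every full period between age_stamp and clock. */
static void sched_avg_update_example(unsigned long long clock,
                                     unsigned long long *age_stamp,
                                     unsigned long long *rt_avg)
{
    unsigned long long period = sched_avg_period_example();

    while (clock - *age_stamp > period) {
        *age_stamp += period;
        *rt_avg /= 2;
    }
}

int main(void)
{
    unsigned long long age_stamp = 0, rt_avg = 800;

    /* Three full periods elapse: 800 -> 400 -> 200 -> 100. */
    sched_avg_update_example(3 * sched_avg_period_example() + 1, &age_stamp, &rt_avg);
    printf("rt_avg = %llu\n", rt_avg);
    return 0;
}
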
1262 1262
1263 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1263 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1264 { 1264 {
1265 rq->rt_avg += rt_delta; 1265 rq->rt_avg += rt_delta;
1266 sched_avg_update(rq); 1266 sched_avg_update(rq);
1267 } 1267 }
1268 1268
1269 #else /* !CONFIG_SMP */ 1269 #else /* !CONFIG_SMP */
1270 static void resched_task(struct task_struct *p) 1270 static void resched_task(struct task_struct *p)
1271 { 1271 {
1272 assert_raw_spin_locked(&task_rq(p)->lock); 1272 assert_raw_spin_locked(&task_rq(p)->lock);
1273 set_tsk_need_resched(p); 1273 set_tsk_need_resched(p);
1274 } 1274 }
1275 1275
1276 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1276 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1277 { 1277 {
1278 } 1278 }
1279 #endif /* CONFIG_SMP */ 1279 #endif /* CONFIG_SMP */
1280 1280
1281 #if BITS_PER_LONG == 32 1281 #if BITS_PER_LONG == 32
1282 # define WMULT_CONST (~0UL) 1282 # define WMULT_CONST (~0UL)
1283 #else 1283 #else
1284 # define WMULT_CONST (1UL << 32) 1284 # define WMULT_CONST (1UL << 32)
1285 #endif 1285 #endif
1286 1286
1287 #define WMULT_SHIFT 32 1287 #define WMULT_SHIFT 32
1288 1288
1289 /* 1289 /*
1290 * Shift right and round: 1290 * Shift right and round:
1291 */ 1291 */
1292 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 1292 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1293 1293
1294 /* 1294 /*
1295 * delta *= weight / lw 1295 * delta *= weight / lw
1296 */ 1296 */
1297 static unsigned long 1297 static unsigned long
1298 calc_delta_mine(unsigned long delta_exec, unsigned long weight, 1298 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1299 struct load_weight *lw) 1299 struct load_weight *lw)
1300 { 1300 {
1301 u64 tmp; 1301 u64 tmp;
1302 1302
1303 if (!lw->inv_weight) { 1303 if (!lw->inv_weight) {
1304 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) 1304 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1305 lw->inv_weight = 1; 1305 lw->inv_weight = 1;
1306 else 1306 else
1307 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) 1307 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1308 / (lw->weight+1); 1308 / (lw->weight+1);
1309 } 1309 }
1310 1310
1311 tmp = (u64)delta_exec * weight; 1311 tmp = (u64)delta_exec * weight;
1312 /* 1312 /*
1313 * Check whether we'd overflow the 64-bit multiplication: 1313 * Check whether we'd overflow the 64-bit multiplication:
1314 */ 1314 */
1315 if (unlikely(tmp > WMULT_CONST)) 1315 if (unlikely(tmp > WMULT_CONST))
1316 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, 1316 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1317 WMULT_SHIFT/2); 1317 WMULT_SHIFT/2);
1318 else 1318 else
1319 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); 1319 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1320 1320
1321 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 1321 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1322 } 1322 }
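
calc_delta_mine() scales delta_exec by weight/lw->weight using a precomputed approximate inverse of lw->weight and the rounding shift SRR(), so no 64-bit division is needed on the hot path. A minimal standalone sketch of the same fixed-point trick (it omits the overflow guard the kernel applies by splitting the shift when tmp exceeds WMULT_CONST):

#include <stdio.h>

#define WMULT_CONST (1ULL << 32)
#define WMULT_SHIFT 32

/* Shift right and round. */
#define SRR(x, y) (((x) + (1ULL << ((y) - 1))) >> (y))

/* delta * weight / lw_weight, via an approximate 2^32 / lw_weight inverse. */
static unsigned long long calc_delta_example(unsigned long long delta,
                                             unsigned long weight,
                                             unsigned long lw_weight)
{
    unsigned long long inv = 1 + (WMULT_CONST - lw_weight / 2) / (lw_weight + 1);
    unsigned long long tmp = delta * weight;

    return SRR(tmp * inv, WMULT_SHIFT);
}

int main(void)
{
    /* A nice-0 task (weight 1024) on a runqueue of total weight 2048
     * is credited roughly half of a 6ms slice. */
    printf("%llu\n", calc_delta_example(6000000, 1024, 2048));

    /* Exact division for comparison; the cached inverse is slightly low. */
    printf("%llu\n", 6000000ULL * 1024 / 2048);
    return 0;
}
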
1323 1323
1324 static inline void update_load_add(struct load_weight *lw, unsigned long inc) 1324 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1325 { 1325 {
1326 lw->weight += inc; 1326 lw->weight += inc;
1327 lw->inv_weight = 0; 1327 lw->inv_weight = 0;
1328 } 1328 }
1329 1329
1330 static inline void update_load_sub(struct load_weight *lw, unsigned long dec) 1330 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1331 { 1331 {
1332 lw->weight -= dec; 1332 lw->weight -= dec;
1333 lw->inv_weight = 0; 1333 lw->inv_weight = 0;
1334 } 1334 }
1335 1335
1336 /* 1336 /*
1337 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1337 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1338 * of tasks with abnormal "nice" values across CPUs the contribution that 1338 * of tasks with abnormal "nice" values across CPUs the contribution that
1339 * each task makes to its run queue's load is weighted according to its 1339 * each task makes to its run queue's load is weighted according to its
1340 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a 1340 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1341 * scaled version of the new time slice allocation that they receive on time 1341 * scaled version of the new time slice allocation that they receive on time
1342 * slice expiry etc. 1342 * slice expiry etc.
1343 */ 1343 */
1344 1344
1345 #define WEIGHT_IDLEPRIO 3 1345 #define WEIGHT_IDLEPRIO 3
1346 #define WMULT_IDLEPRIO 1431655765 1346 #define WMULT_IDLEPRIO 1431655765
1347 1347
1348 /* 1348 /*
1349 * Nice levels are multiplicative, with a gentle 10% change for every 1349 * Nice levels are multiplicative, with a gentle 10% change for every
1350 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to 1350 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1351 * nice 1, it will get ~10% less CPU time than another CPU-bound task 1351 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1352 * that remained on nice 0. 1352 * that remained on nice 0.
1353 * 1353 *
1354 * The "10% effect" is relative and cumulative: from _any_ nice level, 1354 * The "10% effect" is relative and cumulative: from _any_ nice level,
1355 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level 1355 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1356 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. 1356 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1357 * If a task goes up by ~10% and another task goes down by ~10% then 1357 * If a task goes up by ~10% and another task goes down by ~10% then
1358 * the relative distance between them is ~25%.) 1358 * the relative distance between them is ~25%.)
1359 */ 1359 */
1360 static const int prio_to_weight[40] = { 1360 static const int prio_to_weight[40] = {
1361 /* -20 */ 88761, 71755, 56483, 46273, 36291, 1361 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1362 /* -15 */ 29154, 23254, 18705, 14949, 11916, 1362 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1363 /* -10 */ 9548, 7620, 6100, 4904, 3906, 1363 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1364 /* -5 */ 3121, 2501, 1991, 1586, 1277, 1364 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1365 /* 0 */ 1024, 820, 655, 526, 423, 1365 /* 0 */ 1024, 820, 655, 526, 423,
1366 /* 5 */ 335, 272, 215, 172, 137, 1366 /* 5 */ 335, 272, 215, 172, 137,
1367 /* 10 */ 110, 87, 70, 56, 45, 1367 /* 10 */ 110, 87, 70, 56, 45,
1368 /* 15 */ 36, 29, 23, 18, 15, 1368 /* 15 */ 36, 29, 23, 18, 15,
1369 }; 1369 };
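
The table is essentially 1024 scaled by ~1.25 per nice step in either direction, which is what produces the "~10% CPU per nice level" behaviour described in the comment above. A quick way to see where the numbers come from; the printed values match the table only approximately because the table entries are hand-rounded.

#include <stdio.h>
#include <math.h>

int main(void)
{
    int nice;

    /* weight(nice) ~= 1024 / 1.25^nice, for nice in [-20, 19] */
    for (nice = -20; nice <= 19; nice++)
        printf("nice %3d -> ~%.0f\n", nice, 1024.0 / pow(1.25, nice));
    return 0;
}
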
1370 1370
1371 /* 1371 /*
1372 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. 1372 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1373 * 1373 *
1374 * In cases where the weight does not change often, we can use the 1374 * In cases where the weight does not change often, we can use the
1375 * precalculated inverse to speed up arithmetic by turning divisions 1375 * precalculated inverse to speed up arithmetic by turning divisions
1376 * into multiplications: 1376 * into multiplications:
1377 */ 1377 */
1378 static const u32 prio_to_wmult[40] = { 1378 static const u32 prio_to_wmult[40] = {
1379 /* -20 */ 48388, 59856, 76040, 92818, 118348, 1379 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1380 /* -15 */ 147320, 184698, 229616, 287308, 360437, 1380 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1381 /* -10 */ 449829, 563644, 704093, 875809, 1099582, 1381 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1382 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, 1382 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1383 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, 1383 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1384 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, 1384 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1385 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, 1385 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1386 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1386 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1387 }; 1387 };
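
Each prio_to_wmult[] entry is 2^32 divided by the corresponding prio_to_weight[] entry, which is what lets calc_delta_mine() replace a division by weight with a multiply and a shift. A small verification sketch over a few rows, with the values copied from the arrays above:

#include <stdio.h>

int main(void)
{
    static const unsigned long weight[] = { 88761, 9548, 1024, 110, 15 };
    static const unsigned long wmult[]  = { 48388, 449829, 4194304, 39045157, 286331153 };
    int i;

    for (i = 0; i < 5; i++)
        printf("2^32/%lu = %llu (table: %lu)\n",
               weight[i], (1ULL << 32) / weight[i], wmult[i]);
    return 0;
}
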
1388 1388
1389 /* Time spent by the tasks of the cpu accounting group executing in ... */ 1389 /* Time spent by the tasks of the cpu accounting group executing in ... */
1390 enum cpuacct_stat_index { 1390 enum cpuacct_stat_index {
1391 CPUACCT_STAT_USER, /* ... user mode */ 1391 CPUACCT_STAT_USER, /* ... user mode */
1392 CPUACCT_STAT_SYSTEM, /* ... kernel mode */ 1392 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1393 1393
1394 CPUACCT_STAT_NSTATS, 1394 CPUACCT_STAT_NSTATS,
1395 }; 1395 };
1396 1396
1397 #ifdef CONFIG_CGROUP_CPUACCT 1397 #ifdef CONFIG_CGROUP_CPUACCT
1398 static void cpuacct_charge(struct task_struct *tsk, u64 cputime); 1398 static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1399 static void cpuacct_update_stats(struct task_struct *tsk, 1399 static void cpuacct_update_stats(struct task_struct *tsk,
1400 enum cpuacct_stat_index idx, cputime_t val); 1400 enum cpuacct_stat_index idx, cputime_t val);
1401 #else 1401 #else
1402 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1402 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1403 static inline void cpuacct_update_stats(struct task_struct *tsk, 1403 static inline void cpuacct_update_stats(struct task_struct *tsk,
1404 enum cpuacct_stat_index idx, cputime_t val) {} 1404 enum cpuacct_stat_index idx, cputime_t val) {}
1405 #endif 1405 #endif
1406 1406
1407 static inline void inc_cpu_load(struct rq *rq, unsigned long load) 1407 static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1408 { 1408 {
1409 update_load_add(&rq->load, load); 1409 update_load_add(&rq->load, load);
1410 } 1410 }
1411 1411
1412 static inline void dec_cpu_load(struct rq *rq, unsigned long load) 1412 static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1413 { 1413 {
1414 update_load_sub(&rq->load, load); 1414 update_load_sub(&rq->load, load);
1415 } 1415 }
1416 1416
1417 #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) 1417 #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1418 typedef int (*tg_visitor)(struct task_group *, void *); 1418 typedef int (*tg_visitor)(struct task_group *, void *);
1419 1419
1420 /* 1420 /*
1421 * Iterate the full tree, calling @down when first entering a node and @up when 1421 * Iterate the full tree, calling @down when first entering a node and @up when
1422 * leaving it for the final time. 1422 * leaving it for the final time.
1423 */ 1423 */
1424 static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) 1424 static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1425 { 1425 {
1426 struct task_group *parent, *child; 1426 struct task_group *parent, *child;
1427 int ret; 1427 int ret;
1428 1428
1429 rcu_read_lock(); 1429 rcu_read_lock();
1430 parent = &root_task_group; 1430 parent = &root_task_group;
1431 down: 1431 down:
1432 ret = (*down)(parent, data); 1432 ret = (*down)(parent, data);
1433 if (ret) 1433 if (ret)
1434 goto out_unlock; 1434 goto out_unlock;
1435 list_for_each_entry_rcu(child, &parent->children, siblings) { 1435 list_for_each_entry_rcu(child, &parent->children, siblings) {
1436 parent = child; 1436 parent = child;
1437 goto down; 1437 goto down;
1438 1438
1439 up: 1439 up:
1440 continue; 1440 continue;
1441 } 1441 }
1442 ret = (*up)(parent, data); 1442 ret = (*up)(parent, data);
1443 if (ret) 1443 if (ret)
1444 goto out_unlock; 1444 goto out_unlock;
1445 1445
1446 child = parent; 1446 child = parent;
1447 parent = parent->parent; 1447 parent = parent->parent;
1448 if (parent) 1448 if (parent)
1449 goto up; 1449 goto up;
1450 out_unlock: 1450 out_unlock:
1451 rcu_read_unlock(); 1451 rcu_read_unlock();
1452 1452
1453 return ret; 1453 return ret;
1454 } 1454 }
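
The goto-based loop in walk_tg_tree() is an iterative depth-first traversal: @down fires when a group is first entered, @up when it is left for the final time, and a nonzero return from either aborts the walk. The same visiting order can be written recursively; this is a hedged userspace sketch with made-up node types, not the kernel's task_group machinery.

#include <stdio.h>
#include <stddef.h>

struct node {
    const char *name;
    struct node *child[4];  /* NULL-terminated list of children */
};

typedef int (*visitor)(struct node *, void *);

/* Recursive equivalent of the down/up traversal: down on entry, children
 * in order, up on the final exit; any nonzero return stops the walk. */
static int walk_tree(struct node *n, visitor down, visitor up, void *data)
{
    int i, ret = down(n, data);

    if (ret)
        return ret;
    for (i = 0; n->child[i]; i++) {
        ret = walk_tree(n->child[i], down, up, data);
        if (ret)
            return ret;
    }
    return up(n, data);
}

static int print_down(struct node *n, void *data) { printf("down %s\n", n->name); return 0; }
static int print_up(struct node *n, void *data)   { printf("up   %s\n", n->name); return 0; }

int main(void)
{
    struct node b = { "B", { NULL } }, c = { "C", { NULL } };
    struct node root = { "root", { &b, &c, NULL } };

    walk_tree(&root, print_down, print_up, NULL);
    return 0;
}
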
1455 1455
1456 static int tg_nop(struct task_group *tg, void *data) 1456 static int tg_nop(struct task_group *tg, void *data)
1457 { 1457 {
1458 return 0; 1458 return 0;
1459 } 1459 }
1460 #endif 1460 #endif
1461 1461
1462 #ifdef CONFIG_SMP 1462 #ifdef CONFIG_SMP
1463 /* Used instead of source_load when we know the type == 0 */ 1463 /* Used instead of source_load when we know the type == 0 */
1464 static unsigned long weighted_cpuload(const int cpu) 1464 static unsigned long weighted_cpuload(const int cpu)
1465 { 1465 {
1466 return cpu_rq(cpu)->load.weight; 1466 return cpu_rq(cpu)->load.weight;
1467 } 1467 }
1468 1468
1469 /* 1469 /*
1470 * Return a low guess at the load of a migration-source cpu weighted 1470 * Return a low guess at the load of a migration-source cpu weighted
1471 * according to the scheduling class and "nice" value. 1471 * according to the scheduling class and "nice" value.
1472 * 1472 *
1473 * We want to under-estimate the load of migration sources, to 1473 * We want to under-estimate the load of migration sources, to
1474 * balance conservatively. 1474 * balance conservatively.
1475 */ 1475 */
1476 static unsigned long source_load(int cpu, int type) 1476 static unsigned long source_load(int cpu, int type)
1477 { 1477 {
1478 struct rq *rq = cpu_rq(cpu); 1478 struct rq *rq = cpu_rq(cpu);
1479 unsigned long total = weighted_cpuload(cpu); 1479 unsigned long total = weighted_cpuload(cpu);
1480 1480
1481 if (type == 0 || !sched_feat(LB_BIAS)) 1481 if (type == 0 || !sched_feat(LB_BIAS))
1482 return total; 1482 return total;
1483 1483
1484 return min(rq->cpu_load[type-1], total); 1484 return min(rq->cpu_load[type-1], total);
1485 } 1485 }
1486 1486
1487 /* 1487 /*
1488 * Return a high guess at the load of a migration-target cpu weighted 1488 * Return a high guess at the load of a migration-target cpu weighted
1489 * according to the scheduling class and "nice" value. 1489 * according to the scheduling class and "nice" value.
1490 */ 1490 */
1491 static unsigned long target_load(int cpu, int type) 1491 static unsigned long target_load(int cpu, int type)
1492 { 1492 {
1493 struct rq *rq = cpu_rq(cpu); 1493 struct rq *rq = cpu_rq(cpu);
1494 unsigned long total = weighted_cpuload(cpu); 1494 unsigned long total = weighted_cpuload(cpu);
1495 1495
1496 if (type == 0 || !sched_feat(LB_BIAS)) 1496 if (type == 0 || !sched_feat(LB_BIAS))
1497 return total; 1497 return total;
1498 1498
1499 return max(rq->cpu_load[type-1], total); 1499 return max(rq->cpu_load[type-1], total);
1500 } 1500 }
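
When load biasing is enabled, source_load() and target_load() skew the same weighted_cpuload() figure in opposite directions: the source takes the minimum of the decayed cpu_load[] history and the instantaneous load (a low guess), the target takes the maximum (a high guess), which makes the balancer reluctant to migrate on short spikes. A tiny numeric sketch of the asymmetry, with made-up load values:

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }

int main(void)
{
    unsigned long history = 2048;   /* decayed cpu_load[type-1] */
    unsigned long instant = 1024;   /* current rq->load.weight */

    /* The same CPU looks lighter as a migration source ... */
    printf("source_load ~ %lu\n", min_ul(history, instant));
    /* ... and heavier as a migration target. */
    printf("target_load ~ %lu\n", max_ul(history, instant));
    return 0;
}
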
1501 1501
1502 static struct sched_group *group_of(int cpu) 1502 static struct sched_group *group_of(int cpu)
1503 { 1503 {
1504 struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd); 1504 struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
1505 1505
1506 if (!sd) 1506 if (!sd)
1507 return NULL; 1507 return NULL;
1508 1508
1509 return sd->groups; 1509 return sd->groups;
1510 } 1510 }
1511 1511
1512 static unsigned long power_of(int cpu) 1512 static unsigned long power_of(int cpu)
1513 { 1513 {
1514 struct sched_group *group = group_of(cpu); 1514 struct sched_group *group = group_of(cpu);
1515 1515
1516 if (!group) 1516 if (!group)
1517 return SCHED_LOAD_SCALE; 1517 return SCHED_LOAD_SCALE;
1518 1518
1519 return group->cpu_power; 1519 return group->cpu_power;
1520 } 1520 }
1521 1521
1522 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1522 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1523 1523
1524 static unsigned long cpu_avg_load_per_task(int cpu) 1524 static unsigned long cpu_avg_load_per_task(int cpu)
1525 { 1525 {
1526 struct rq *rq = cpu_rq(cpu); 1526 struct rq *rq = cpu_rq(cpu);
1527 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 1527 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1528 1528
1529 if (nr_running) 1529 if (nr_running)
1530 rq->avg_load_per_task = rq->load.weight / nr_running; 1530 rq->avg_load_per_task = rq->load.weight / nr_running;
1531 else 1531 else
1532 rq->avg_load_per_task = 0; 1532 rq->avg_load_per_task = 0;
1533 1533
1534 return rq->avg_load_per_task; 1534 return rq->avg_load_per_task;
1535 } 1535 }
1536 1536
1537 #ifdef CONFIG_FAIR_GROUP_SCHED 1537 #ifdef CONFIG_FAIR_GROUP_SCHED
1538 1538
1539 static __read_mostly unsigned long __percpu *update_shares_data; 1539 static __read_mostly unsigned long __percpu *update_shares_data;
1540 1540
1541 static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1541 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1542 1542
1543 /* 1543 /*
1544 * Calculate and set the cpu's group shares. 1544 * Calculate and set the cpu's group shares.
1545 */ 1545 */
1546 static void update_group_shares_cpu(struct task_group *tg, int cpu, 1546 static void update_group_shares_cpu(struct task_group *tg, int cpu,
1547 unsigned long sd_shares, 1547 unsigned long sd_shares,
1548 unsigned long sd_rq_weight, 1548 unsigned long sd_rq_weight,
1549 unsigned long *usd_rq_weight) 1549 unsigned long *usd_rq_weight)
1550 { 1550 {
1551 unsigned long shares, rq_weight; 1551 unsigned long shares, rq_weight;
1552 int boost = 0; 1552 int boost = 0;
1553 1553
1554 rq_weight = usd_rq_weight[cpu]; 1554 rq_weight = usd_rq_weight[cpu];
1555 if (!rq_weight) { 1555 if (!rq_weight) {
1556 boost = 1; 1556 boost = 1;
1557 rq_weight = NICE_0_LOAD; 1557 rq_weight = NICE_0_LOAD;
1558 } 1558 }
1559 1559
1560 /* 1560 /*
1561 * \Sum_j shares_j * rq_weight_i 1561 * \Sum_j shares_j * rq_weight_i
1562 * shares_i = ----------------------------- 1562 * shares_i = -----------------------------
1563 * \Sum_j rq_weight_j 1563 * \Sum_j rq_weight_j
1564 */ 1564 */
1565 shares = (sd_shares * rq_weight) / sd_rq_weight; 1565 shares = (sd_shares * rq_weight) / sd_rq_weight;
1566 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); 1566 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1567 1567
1568 if (abs(shares - tg->se[cpu]->load.weight) > 1568 if (abs(shares - tg->se[cpu]->load.weight) >
1569 sysctl_sched_shares_thresh) { 1569 sysctl_sched_shares_thresh) {
1570 struct rq *rq = cpu_rq(cpu); 1570 struct rq *rq = cpu_rq(cpu);
1571 unsigned long flags; 1571 unsigned long flags;
1572 1572
1573 raw_spin_lock_irqsave(&rq->lock, flags); 1573 raw_spin_lock_irqsave(&rq->lock, flags);
1574 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; 1574 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1575 tg->cfs_rq[cpu]->shares = boost ? 0 : shares; 1575 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1576 __set_se_shares(tg->se[cpu], shares); 1576 __set_se_shares(tg->se[cpu], shares);
1577 raw_spin_unlock_irqrestore(&rq->lock, flags); 1577 raw_spin_unlock_irqrestore(&rq->lock, flags);
1578 } 1578 }
1579 } 1579 }
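
The ASCII formula above distributes a group's shares across CPUs in proportion to the group's runqueue weight on each CPU, then clamps the result. A worked userspace sketch for a two-CPU group; the clamp bounds here are placeholders for the example, not necessarily the kernel's MIN_SHARES/MAX_SHARES.

#include <stdio.h>

#define MIN_SHARES_EX 2
#define MAX_SHARES_EX (1UL << 18)

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
    return v < lo ? lo : (v > hi ? hi : v);
}

/* shares_i = sd_shares * rq_weight_i / sd_rq_weight */
static unsigned long group_share(unsigned long sd_shares,
                                 unsigned long rq_weight_i,
                                 unsigned long sd_rq_weight)
{
    return clamp_ul(sd_shares * rq_weight_i / sd_rq_weight,
                    MIN_SHARES_EX, MAX_SHARES_EX);
}

int main(void)
{
    /* Group with 1024 shares, 3072 total weight: 2048 on CPU0, 1024 on CPU1. */
    printf("cpu0: %lu\n", group_share(1024, 2048, 3072));  /* ~682 */
    printf("cpu1: %lu\n", group_share(1024, 1024, 3072));  /* ~341 */
    return 0;
}
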
1580 1580
1581 /* 1581 /*
1582 * Re-compute each task group's per-cpu shares over the given domain. 1582 * Re-compute each task group's per-cpu shares over the given domain.
1583 * This needs to be done in a bottom-up fashion because the rq weight of a 1583 * This needs to be done in a bottom-up fashion because the rq weight of a
1584 * parent group depends on the shares of its child groups. 1584 * parent group depends on the shares of its child groups.
1585 */ 1585 */
1586 static int tg_shares_up(struct task_group *tg, void *data) 1586 static int tg_shares_up(struct task_group *tg, void *data)
1587 { 1587 {
1588 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; 1588 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1589 unsigned long *usd_rq_weight; 1589 unsigned long *usd_rq_weight;
1590 struct sched_domain *sd = data; 1590 struct sched_domain *sd = data;
1591 unsigned long flags; 1591 unsigned long flags;
1592 int i; 1592 int i;
1593 1593
1594 if (!tg->se[0]) 1594 if (!tg->se[0])
1595 return 0; 1595 return 0;
1596 1596
1597 local_irq_save(flags); 1597 local_irq_save(flags);
1598 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); 1598 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1599 1599
1600 for_each_cpu(i, sched_domain_span(sd)) { 1600 for_each_cpu(i, sched_domain_span(sd)) {
1601 weight = tg->cfs_rq[i]->load.weight; 1601 weight = tg->cfs_rq[i]->load.weight;
1602 usd_rq_weight[i] = weight; 1602 usd_rq_weight[i] = weight;
1603 1603
1604 rq_weight += weight; 1604 rq_weight += weight;
1605 /* 1605 /*
1606 * If there are currently no tasks on the cpu pretend there 1606 * If there are currently no tasks on the cpu pretend there
1607 * is one of average load so that when a new task gets to 1607 * is one of average load so that when a new task gets to
1608 * run here it will not get delayed by group starvation. 1608 * run here it will not get delayed by group starvation.
1609 */ 1609 */
1610 if (!weight) 1610 if (!weight)
1611 weight = NICE_0_LOAD; 1611 weight = NICE_0_LOAD;
1612 1612
1613 sum_weight += weight; 1613 sum_weight += weight;
1614 shares += tg->cfs_rq[i]->shares; 1614 shares += tg->cfs_rq[i]->shares;
1615 } 1615 }
1616 1616
1617 if (!rq_weight) 1617 if (!rq_weight)
1618 rq_weight = sum_weight; 1618 rq_weight = sum_weight;
1619 1619
1620 if ((!shares && rq_weight) || shares > tg->shares) 1620 if ((!shares && rq_weight) || shares > tg->shares)
1621 shares = tg->shares; 1621 shares = tg->shares;
1622 1622
1623 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) 1623 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1624 shares = tg->shares; 1624 shares = tg->shares;
1625 1625
1626 for_each_cpu(i, sched_domain_span(sd)) 1626 for_each_cpu(i, sched_domain_span(sd))
1627 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); 1627 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1628 1628
1629 local_irq_restore(flags); 1629 local_irq_restore(flags);
1630 1630
1631 return 0; 1631 return 0;
1632 } 1632 }
1633 1633
1634 /* 1634 /*
1635 * Compute the cpu's hierarchical load factor for each task group. 1635 * Compute the cpu's hierarchical load factor for each task group.
1636 * This needs to be done in a top-down fashion because the load of a child 1636 * This needs to be done in a top-down fashion because the load of a child
1637 * group is a fraction of its parent's load. 1637 * group is a fraction of its parent's load.
1638 */ 1638 */
1639 static int tg_load_down(struct task_group *tg, void *data) 1639 static int tg_load_down(struct task_group *tg, void *data)
1640 { 1640 {
1641 unsigned long load; 1641 unsigned long load;
1642 long cpu = (long)data; 1642 long cpu = (long)data;
1643 1643
1644 if (!tg->parent) { 1644 if (!tg->parent) {
1645 load = cpu_rq(cpu)->load.weight; 1645 load = cpu_rq(cpu)->load.weight;
1646 } else { 1646 } else {
1647 load = tg->parent->cfs_rq[cpu]->h_load; 1647 load = tg->parent->cfs_rq[cpu]->h_load;
1648 load *= tg->cfs_rq[cpu]->shares; 1648 load *= tg->cfs_rq[cpu]->shares;
1649 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1649 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1650 } 1650 }
1651 1651
1652 tg->cfs_rq[cpu]->h_load = load; 1652 tg->cfs_rq[cpu]->h_load = load;
1653 1653
1654 return 0; 1654 return 0;
1655 } 1655 }
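
tg_load_down() propagates load top-down: a child group's h_load is the parent's h_load scaled by the fraction of the parent cfs_rq weight that the child's entity contributes (the +1 guards against a zero weight). A worked two-level sketch with made-up numbers:

#include <stdio.h>

/* h_load(child) = h_load(parent) * child_shares / parent_cfs_weight */
static unsigned long h_load(unsigned long parent_h_load,
                            unsigned long child_shares,
                            unsigned long parent_cfs_weight)
{
    unsigned long load = parent_h_load * child_shares;

    return load / (parent_cfs_weight + 1);
}

int main(void)
{
    /* Root sees 3072 of runqueue weight; a child holds 1024 shares on a
     * parent cfs_rq of total weight 3072, a grandchild 512 out of 1024. */
    unsigned long root = 3072;
    unsigned long child = h_load(root, 1024, 3072);
    unsigned long grandchild = h_load(child, 512, 1024);

    printf("child %lu, grandchild %lu\n", child, grandchild);
    return 0;
}
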
1656 1656
1657 static void update_shares(struct sched_domain *sd) 1657 static void update_shares(struct sched_domain *sd)
1658 { 1658 {
1659 s64 elapsed; 1659 s64 elapsed;
1660 u64 now; 1660 u64 now;
1661 1661
1662 if (root_task_group_empty()) 1662 if (root_task_group_empty())
1663 return; 1663 return;
1664 1664
1665 now = cpu_clock(raw_smp_processor_id()); 1665 now = cpu_clock(raw_smp_processor_id());
1666 elapsed = now - sd->last_update; 1666 elapsed = now - sd->last_update;
1667 1667
1668 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1668 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1669 sd->last_update = now; 1669 sd->last_update = now;
1670 walk_tg_tree(tg_nop, tg_shares_up, sd); 1670 walk_tg_tree(tg_nop, tg_shares_up, sd);
1671 } 1671 }
1672 } 1672 }
1673 1673
1674 static void update_h_load(long cpu) 1674 static void update_h_load(long cpu)
1675 { 1675 {
1676 if (root_task_group_empty()) 1676 if (root_task_group_empty())
1677 return; 1677 return;
1678 1678
1679 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1679 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1680 } 1680 }
1681 1681
1682 #else 1682 #else
1683 1683
1684 static inline void update_shares(struct sched_domain *sd) 1684 static inline void update_shares(struct sched_domain *sd)
1685 { 1685 {
1686 } 1686 }
1687 1687
1688 #endif 1688 #endif
1689 1689
1690 #ifdef CONFIG_PREEMPT 1690 #ifdef CONFIG_PREEMPT
1691 1691
1692 static void double_rq_lock(struct rq *rq1, struct rq *rq2); 1692 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1693 1693
1694 /* 1694 /*
1695 * fair double_lock_balance: Safely acquires both rq->locks in a fair 1695 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1696 * way at the expense of forcing extra atomic operations in all 1696 * way at the expense of forcing extra atomic operations in all
1697 * invocations. This assures that the double_lock is acquired using the 1697 * invocations. This assures that the double_lock is acquired using the
1698 * same underlying policy as the spinlock_t on this architecture, which 1698 * same underlying policy as the spinlock_t on this architecture, which
1699 * reduces latency compared to the unfair variant below. However, it 1699 * reduces latency compared to the unfair variant below. However, it
1700 * also adds more overhead and therefore may reduce throughput. 1700 * also adds more overhead and therefore may reduce throughput.
1701 */ 1701 */
1702 static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1702 static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1703 __releases(this_rq->lock) 1703 __releases(this_rq->lock)
1704 __acquires(busiest->lock) 1704 __acquires(busiest->lock)
1705 __acquires(this_rq->lock) 1705 __acquires(this_rq->lock)
1706 { 1706 {
1707 raw_spin_unlock(&this_rq->lock); 1707 raw_spin_unlock(&this_rq->lock);
1708 double_rq_lock(this_rq, busiest); 1708 double_rq_lock(this_rq, busiest);
1709 1709
1710 return 1; 1710 return 1;
1711 } 1711 }
1712 1712
1713 #else 1713 #else
1714 /* 1714 /*
1715 * Unfair double_lock_balance: Optimizes throughput at the expense of 1715 * Unfair double_lock_balance: Optimizes throughput at the expense of
1716 * latency by eliminating extra atomic operations when the locks are 1716 * latency by eliminating extra atomic operations when the locks are
1717 * already in proper order on entry. This favors lower cpu-ids and will 1717 * already in proper order on entry. This favors lower cpu-ids and will
1718 * grant the double lock to lower cpus over higher ids under contention, 1718 * grant the double lock to lower cpus over higher ids under contention,
1719 * regardless of entry order into the function. 1719 * regardless of entry order into the function.
1720 */ 1720 */
1721 static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1721 static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1722 __releases(this_rq->lock) 1722 __releases(this_rq->lock)
1723 __acquires(busiest->lock) 1723 __acquires(busiest->lock)
1724 __acquires(this_rq->lock) 1724 __acquires(this_rq->lock)
1725 { 1725 {
1726 int ret = 0; 1726 int ret = 0;
1727 1727
1728 if (unlikely(!raw_spin_trylock(&busiest->lock))) { 1728 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1729 if (busiest < this_rq) { 1729 if (busiest < this_rq) {
1730 raw_spin_unlock(&this_rq->lock); 1730 raw_spin_unlock(&this_rq->lock);
1731 raw_spin_lock(&busiest->lock); 1731 raw_spin_lock(&busiest->lock);
1732 raw_spin_lock_nested(&this_rq->lock, 1732 raw_spin_lock_nested(&this_rq->lock,
1733 SINGLE_DEPTH_NESTING); 1733 SINGLE_DEPTH_NESTING);
1734 ret = 1; 1734 ret = 1;
1735 } else 1735 } else
1736 raw_spin_lock_nested(&busiest->lock, 1736 raw_spin_lock_nested(&busiest->lock,
1737 SINGLE_DEPTH_NESTING); 1737 SINGLE_DEPTH_NESTING);
1738 } 1738 }
1739 return ret; 1739 return ret;
1740 } 1740 }
1741 1741
1742 #endif /* CONFIG_PREEMPT */ 1742 #endif /* CONFIG_PREEMPT */
1743 1743
1744 /* 1744 /*
1745 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1745 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1746 */ 1746 */
1747 static int double_lock_balance(struct rq *this_rq, struct rq *busiest) 1747 static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1748 { 1748 {
1749 if (unlikely(!irqs_disabled())) { 1749 if (unlikely(!irqs_disabled())) {
1750 /* printk() doesn't work well under rq->lock */ 1750 /* printk() doesn't work well under rq->lock */
1751 raw_spin_unlock(&this_rq->lock); 1751 raw_spin_unlock(&this_rq->lock);
1752 BUG_ON(1); 1752 BUG_ON(1);
1753 } 1753 }
1754 1754
1755 return _double_lock_balance(this_rq, busiest); 1755 return _double_lock_balance(this_rq, busiest);
1756 } 1756 }
1757 1757
1758 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1758 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1759 __releases(busiest->lock) 1759 __releases(busiest->lock)
1760 { 1760 {
1761 raw_spin_unlock(&busiest->lock); 1761 raw_spin_unlock(&busiest->lock);
1762 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1762 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1763 } 1763 }
1764 1764
1765 /* 1765 /*
1766 * double_rq_lock - safely lock two runqueues 1766 * double_rq_lock - safely lock two runqueues
1767 * 1767 *
1768 * Note this does not disable interrupts like task_rq_lock, 1768 * Note this does not disable interrupts like task_rq_lock,
1769 * you need to do so manually before calling. 1769 * you need to do so manually before calling.
1770 */ 1770 */
1771 static void double_rq_lock(struct rq *rq1, struct rq *rq2) 1771 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1772 __acquires(rq1->lock) 1772 __acquires(rq1->lock)
1773 __acquires(rq2->lock) 1773 __acquires(rq2->lock)
1774 { 1774 {
1775 BUG_ON(!irqs_disabled()); 1775 BUG_ON(!irqs_disabled());
1776 if (rq1 == rq2) { 1776 if (rq1 == rq2) {
1777 raw_spin_lock(&rq1->lock); 1777 raw_spin_lock(&rq1->lock);
1778 __acquire(rq2->lock); /* Fake it out ;) */ 1778 __acquire(rq2->lock); /* Fake it out ;) */
1779 } else { 1779 } else {
1780 if (rq1 < rq2) { 1780 if (rq1 < rq2) {
1781 raw_spin_lock(&rq1->lock); 1781 raw_spin_lock(&rq1->lock);
1782 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); 1782 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1783 } else { 1783 } else {
1784 raw_spin_lock(&rq2->lock); 1784 raw_spin_lock(&rq2->lock);
1785 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 1785 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1786 } 1786 }
1787 } 1787 }
1788 } 1788 }
1789 1789
1790 /* 1790 /*
1791 * double_rq_unlock - safely unlock two runqueues 1791 * double_rq_unlock - safely unlock two runqueues
1792 * 1792 *
1793 * Note this does not restore interrupts like task_rq_unlock, 1793 * Note this does not restore interrupts like task_rq_unlock,
1794 * you need to do so manually after calling. 1794 * you need to do so manually after calling.
1795 */ 1795 */
1796 static void double_rq_unlock(struct rq *rq1, struct rq *rq2) 1796 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1797 __releases(rq1->lock) 1797 __releases(rq1->lock)
1798 __releases(rq2->lock) 1798 __releases(rq2->lock)
1799 { 1799 {
1800 raw_spin_unlock(&rq1->lock); 1800 raw_spin_unlock(&rq1->lock);
1801 if (rq1 != rq2) 1801 if (rq1 != rq2)
1802 raw_spin_unlock(&rq2->lock); 1802 raw_spin_unlock(&rq2->lock);
1803 else 1803 else
1804 __release(rq2->lock); 1804 __release(rq2->lock);
1805 } 1805 }
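
Both the unfair _double_lock_balance() variant and double_rq_lock() rely on the same deadlock-avoidance rule: when two runqueue locks must be held, the one at the lower address is taken first, so two CPUs locking the same pair in opposite order cannot wait on each other. The idea is not scheduler-specific; this is a hedged pthread analogy in userspace, not kernel code.

#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

/* Acquire two mutexes in address order, mirroring the rq1 < rq2 check above. */
static void double_lock(pthread_mutex_t *m1, pthread_mutex_t *m2)
{
    if (m1 == m2) {
        pthread_mutex_lock(m1);
    } else if (m1 < m2) {
        pthread_mutex_lock(m1);
        pthread_mutex_lock(m2);
    } else {
        pthread_mutex_lock(m2);
        pthread_mutex_lock(m1);
    }
}

static void double_unlock(pthread_mutex_t *m1, pthread_mutex_t *m2)
{
    pthread_mutex_unlock(m1);
    if (m1 != m2)
        pthread_mutex_unlock(m2);
}

int main(void)
{
    /* Either argument order ends up acquiring the locks in the same
     * (address) order, so no lock-order inversion is possible. */
    double_lock(&lock_a, &lock_b);
    double_unlock(&lock_a, &lock_b);

    double_lock(&lock_b, &lock_a);
    double_unlock(&lock_b, &lock_a);

    printf("no deadlock\n");
    return 0;
}
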
1806 1806
1807 #endif 1807 #endif
1808 1808
1809 #ifdef CONFIG_FAIR_GROUP_SCHED 1809 #ifdef CONFIG_FAIR_GROUP_SCHED
1810 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) 1810 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1811 { 1811 {
1812 #ifdef CONFIG_SMP 1812 #ifdef CONFIG_SMP
1813 cfs_rq->shares = shares; 1813 cfs_rq->shares = shares;
1814 #endif 1814 #endif
1815 } 1815 }
1816 #endif 1816 #endif
1817 1817
1818 static void calc_load_account_active(struct rq *this_rq); 1818 static void calc_load_account_idle(struct rq *this_rq);
1819 static void update_sysctl(void); 1819 static void update_sysctl(void);
1820 static int get_update_sysctl_factor(void); 1820 static int get_update_sysctl_factor(void);
1821 1821
1822 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1822 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1823 { 1823 {
1824 set_task_rq(p, cpu); 1824 set_task_rq(p, cpu);
1825 #ifdef CONFIG_SMP 1825 #ifdef CONFIG_SMP
1826 /* 1826 /*
1827 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be 1827 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1828 * successfully executed on another CPU. We must ensure that updates of 1828 * successfully executed on another CPU. We must ensure that updates of
1829 * per-task data have been completed by this moment. 1829 * per-task data have been completed by this moment.
1830 */ 1830 */
1831 smp_wmb(); 1831 smp_wmb();
1832 task_thread_info(p)->cpu = cpu; 1832 task_thread_info(p)->cpu = cpu;
1833 #endif 1833 #endif
1834 } 1834 }
1835 1835
1836 static const struct sched_class rt_sched_class; 1836 static const struct sched_class rt_sched_class;
1837 1837
1838 #define sched_class_highest (&rt_sched_class) 1838 #define sched_class_highest (&rt_sched_class)
1839 #define for_each_class(class) \ 1839 #define for_each_class(class) \
1840 for (class = sched_class_highest; class; class = class->next) 1840 for (class = sched_class_highest; class; class = class->next)
1841 1841
1842 #include "sched_stats.h" 1842 #include "sched_stats.h"
1843 1843
1844 static void inc_nr_running(struct rq *rq) 1844 static void inc_nr_running(struct rq *rq)
1845 { 1845 {
1846 rq->nr_running++; 1846 rq->nr_running++;
1847 } 1847 }
1848 1848
1849 static void dec_nr_running(struct rq *rq) 1849 static void dec_nr_running(struct rq *rq)
1850 { 1850 {
1851 rq->nr_running--; 1851 rq->nr_running--;
1852 } 1852 }
1853 1853
1854 static void set_load_weight(struct task_struct *p) 1854 static void set_load_weight(struct task_struct *p)
1855 { 1855 {
1856 if (task_has_rt_policy(p)) { 1856 if (task_has_rt_policy(p)) {
1857 p->se.load.weight = prio_to_weight[0] * 2; 1857 p->se.load.weight = prio_to_weight[0] * 2;
1858 p->se.load.inv_weight = prio_to_wmult[0] >> 1; 1858 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
1859 return; 1859 return;
1860 } 1860 }
1861 1861
1862 /* 1862 /*
1863 * SCHED_IDLE tasks get minimal weight: 1863 * SCHED_IDLE tasks get minimal weight:
1864 */ 1864 */
1865 if (p->policy == SCHED_IDLE) { 1865 if (p->policy == SCHED_IDLE) {
1866 p->se.load.weight = WEIGHT_IDLEPRIO; 1866 p->se.load.weight = WEIGHT_IDLEPRIO;
1867 p->se.load.inv_weight = WMULT_IDLEPRIO; 1867 p->se.load.inv_weight = WMULT_IDLEPRIO;
1868 return; 1868 return;
1869 } 1869 }
1870 1870
1871 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; 1871 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1872 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1872 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1873 } 1873 }
1874 1874
1875 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 1875 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1876 { 1876 {
1877 update_rq_clock(rq); 1877 update_rq_clock(rq);
1878 sched_info_queued(p); 1878 sched_info_queued(p);
1879 p->sched_class->enqueue_task(rq, p, flags); 1879 p->sched_class->enqueue_task(rq, p, flags);
1880 p->se.on_rq = 1; 1880 p->se.on_rq = 1;
1881 } 1881 }
1882 1882
1883 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 1883 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1884 { 1884 {
1885 update_rq_clock(rq); 1885 update_rq_clock(rq);
1886 sched_info_dequeued(p); 1886 sched_info_dequeued(p);
1887 p->sched_class->dequeue_task(rq, p, flags); 1887 p->sched_class->dequeue_task(rq, p, flags);
1888 p->se.on_rq = 0; 1888 p->se.on_rq = 0;
1889 } 1889 }
1890 1890
1891 /* 1891 /*
1892 * activate_task - move a task to the runqueue. 1892 * activate_task - move a task to the runqueue.
1893 */ 1893 */
1894 static void activate_task(struct rq *rq, struct task_struct *p, int flags) 1894 static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1895 { 1895 {
1896 if (task_contributes_to_load(p)) 1896 if (task_contributes_to_load(p))
1897 rq->nr_uninterruptible--; 1897 rq->nr_uninterruptible--;
1898 1898
1899 enqueue_task(rq, p, flags); 1899 enqueue_task(rq, p, flags);
1900 inc_nr_running(rq); 1900 inc_nr_running(rq);
1901 } 1901 }
1902 1902
1903 /* 1903 /*
1904 * deactivate_task - remove a task from the runqueue. 1904 * deactivate_task - remove a task from the runqueue.
1905 */ 1905 */
1906 static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 1906 static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1907 { 1907 {
1908 if (task_contributes_to_load(p)) 1908 if (task_contributes_to_load(p))
1909 rq->nr_uninterruptible++; 1909 rq->nr_uninterruptible++;
1910 1910
1911 dequeue_task(rq, p, flags); 1911 dequeue_task(rq, p, flags);
1912 dec_nr_running(rq); 1912 dec_nr_running(rq);
1913 } 1913 }
1914 1914
1915 #include "sched_idletask.c" 1915 #include "sched_idletask.c"
1916 #include "sched_fair.c" 1916 #include "sched_fair.c"
1917 #include "sched_rt.c" 1917 #include "sched_rt.c"
1918 #ifdef CONFIG_SCHED_DEBUG 1918 #ifdef CONFIG_SCHED_DEBUG
1919 # include "sched_debug.c" 1919 # include "sched_debug.c"
1920 #endif 1920 #endif
1921 1921
1922 /* 1922 /*
1923 * __normal_prio - return the priority that is based on the static prio 1923 * __normal_prio - return the priority that is based on the static prio
1924 */ 1924 */
1925 static inline int __normal_prio(struct task_struct *p) 1925 static inline int __normal_prio(struct task_struct *p)
1926 { 1926 {
1927 return p->static_prio; 1927 return p->static_prio;
1928 } 1928 }
1929 1929
1930 /* 1930 /*
1931 * Calculate the expected normal priority: i.e. priority 1931 * Calculate the expected normal priority: i.e. priority
1932 * without taking RT-inheritance into account. Might be 1932 * without taking RT-inheritance into account. Might be
1933 * boosted by interactivity modifiers. Changes upon fork, 1933 * boosted by interactivity modifiers. Changes upon fork,
1934 * setprio syscalls, and whenever the interactivity 1934 * setprio syscalls, and whenever the interactivity
1935 * estimator recalculates. 1935 * estimator recalculates.
1936 */ 1936 */
1937 static inline int normal_prio(struct task_struct *p) 1937 static inline int normal_prio(struct task_struct *p)
1938 { 1938 {
1939 int prio; 1939 int prio;
1940 1940
1941 if (task_has_rt_policy(p)) 1941 if (task_has_rt_policy(p))
1942 prio = MAX_RT_PRIO-1 - p->rt_priority; 1942 prio = MAX_RT_PRIO-1 - p->rt_priority;
1943 else 1943 else
1944 prio = __normal_prio(p); 1944 prio = __normal_prio(p);
1945 return prio; 1945 return prio;
1946 } 1946 }
1947 1947
1948 /* 1948 /*
1949 * Calculate the current priority, i.e. the priority 1949 * Calculate the current priority, i.e. the priority
1950 * taken into account by the scheduler. This value might 1950 * taken into account by the scheduler. This value might
1951 * be boosted by RT tasks, or might be boosted by 1951 * be boosted by RT tasks, or might be boosted by
1952 * interactivity modifiers. Will be RT if the task got 1952 * interactivity modifiers. Will be RT if the task got
1953 * RT-boosted. If not then it returns p->normal_prio. 1953 * RT-boosted. If not then it returns p->normal_prio.
1954 */ 1954 */
1955 static int effective_prio(struct task_struct *p) 1955 static int effective_prio(struct task_struct *p)
1956 { 1956 {
1957 p->normal_prio = normal_prio(p); 1957 p->normal_prio = normal_prio(p);
1958 /* 1958 /*
1959 * If we are RT tasks or we were boosted to RT priority, 1959 * If we are RT tasks or we were boosted to RT priority,
1960 * keep the priority unchanged. Otherwise, update priority 1960 * keep the priority unchanged. Otherwise, update priority
1961 * to the normal priority: 1961 * to the normal priority:
1962 */ 1962 */
1963 if (!rt_prio(p->prio)) 1963 if (!rt_prio(p->prio))
1964 return p->normal_prio; 1964 return p->normal_prio;
1965 return p->prio; 1965 return p->prio;
1966 } 1966 }
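
normal_prio() folds the two priority spaces into one scale: an RT task with rt_priority R gets prio MAX_RT_PRIO-1 - R (higher rt_priority means a numerically lower, i.e. stronger, prio), while a SCHED_NORMAL task keeps its static_prio; effective_prio() then preserves the current prio if the task has been RT-boosted. A small sketch of the mapping, assuming the usual MAX_RT_PRIO of 100 and the conventional nice-to-prio offset of 120; both are assumptions for the example.

#include <stdio.h>

#define MAX_RT_PRIO_EX 100                              /* assumed */
#define NICE_TO_PRIO_EX(nice) (MAX_RT_PRIO_EX + (nice) + 20)

static int normal_prio_example(int is_rt, int rt_priority, int static_prio)
{
    if (is_rt)
        return MAX_RT_PRIO_EX - 1 - rt_priority;
    return static_prio;
}

int main(void)
{
    /* rt_priority 99 -> prio 0 (strongest), rt_priority 1 -> prio 98. */
    printf("%d %d\n", normal_prio_example(1, 99, 0), normal_prio_example(1, 1, 0));

    /* nice 0 -> prio 120, nice -20 -> 100, nice 19 -> 139. */
    printf("%d %d %d\n",
           normal_prio_example(0, 0, NICE_TO_PRIO_EX(0)),
           normal_prio_example(0, 0, NICE_TO_PRIO_EX(-20)),
           normal_prio_example(0, 0, NICE_TO_PRIO_EX(19)));
    return 0;
}
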
1967 1967
1968 /** 1968 /**
1969 * task_curr - is this task currently executing on a CPU? 1969 * task_curr - is this task currently executing on a CPU?
1970 * @p: the task in question. 1970 * @p: the task in question.
1971 */ 1971 */
1972 inline int task_curr(const struct task_struct *p) 1972 inline int task_curr(const struct task_struct *p)
1973 { 1973 {
1974 return cpu_curr(task_cpu(p)) == p; 1974 return cpu_curr(task_cpu(p)) == p;
1975 } 1975 }
1976 1976
1977 static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1977 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1978 const struct sched_class *prev_class, 1978 const struct sched_class *prev_class,
1979 int oldprio, int running) 1979 int oldprio, int running)
1980 { 1980 {
1981 if (prev_class != p->sched_class) { 1981 if (prev_class != p->sched_class) {
1982 if (prev_class->switched_from) 1982 if (prev_class->switched_from)
1983 prev_class->switched_from(rq, p, running); 1983 prev_class->switched_from(rq, p, running);
1984 p->sched_class->switched_to(rq, p, running); 1984 p->sched_class->switched_to(rq, p, running);
1985 } else 1985 } else
1986 p->sched_class->prio_changed(rq, p, oldprio, running); 1986 p->sched_class->prio_changed(rq, p, oldprio, running);
1987 } 1987 }
1988 1988
1989 #ifdef CONFIG_SMP 1989 #ifdef CONFIG_SMP
1990 /* 1990 /*
1991 * Is this task likely cache-hot: 1991 * Is this task likely cache-hot:
1992 */ 1992 */
1993 static int 1993 static int
1994 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 1994 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1995 { 1995 {
1996 s64 delta; 1996 s64 delta;
1997 1997
1998 if (p->sched_class != &fair_sched_class) 1998 if (p->sched_class != &fair_sched_class)
1999 return 0; 1999 return 0;
2000 2000
2001 /* 2001 /*
2002 * Buddy candidates are cache hot: 2002 * Buddy candidates are cache hot:
2003 */ 2003 */
2004 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && 2004 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2005 (&p->se == cfs_rq_of(&p->se)->next || 2005 (&p->se == cfs_rq_of(&p->se)->next ||
2006 &p->se == cfs_rq_of(&p->se)->last)) 2006 &p->se == cfs_rq_of(&p->se)->last))
2007 return 1; 2007 return 1;
2008 2008
2009 if (sysctl_sched_migration_cost == -1) 2009 if (sysctl_sched_migration_cost == -1)
2010 return 1; 2010 return 1;
2011 if (sysctl_sched_migration_cost == 0) 2011 if (sysctl_sched_migration_cost == 0)
2012 return 0; 2012 return 0;
2013 2013
2014 delta = now - p->se.exec_start; 2014 delta = now - p->se.exec_start;
2015 2015
2016 return delta < (s64)sysctl_sched_migration_cost; 2016 return delta < (s64)sysctl_sched_migration_cost;
2017 } 2017 }
2018 2018
2019 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2019 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2020 { 2020 {
2021 #ifdef CONFIG_SCHED_DEBUG 2021 #ifdef CONFIG_SCHED_DEBUG
2022 /* 2022 /*
2023 * We should never call set_task_cpu() on a blocked task, 2023 * We should never call set_task_cpu() on a blocked task,
2024 * ttwu() will sort out the placement. 2024 * ttwu() will sort out the placement.
2025 */ 2025 */
2026 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 2026 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2027 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2027 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2028 #endif 2028 #endif
2029 2029
2030 trace_sched_migrate_task(p, new_cpu); 2030 trace_sched_migrate_task(p, new_cpu);
2031 2031
2032 if (task_cpu(p) != new_cpu) { 2032 if (task_cpu(p) != new_cpu) {
2033 p->se.nr_migrations++; 2033 p->se.nr_migrations++;
2034 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); 2034 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
2035 } 2035 }
2036 2036
2037 __set_task_cpu(p, new_cpu); 2037 __set_task_cpu(p, new_cpu);
2038 } 2038 }
2039 2039
2040 struct migration_req { 2040 struct migration_req {
2041 struct list_head list; 2041 struct list_head list;
2042 2042
2043 struct task_struct *task; 2043 struct task_struct *task;
2044 int dest_cpu; 2044 int dest_cpu;
2045 2045
2046 struct completion done; 2046 struct completion done;
2047 }; 2047 };
2048 2048
2049 /* 2049 /*
2050 * The task's runqueue lock must be held. 2050 * The task's runqueue lock must be held.
2051 * Returns true if you have to wait for migration thread. 2051 * Returns true if you have to wait for migration thread.
2052 */ 2052 */
2053 static int 2053 static int
2054 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) 2054 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2055 { 2055 {
2056 struct rq *rq = task_rq(p); 2056 struct rq *rq = task_rq(p);
2057 2057
2058 /* 2058 /*
2059 * If the task is not on a runqueue (and not running), then 2059 * If the task is not on a runqueue (and not running), then
2060 * the next wake-up will properly place the task. 2060 * the next wake-up will properly place the task.
2061 */ 2061 */
2062 if (!p->se.on_rq && !task_running(rq, p)) 2062 if (!p->se.on_rq && !task_running(rq, p))
2063 return 0; 2063 return 0;
2064 2064
2065 init_completion(&req->done); 2065 init_completion(&req->done);
2066 req->task = p; 2066 req->task = p;
2067 req->dest_cpu = dest_cpu; 2067 req->dest_cpu = dest_cpu;
2068 list_add(&req->list, &rq->migration_queue); 2068 list_add(&req->list, &rq->migration_queue);
2069 2069
2070 return 1; 2070 return 1;
2071 } 2071 }
2072 2072
2073 /* 2073 /*
2074 * wait_task_context_switch - wait for a thread to complete at least one 2074 * wait_task_context_switch - wait for a thread to complete at least one
2075 * context switch. 2075 * context switch.
2076 * 2076 *
2077 * @p must not be current. 2077 * @p must not be current.
2078 */ 2078 */
2079 void wait_task_context_switch(struct task_struct *p) 2079 void wait_task_context_switch(struct task_struct *p)
2080 { 2080 {
2081 unsigned long nvcsw, nivcsw, flags; 2081 unsigned long nvcsw, nivcsw, flags;
2082 int running; 2082 int running;
2083 struct rq *rq; 2083 struct rq *rq;
2084 2084
2085 nvcsw = p->nvcsw; 2085 nvcsw = p->nvcsw;
2086 nivcsw = p->nivcsw; 2086 nivcsw = p->nivcsw;
2087 for (;;) { 2087 for (;;) {
2088 /* 2088 /*
2089 * The runqueue is assigned before the actual context 2089 * The runqueue is assigned before the actual context
2090 * switch. We need to take the runqueue lock. 2090 * switch. We need to take the runqueue lock.
2091 * 2091 *
2092 * We could check initially without the lock but it is 2092 * We could check initially without the lock but it is
2093 * very likely that we need to take the lock in every 2093 * very likely that we need to take the lock in every
2094 * iteration. 2094 * iteration.
2095 */ 2095 */
2096 rq = task_rq_lock(p, &flags); 2096 rq = task_rq_lock(p, &flags);
2097 running = task_running(rq, p); 2097 running = task_running(rq, p);
2098 task_rq_unlock(rq, &flags); 2098 task_rq_unlock(rq, &flags);
2099 2099
2100 if (likely(!running)) 2100 if (likely(!running))
2101 break; 2101 break;
2102 /* 2102 /*
2103 * The switch count is incremented before the actual 2103 * The switch count is incremented before the actual
2104 * context switch. We thus wait for two switches to be 2104 * context switch. We thus wait for two switches to be
2105 * sure at least one completed. 2105 * sure at least one completed.
2106 */ 2106 */
2107 if ((p->nvcsw - nvcsw) > 1) 2107 if ((p->nvcsw - nvcsw) > 1)
2108 break; 2108 break;
2109 if ((p->nivcsw - nivcsw) > 1) 2109 if ((p->nivcsw - nivcsw) > 1)
2110 break; 2110 break;
2111 2111
2112 cpu_relax(); 2112 cpu_relax();
2113 } 2113 }
2114 } 2114 }
2115 2115
2116 /* 2116 /*
2117 * wait_task_inactive - wait for a thread to unschedule. 2117 * wait_task_inactive - wait for a thread to unschedule.
2118 * 2118 *
2119 * If @match_state is nonzero, it's the @p->state value just checked and 2119 * If @match_state is nonzero, it's the @p->state value just checked and
2120 * not expected to change. If it changes, i.e. @p might have woken up, 2120 * not expected to change. If it changes, i.e. @p might have woken up,
2121 * then return zero. When we succeed in waiting for @p to be off its CPU, 2121 * then return zero. When we succeed in waiting for @p to be off its CPU,
2122 * we return a positive number (its total switch count). If a second call 2122 * we return a positive number (its total switch count). If a second call
2123 * a short while later returns the same number, the caller can be sure that 2123 * a short while later returns the same number, the caller can be sure that
2124 * @p has remained unscheduled the whole time. 2124 * @p has remained unscheduled the whole time.
2125 * 2125 *
2126 * The caller must ensure that the task *will* unschedule sometime soon, 2126 * The caller must ensure that the task *will* unschedule sometime soon,
2127 * else this function might spin for a *long* time. This function can't 2127 * else this function might spin for a *long* time. This function can't
2128 * be called with interrupts off, or it may introduce deadlock with 2128 * be called with interrupts off, or it may introduce deadlock with
2129 * smp_call_function() if an IPI is sent by the same process we are 2129 * smp_call_function() if an IPI is sent by the same process we are
2130 * waiting to become inactive. 2130 * waiting to become inactive.
2131 */ 2131 */
2132 unsigned long wait_task_inactive(struct task_struct *p, long match_state) 2132 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2133 { 2133 {
2134 unsigned long flags; 2134 unsigned long flags;
2135 int running, on_rq; 2135 int running, on_rq;
2136 unsigned long ncsw; 2136 unsigned long ncsw;
2137 struct rq *rq; 2137 struct rq *rq;
2138 2138
2139 for (;;) { 2139 for (;;) {
2140 /* 2140 /*
2141 * We do the initial early heuristics without holding 2141 * We do the initial early heuristics without holding
2142 * any task-queue locks at all. We'll only try to get 2142 * any task-queue locks at all. We'll only try to get
2143 * the runqueue lock when things look like they will 2143 * the runqueue lock when things look like they will
2144 * work out! 2144 * work out!
2145 */ 2145 */
2146 rq = task_rq(p); 2146 rq = task_rq(p);
2147 2147
2148 /* 2148 /*
2149 * If the task is actively running on another CPU 2149 * If the task is actively running on another CPU
2150 * still, just relax and busy-wait without holding 2150 * still, just relax and busy-wait without holding
2151 * any locks. 2151 * any locks.
2152 * 2152 *
2153 * NOTE! Since we don't hold any locks, it's not 2153 * NOTE! Since we don't hold any locks, it's not
2154 * even sure that "rq" stays as the right runqueue! 2154 * even sure that "rq" stays as the right runqueue!
2155 * But we don't care, since "task_running()" will 2155 * But we don't care, since "task_running()" will
2156 * return false if the runqueue has changed and p 2156 * return false if the runqueue has changed and p
2157 * is actually now running somewhere else! 2157 * is actually now running somewhere else!
2158 */ 2158 */
2159 while (task_running(rq, p)) { 2159 while (task_running(rq, p)) {
2160 if (match_state && unlikely(p->state != match_state)) 2160 if (match_state && unlikely(p->state != match_state))
2161 return 0; 2161 return 0;
2162 cpu_relax(); 2162 cpu_relax();
2163 } 2163 }
2164 2164
2165 /* 2165 /*
2166 * Ok, time to look more closely! We need the rq 2166 * Ok, time to look more closely! We need the rq
2167 * lock now, to be *sure*. If we're wrong, we'll 2167 * lock now, to be *sure*. If we're wrong, we'll
2168 * just go back and repeat. 2168 * just go back and repeat.
2169 */ 2169 */
2170 rq = task_rq_lock(p, &flags); 2170 rq = task_rq_lock(p, &flags);
2171 trace_sched_wait_task(rq, p); 2171 trace_sched_wait_task(rq, p);
2172 running = task_running(rq, p); 2172 running = task_running(rq, p);
2173 on_rq = p->se.on_rq; 2173 on_rq = p->se.on_rq;
2174 ncsw = 0; 2174 ncsw = 0;
2175 if (!match_state || p->state == match_state) 2175 if (!match_state || p->state == match_state)
2176 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2176 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2177 task_rq_unlock(rq, &flags); 2177 task_rq_unlock(rq, &flags);
2178 2178
2179 /* 2179 /*
2180 * If it changed from the expected state, bail out now. 2180 * If it changed from the expected state, bail out now.
2181 */ 2181 */
2182 if (unlikely(!ncsw)) 2182 if (unlikely(!ncsw))
2183 break; 2183 break;
2184 2184
2185 /* 2185 /*
2186 * Was it really running after all now that we 2186 * Was it really running after all now that we
2187 * checked with the proper locks actually held? 2187 * checked with the proper locks actually held?
2188 * 2188 *
2189 * Oops. Go back and try again. 2189 * Oops. Go back and try again.
2190 */ 2190 */
2191 if (unlikely(running)) { 2191 if (unlikely(running)) {
2192 cpu_relax(); 2192 cpu_relax();
2193 continue; 2193 continue;
2194 } 2194 }
2195 2195
2196 /* 2196 /*
2197 * It's not enough that it's not actively running, 2197 * It's not enough that it's not actively running,
2198 * it must be off the runqueue _entirely_, and not 2198 * it must be off the runqueue _entirely_, and not
2199 * preempted! 2199 * preempted!
2200 * 2200 *
2201 * So if it was still runnable (but just not actively 2201 * So if it was still runnable (but just not actively
2202 * running right now), it's preempted, and we should 2202 * running right now), it's preempted, and we should
2203 * yield - it could be a while. 2203 * yield - it could be a while.
2204 */ 2204 */
2205 if (unlikely(on_rq)) { 2205 if (unlikely(on_rq)) {
2206 schedule_timeout_uninterruptible(1); 2206 schedule_timeout_uninterruptible(1);
2207 continue; 2207 continue;
2208 } 2208 }
2209 2209
2210 /* 2210 /*
2211 * Ahh, all good. It wasn't running, and it wasn't 2211 * Ahh, all good. It wasn't running, and it wasn't
2212 * runnable, which means that it will never become 2212 * runnable, which means that it will never become
2213 * running in the future either. We're all done! 2213 * running in the future either. We're all done!
2214 */ 2214 */
2215 break; 2215 break;
2216 } 2216 }
2217 2217
2218 return ncsw; 2218 return ncsw;
2219 } 2219 }
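The kerneldoc above implies a call-twice pattern: take the switch count, do whatever work assumes @p stays off its CPU, then call again and compare. A minimal sketch of that usage follows; do_while_task_inactive() and the work callback are hypothetical illustrations, not part of this file.

	static int do_while_task_inactive(struct task_struct *p, long state,
					  void (*work)(struct task_struct *))
	{
		unsigned long ncsw = wait_task_inactive(p, state);

		if (!ncsw)
			return -EAGAIN;	/* state changed; @p may have woken up */

		work(p);		/* assumes @p remains off its CPU */

		if (wait_task_inactive(p, state) != ncsw)
			return -EAGAIN;	/* @p was scheduled in between; retry */

		return 0;
	}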
2220 2220
2221 /*** 2221 /***
2222 * kick_process - kick a running thread to enter/exit the kernel 2222 * kick_process - kick a running thread to enter/exit the kernel
2223 * @p: the to-be-kicked thread 2223 * @p: the to-be-kicked thread
2224 * 2224 *
2225 * Cause a process which is running on another CPU to enter 2225 * Cause a process which is running on another CPU to enter
2226 * kernel-mode, without any delay. (to get signals handled.) 2226 * kernel-mode, without any delay. (to get signals handled.)
2227 * 2227 *
2228 * NOTE: this function doesn't have to take the runqueue lock, 2228 * NOTE: this function doesn't have to take the runqueue lock,
2229 * because all it wants to ensure is that the remote task enters 2229 * because all it wants to ensure is that the remote task enters
2230 * the kernel. If the IPI races and the task has been migrated 2230 * the kernel. If the IPI races and the task has been migrated
2231 * to another CPU then no harm is done and the purpose has been 2231 * to another CPU then no harm is done and the purpose has been
2232 * achieved as well. 2232 * achieved as well.
2233 */ 2233 */
2234 void kick_process(struct task_struct *p) 2234 void kick_process(struct task_struct *p)
2235 { 2235 {
2236 int cpu; 2236 int cpu;
2237 2237
2238 preempt_disable(); 2238 preempt_disable();
2239 cpu = task_cpu(p); 2239 cpu = task_cpu(p);
2240 if ((cpu != smp_processor_id()) && task_curr(p)) 2240 if ((cpu != smp_processor_id()) && task_curr(p))
2241 smp_send_reschedule(cpu); 2241 smp_send_reschedule(cpu);
2242 preempt_enable(); 2242 preempt_enable();
2243 } 2243 }
2244 EXPORT_SYMBOL_GPL(kick_process); 2244 EXPORT_SYMBOL_GPL(kick_process);
2245 #endif /* CONFIG_SMP */ 2245 #endif /* CONFIG_SMP */
2246 2246
2247 /** 2247 /**
2248 * task_oncpu_function_call - call a function on the cpu on which a task runs 2248 * task_oncpu_function_call - call a function on the cpu on which a task runs
2249 * @p: the task to evaluate 2249 * @p: the task to evaluate
2250 * @func: the function to be called 2250 * @func: the function to be called
2251 * @info: the function call argument 2251 * @info: the function call argument
2252 * 2252 *
2253 * Calls the function @func when the task is currently running. This might 2253 * Calls the function @func when the task is currently running. This might
2254 * be on the current CPU, which just calls the function directly 2254 * be on the current CPU, which just calls the function directly
2255 */ 2255 */
2256 void task_oncpu_function_call(struct task_struct *p, 2256 void task_oncpu_function_call(struct task_struct *p,
2257 void (*func) (void *info), void *info) 2257 void (*func) (void *info), void *info)
2258 { 2258 {
2259 int cpu; 2259 int cpu;
2260 2260
2261 preempt_disable(); 2261 preempt_disable();
2262 cpu = task_cpu(p); 2262 cpu = task_cpu(p);
2263 if (task_curr(p)) 2263 if (task_curr(p))
2264 smp_call_function_single(cpu, func, info, 1); 2264 smp_call_function_single(cpu, func, info, 1);
2265 preempt_enable(); 2265 preempt_enable();
2266 } 2266 }
2267 2267
2268 #ifdef CONFIG_SMP 2268 #ifdef CONFIG_SMP
2269 /* 2269 /*
2270 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2270 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
2271 */ 2271 */
2272 static int select_fallback_rq(int cpu, struct task_struct *p) 2272 static int select_fallback_rq(int cpu, struct task_struct *p)
2273 { 2273 {
2274 int dest_cpu; 2274 int dest_cpu;
2275 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); 2275 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
2276 2276
2277 /* Look for allowed, online CPU in same node. */ 2277 /* Look for allowed, online CPU in same node. */
2278 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) 2278 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2279 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 2279 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
2280 return dest_cpu; 2280 return dest_cpu;
2281 2281
2282 /* Any allowed, online CPU? */ 2282 /* Any allowed, online CPU? */
2283 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); 2283 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
2284 if (dest_cpu < nr_cpu_ids) 2284 if (dest_cpu < nr_cpu_ids)
2285 return dest_cpu; 2285 return dest_cpu;
2286 2286
2287 /* No more Mr. Nice Guy. */ 2287 /* No more Mr. Nice Guy. */
2288 if (unlikely(dest_cpu >= nr_cpu_ids)) { 2288 if (unlikely(dest_cpu >= nr_cpu_ids)) {
2289 dest_cpu = cpuset_cpus_allowed_fallback(p); 2289 dest_cpu = cpuset_cpus_allowed_fallback(p);
2290 /* 2290 /*
2291 * Don't tell them about moving exiting tasks or 2291 * Don't tell them about moving exiting tasks or
2292 * kernel threads (both mm NULL), since they never 2292 * kernel threads (both mm NULL), since they never
2293 * leave kernel. 2293 * leave kernel.
2294 */ 2294 */
2295 if (p->mm && printk_ratelimit()) { 2295 if (p->mm && printk_ratelimit()) {
2296 printk(KERN_INFO "process %d (%s) no " 2296 printk(KERN_INFO "process %d (%s) no "
2297 "longer affine to cpu%d\n", 2297 "longer affine to cpu%d\n",
2298 task_pid_nr(p), p->comm, cpu); 2298 task_pid_nr(p), p->comm, cpu);
2299 } 2299 }
2300 } 2300 }
2301 2301
2302 return dest_cpu; 2302 return dest_cpu;
2303 } 2303 }
2304 2304
2305 /* 2305 /*
2306 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. 2306 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
2307 */ 2307 */
2308 static inline 2308 static inline
2309 int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) 2309 int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
2310 { 2310 {
2311 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); 2311 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
2312 2312
2313 /* 2313 /*
2314 * In order not to call set_task_cpu() on a blocking task we need 2314 * In order not to call set_task_cpu() on a blocking task we need
2315 * to rely on ttwu() to place the task on a valid ->cpus_allowed 2315 * to rely on ttwu() to place the task on a valid ->cpus_allowed
2316 * cpu. 2316 * cpu.
2317 * 2317 *
2318 * Since this is common to all placement strategies, this lives here. 2318 * Since this is common to all placement strategies, this lives here.
2319 * 2319 *
2320 * [ this allows ->select_task_rq() to simply return task_cpu(p) and 2320 * [ this allows ->select_task_rq() to simply return task_cpu(p) and
2321 * not worry about this generic constraint ] 2321 * not worry about this generic constraint ]
2322 */ 2322 */
2323 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || 2323 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
2324 !cpu_online(cpu))) 2324 !cpu_online(cpu)))
2325 cpu = select_fallback_rq(task_cpu(p), p); 2325 cpu = select_fallback_rq(task_cpu(p), p);
2326 2326
2327 return cpu; 2327 return cpu;
2328 } 2328 }
2329 2329
2330 static void update_avg(u64 *avg, u64 sample) 2330 static void update_avg(u64 *avg, u64 sample)
2331 { 2331 {
2332 s64 diff = sample - *avg; 2332 s64 diff = sample - *avg;
2333 *avg += diff >> 3; 2333 *avg += diff >> 3;
2334 } 2334 }
2335 #endif 2335 #endif
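update_avg() is a fixed-point exponential moving average: the >> 3 moves the stored value one eighth of the remaining distance toward each new sample, which is how rq->avg_idle tracks recent idle durations further down. A standalone illustration in ordinary user-space C (mirroring the helper above, not kernel code):

	#include <stdio.h>
	#include <stdint.h>

	static void update_avg(uint64_t *avg, uint64_t sample)
	{
		int64_t diff = sample - *avg;

		*avg += diff >> 3;	/* close 1/8 of the gap per sample */
	}

	int main(void)
	{
		uint64_t avg = 0;
		int i;

		for (i = 0; i < 4; i++)
			update_avg(&avg, 800);	/* constant "idle duration" samples */

		printf("%llu\n", (unsigned long long)avg);	/* prints 330, converging toward 800 */
		return 0;
	}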
2336 2336
2337 /*** 2337 /***
2338 * try_to_wake_up - wake up a thread 2338 * try_to_wake_up - wake up a thread
2339 * @p: the to-be-woken-up thread 2339 * @p: the to-be-woken-up thread
2340 * @state: the mask of task states that can be woken 2340 * @state: the mask of task states that can be woken
2341 * @sync: do a synchronous wakeup? 2341 * @sync: do a synchronous wakeup?
2342 * 2342 *
2343 * Put it on the run-queue if it's not already there. The "current" 2343 * Put it on the run-queue if it's not already there. The "current"
2344 * thread is always on the run-queue (except when the actual 2344 * thread is always on the run-queue (except when the actual
2345 * re-schedule is in progress), and as such you're allowed to do 2345 * re-schedule is in progress), and as such you're allowed to do
2346 * the simpler "current->state = TASK_RUNNING" to mark yourself 2346 * the simpler "current->state = TASK_RUNNING" to mark yourself
2347 * runnable without the overhead of this. 2347 * runnable without the overhead of this.
2348 * 2348 *
2349 * returns failure only if the task is already active. 2349 * returns failure only if the task is already active.
2350 */ 2350 */
2351 static int try_to_wake_up(struct task_struct *p, unsigned int state, 2351 static int try_to_wake_up(struct task_struct *p, unsigned int state,
2352 int wake_flags) 2352 int wake_flags)
2353 { 2353 {
2354 int cpu, orig_cpu, this_cpu, success = 0; 2354 int cpu, orig_cpu, this_cpu, success = 0;
2355 unsigned long flags; 2355 unsigned long flags;
2356 unsigned long en_flags = ENQUEUE_WAKEUP; 2356 unsigned long en_flags = ENQUEUE_WAKEUP;
2357 struct rq *rq; 2357 struct rq *rq;
2358 2358
2359 this_cpu = get_cpu(); 2359 this_cpu = get_cpu();
2360 2360
2361 smp_wmb(); 2361 smp_wmb();
2362 rq = task_rq_lock(p, &flags); 2362 rq = task_rq_lock(p, &flags);
2363 if (!(p->state & state)) 2363 if (!(p->state & state))
2364 goto out; 2364 goto out;
2365 2365
2366 if (p->se.on_rq) 2366 if (p->se.on_rq)
2367 goto out_running; 2367 goto out_running;
2368 2368
2369 cpu = task_cpu(p); 2369 cpu = task_cpu(p);
2370 orig_cpu = cpu; 2370 orig_cpu = cpu;
2371 2371
2372 #ifdef CONFIG_SMP 2372 #ifdef CONFIG_SMP
2373 if (unlikely(task_running(rq, p))) 2373 if (unlikely(task_running(rq, p)))
2374 goto out_activate; 2374 goto out_activate;
2375 2375
2376 /* 2376 /*
2377 * In order to handle concurrent wakeups and release the rq->lock 2377 * In order to handle concurrent wakeups and release the rq->lock
2378 * we put the task in TASK_WAKING state. 2378 * we put the task in TASK_WAKING state.
2379 * 2379 *
2380 * First fix up the nr_uninterruptible count: 2380 * First fix up the nr_uninterruptible count:
2381 */ 2381 */
2382 if (task_contributes_to_load(p)) { 2382 if (task_contributes_to_load(p)) {
2383 if (likely(cpu_online(orig_cpu))) 2383 if (likely(cpu_online(orig_cpu)))
2384 rq->nr_uninterruptible--; 2384 rq->nr_uninterruptible--;
2385 else 2385 else
2386 this_rq()->nr_uninterruptible--; 2386 this_rq()->nr_uninterruptible--;
2387 } 2387 }
2388 p->state = TASK_WAKING; 2388 p->state = TASK_WAKING;
2389 2389
2390 if (p->sched_class->task_waking) { 2390 if (p->sched_class->task_waking) {
2391 p->sched_class->task_waking(rq, p); 2391 p->sched_class->task_waking(rq, p);
2392 en_flags |= ENQUEUE_WAKING; 2392 en_flags |= ENQUEUE_WAKING;
2393 } 2393 }
2394 2394
2395 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); 2395 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2396 if (cpu != orig_cpu) 2396 if (cpu != orig_cpu)
2397 set_task_cpu(p, cpu); 2397 set_task_cpu(p, cpu);
2398 __task_rq_unlock(rq); 2398 __task_rq_unlock(rq);
2399 2399
2400 rq = cpu_rq(cpu); 2400 rq = cpu_rq(cpu);
2401 raw_spin_lock(&rq->lock); 2401 raw_spin_lock(&rq->lock);
2402 2402
2403 /* 2403 /*
2404 * We migrated the task without holding either rq->lock, however 2404 * We migrated the task without holding either rq->lock, however
2405 * since the task is not on the task list itself, nobody else 2405 * since the task is not on the task list itself, nobody else
2406 * will try and migrate the task, hence the rq should match the 2406 * will try and migrate the task, hence the rq should match the
2407 * cpu we just moved it to. 2407 * cpu we just moved it to.
2408 */ 2408 */
2409 WARN_ON(task_cpu(p) != cpu); 2409 WARN_ON(task_cpu(p) != cpu);
2410 WARN_ON(p->state != TASK_WAKING); 2410 WARN_ON(p->state != TASK_WAKING);
2411 2411
2412 #ifdef CONFIG_SCHEDSTATS 2412 #ifdef CONFIG_SCHEDSTATS
2413 schedstat_inc(rq, ttwu_count); 2413 schedstat_inc(rq, ttwu_count);
2414 if (cpu == this_cpu) 2414 if (cpu == this_cpu)
2415 schedstat_inc(rq, ttwu_local); 2415 schedstat_inc(rq, ttwu_local);
2416 else { 2416 else {
2417 struct sched_domain *sd; 2417 struct sched_domain *sd;
2418 for_each_domain(this_cpu, sd) { 2418 for_each_domain(this_cpu, sd) {
2419 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 2419 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2420 schedstat_inc(sd, ttwu_wake_remote); 2420 schedstat_inc(sd, ttwu_wake_remote);
2421 break; 2421 break;
2422 } 2422 }
2423 } 2423 }
2424 } 2424 }
2425 #endif /* CONFIG_SCHEDSTATS */ 2425 #endif /* CONFIG_SCHEDSTATS */
2426 2426
2427 out_activate: 2427 out_activate:
2428 #endif /* CONFIG_SMP */ 2428 #endif /* CONFIG_SMP */
2429 schedstat_inc(p, se.statistics.nr_wakeups); 2429 schedstat_inc(p, se.statistics.nr_wakeups);
2430 if (wake_flags & WF_SYNC) 2430 if (wake_flags & WF_SYNC)
2431 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2431 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2432 if (orig_cpu != cpu) 2432 if (orig_cpu != cpu)
2433 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 2433 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2434 if (cpu == this_cpu) 2434 if (cpu == this_cpu)
2435 schedstat_inc(p, se.statistics.nr_wakeups_local); 2435 schedstat_inc(p, se.statistics.nr_wakeups_local);
2436 else 2436 else
2437 schedstat_inc(p, se.statistics.nr_wakeups_remote); 2437 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2438 activate_task(rq, p, en_flags); 2438 activate_task(rq, p, en_flags);
2439 success = 1; 2439 success = 1;
2440 2440
2441 out_running: 2441 out_running:
2442 trace_sched_wakeup(rq, p, success); 2442 trace_sched_wakeup(rq, p, success);
2443 check_preempt_curr(rq, p, wake_flags); 2443 check_preempt_curr(rq, p, wake_flags);
2444 2444
2445 p->state = TASK_RUNNING; 2445 p->state = TASK_RUNNING;
2446 #ifdef CONFIG_SMP 2446 #ifdef CONFIG_SMP
2447 if (p->sched_class->task_woken) 2447 if (p->sched_class->task_woken)
2448 p->sched_class->task_woken(rq, p); 2448 p->sched_class->task_woken(rq, p);
2449 2449
2450 if (unlikely(rq->idle_stamp)) { 2450 if (unlikely(rq->idle_stamp)) {
2451 u64 delta = rq->clock - rq->idle_stamp; 2451 u64 delta = rq->clock - rq->idle_stamp;
2452 u64 max = 2*sysctl_sched_migration_cost; 2452 u64 max = 2*sysctl_sched_migration_cost;
2453 2453
2454 if (delta > max) 2454 if (delta > max)
2455 rq->avg_idle = max; 2455 rq->avg_idle = max;
2456 else 2456 else
2457 update_avg(&rq->avg_idle, delta); 2457 update_avg(&rq->avg_idle, delta);
2458 rq->idle_stamp = 0; 2458 rq->idle_stamp = 0;
2459 } 2459 }
2460 #endif 2460 #endif
2461 out: 2461 out:
2462 task_rq_unlock(rq, &flags); 2462 task_rq_unlock(rq, &flags);
2463 put_cpu(); 2463 put_cpu();
2464 2464
2465 return success; 2465 return success;
2466 } 2466 }
2467 2467
2468 /** 2468 /**
2469 * wake_up_process - Wake up a specific process 2469 * wake_up_process - Wake up a specific process
2470 * @p: The process to be woken up. 2470 * @p: The process to be woken up.
2471 * 2471 *
2472 * Attempt to wake up the nominated process and move it to the set of runnable 2472 * Attempt to wake up the nominated process and move it to the set of runnable
2473 * processes. Returns 1 if the process was woken up, 0 if it was already 2473 * processes. Returns 1 if the process was woken up, 0 if it was already
2474 * running. 2474 * running.
2475 * 2475 *
2476 * It may be assumed that this function implies a write memory barrier before 2476 * It may be assumed that this function implies a write memory barrier before
2477 * changing the task state if and only if any tasks are woken up. 2477 * changing the task state if and only if any tasks are woken up.
2478 */ 2478 */
2479 int wake_up_process(struct task_struct *p) 2479 int wake_up_process(struct task_struct *p)
2480 { 2480 {
2481 return try_to_wake_up(p, TASK_ALL, 0); 2481 return try_to_wake_up(p, TASK_ALL, 0);
2482 } 2482 }
2483 EXPORT_SYMBOL(wake_up_process); 2483 EXPORT_SYMBOL(wake_up_process);
2484 2484
2485 int wake_up_state(struct task_struct *p, unsigned int state) 2485 int wake_up_state(struct task_struct *p, unsigned int state)
2486 { 2486 {
2487 return try_to_wake_up(p, state, 0); 2487 return try_to_wake_up(p, state, 0);
2488 } 2488 }
2489 2489
2490 /* 2490 /*
2491 * Perform scheduler related setup for a newly forked process p. 2491 * Perform scheduler related setup for a newly forked process p.
2492 * p is forked by current. 2492 * p is forked by current.
2493 * 2493 *
2494 * __sched_fork() is basic setup used by init_idle() too: 2494 * __sched_fork() is basic setup used by init_idle() too:
2495 */ 2495 */
2496 static void __sched_fork(struct task_struct *p) 2496 static void __sched_fork(struct task_struct *p)
2497 { 2497 {
2498 p->se.exec_start = 0; 2498 p->se.exec_start = 0;
2499 p->se.sum_exec_runtime = 0; 2499 p->se.sum_exec_runtime = 0;
2500 p->se.prev_sum_exec_runtime = 0; 2500 p->se.prev_sum_exec_runtime = 0;
2501 p->se.nr_migrations = 0; 2501 p->se.nr_migrations = 0;
2502 2502
2503 #ifdef CONFIG_SCHEDSTATS 2503 #ifdef CONFIG_SCHEDSTATS
2504 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2504 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2505 #endif 2505 #endif
2506 2506
2507 INIT_LIST_HEAD(&p->rt.run_list); 2507 INIT_LIST_HEAD(&p->rt.run_list);
2508 p->se.on_rq = 0; 2508 p->se.on_rq = 0;
2509 INIT_LIST_HEAD(&p->se.group_node); 2509 INIT_LIST_HEAD(&p->se.group_node);
2510 2510
2511 #ifdef CONFIG_PREEMPT_NOTIFIERS 2511 #ifdef CONFIG_PREEMPT_NOTIFIERS
2512 INIT_HLIST_HEAD(&p->preempt_notifiers); 2512 INIT_HLIST_HEAD(&p->preempt_notifiers);
2513 #endif 2513 #endif
2514 } 2514 }
2515 2515
2516 /* 2516 /*
2517 * fork()/clone()-time setup: 2517 * fork()/clone()-time setup:
2518 */ 2518 */
2519 void sched_fork(struct task_struct *p, int clone_flags) 2519 void sched_fork(struct task_struct *p, int clone_flags)
2520 { 2520 {
2521 int cpu = get_cpu(); 2521 int cpu = get_cpu();
2522 2522
2523 __sched_fork(p); 2523 __sched_fork(p);
2524 /* 2524 /*
2525 * We mark the process as running here. This guarantees that 2525 * We mark the process as running here. This guarantees that
2526 * nobody will actually run it, and a signal or other external 2526 * nobody will actually run it, and a signal or other external
2527 * event cannot wake it up and insert it on the runqueue either. 2527 * event cannot wake it up and insert it on the runqueue either.
2528 */ 2528 */
2529 p->state = TASK_RUNNING; 2529 p->state = TASK_RUNNING;
2530 2530
2531 /* 2531 /*
2532 * Revert to default priority/policy on fork if requested. 2532 * Revert to default priority/policy on fork if requested.
2533 */ 2533 */
2534 if (unlikely(p->sched_reset_on_fork)) { 2534 if (unlikely(p->sched_reset_on_fork)) {
2535 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { 2535 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
2536 p->policy = SCHED_NORMAL; 2536 p->policy = SCHED_NORMAL;
2537 p->normal_prio = p->static_prio; 2537 p->normal_prio = p->static_prio;
2538 } 2538 }
2539 2539
2540 if (PRIO_TO_NICE(p->static_prio) < 0) { 2540 if (PRIO_TO_NICE(p->static_prio) < 0) {
2541 p->static_prio = NICE_TO_PRIO(0); 2541 p->static_prio = NICE_TO_PRIO(0);
2542 p->normal_prio = p->static_prio; 2542 p->normal_prio = p->static_prio;
2543 set_load_weight(p); 2543 set_load_weight(p);
2544 } 2544 }
2545 2545
2546 /* 2546 /*
2547 * We don't need the reset flag anymore after the fork. It has 2547 * We don't need the reset flag anymore after the fork. It has
2548 * fulfilled its duty: 2548 * fulfilled its duty:
2549 */ 2549 */
2550 p->sched_reset_on_fork = 0; 2550 p->sched_reset_on_fork = 0;
2551 } 2551 }
2552 2552
2553 /* 2553 /*
2554 * Make sure we do not leak PI boosting priority to the child. 2554 * Make sure we do not leak PI boosting priority to the child.
2555 */ 2555 */
2556 p->prio = current->normal_prio; 2556 p->prio = current->normal_prio;
2557 2557
2558 if (!rt_prio(p->prio)) 2558 if (!rt_prio(p->prio))
2559 p->sched_class = &fair_sched_class; 2559 p->sched_class = &fair_sched_class;
2560 2560
2561 if (p->sched_class->task_fork) 2561 if (p->sched_class->task_fork)
2562 p->sched_class->task_fork(p); 2562 p->sched_class->task_fork(p);
2563 2563
2564 set_task_cpu(p, cpu); 2564 set_task_cpu(p, cpu);
2565 2565
2566 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2566 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2567 if (likely(sched_info_on())) 2567 if (likely(sched_info_on()))
2568 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2568 memset(&p->sched_info, 0, sizeof(p->sched_info));
2569 #endif 2569 #endif
2570 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2570 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
2571 p->oncpu = 0; 2571 p->oncpu = 0;
2572 #endif 2572 #endif
2573 #ifdef CONFIG_PREEMPT 2573 #ifdef CONFIG_PREEMPT
2574 /* Want to start with kernel preemption disabled. */ 2574 /* Want to start with kernel preemption disabled. */
2575 task_thread_info(p)->preempt_count = 1; 2575 task_thread_info(p)->preempt_count = 1;
2576 #endif 2576 #endif
2577 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2577 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2578 2578
2579 put_cpu(); 2579 put_cpu();
2580 } 2580 }
2581 2581
2582 /* 2582 /*
2583 * wake_up_new_task - wake up a newly created task for the first time. 2583 * wake_up_new_task - wake up a newly created task for the first time.
2584 * 2584 *
2585 * This function will do some initial scheduler statistics housekeeping 2585 * This function will do some initial scheduler statistics housekeeping
2586 * that must be done for every newly created context, then puts the task 2586 * that must be done for every newly created context, then puts the task
2587 * on the runqueue and wakes it. 2587 * on the runqueue and wakes it.
2588 */ 2588 */
2589 void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2589 void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2590 { 2590 {
2591 unsigned long flags; 2591 unsigned long flags;
2592 struct rq *rq; 2592 struct rq *rq;
2593 int cpu __maybe_unused = get_cpu(); 2593 int cpu __maybe_unused = get_cpu();
2594 2594
2595 #ifdef CONFIG_SMP 2595 #ifdef CONFIG_SMP
2596 rq = task_rq_lock(p, &flags); 2596 rq = task_rq_lock(p, &flags);
2597 p->state = TASK_WAKING; 2597 p->state = TASK_WAKING;
2598 2598
2599 /* 2599 /*
2600 * Fork balancing, do it here and not earlier because: 2600 * Fork balancing, do it here and not earlier because:
2601 * - cpus_allowed can change in the fork path 2601 * - cpus_allowed can change in the fork path
2602 * - any previously selected cpu might disappear through hotplug 2602 * - any previously selected cpu might disappear through hotplug
2603 * 2603 *
2604 * We set TASK_WAKING so that select_task_rq() can drop rq->lock 2604 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2605 * without people poking at ->cpus_allowed. 2605 * without people poking at ->cpus_allowed.
2606 */ 2606 */
2607 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); 2607 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
2608 set_task_cpu(p, cpu); 2608 set_task_cpu(p, cpu);
2609 2609
2610 p->state = TASK_RUNNING; 2610 p->state = TASK_RUNNING;
2611 task_rq_unlock(rq, &flags); 2611 task_rq_unlock(rq, &flags);
2612 #endif 2612 #endif
2613 2613
2614 rq = task_rq_lock(p, &flags); 2614 rq = task_rq_lock(p, &flags);
2615 activate_task(rq, p, 0); 2615 activate_task(rq, p, 0);
2616 trace_sched_wakeup_new(rq, p, 1); 2616 trace_sched_wakeup_new(rq, p, 1);
2617 check_preempt_curr(rq, p, WF_FORK); 2617 check_preempt_curr(rq, p, WF_FORK);
2618 #ifdef CONFIG_SMP 2618 #ifdef CONFIG_SMP
2619 if (p->sched_class->task_woken) 2619 if (p->sched_class->task_woken)
2620 p->sched_class->task_woken(rq, p); 2620 p->sched_class->task_woken(rq, p);
2621 #endif 2621 #endif
2622 task_rq_unlock(rq, &flags); 2622 task_rq_unlock(rq, &flags);
2623 put_cpu(); 2623 put_cpu();
2624 } 2624 }
2625 2625
2626 #ifdef CONFIG_PREEMPT_NOTIFIERS 2626 #ifdef CONFIG_PREEMPT_NOTIFIERS
2627 2627
2628 /** 2628 /**
2629 * preempt_notifier_register - tell me when current is being preempted & rescheduled 2629 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2630 * @notifier: notifier struct to register 2630 * @notifier: notifier struct to register
2631 */ 2631 */
2632 void preempt_notifier_register(struct preempt_notifier *notifier) 2632 void preempt_notifier_register(struct preempt_notifier *notifier)
2633 { 2633 {
2634 hlist_add_head(&notifier->link, &current->preempt_notifiers); 2634 hlist_add_head(&notifier->link, &current->preempt_notifiers);
2635 } 2635 }
2636 EXPORT_SYMBOL_GPL(preempt_notifier_register); 2636 EXPORT_SYMBOL_GPL(preempt_notifier_register);
2637 2637
2638 /** 2638 /**
2639 * preempt_notifier_unregister - no longer interested in preemption notifications 2639 * preempt_notifier_unregister - no longer interested in preemption notifications
2640 * @notifier: notifier struct to unregister 2640 * @notifier: notifier struct to unregister
2641 * 2641 *
2642 * This is safe to call from within a preemption notifier. 2642 * This is safe to call from within a preemption notifier.
2643 */ 2643 */
2644 void preempt_notifier_unregister(struct preempt_notifier *notifier) 2644 void preempt_notifier_unregister(struct preempt_notifier *notifier)
2645 { 2645 {
2646 hlist_del(&notifier->link); 2646 hlist_del(&notifier->link);
2647 } 2647 }
2648 EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 2648 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2649 2649
2650 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2650 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2651 { 2651 {
2652 struct preempt_notifier *notifier; 2652 struct preempt_notifier *notifier;
2653 struct hlist_node *node; 2653 struct hlist_node *node;
2654 2654
2655 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2655 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2656 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 2656 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2657 } 2657 }
2658 2658
2659 static void 2659 static void
2660 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2660 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2661 struct task_struct *next) 2661 struct task_struct *next)
2662 { 2662 {
2663 struct preempt_notifier *notifier; 2663 struct preempt_notifier *notifier;
2664 struct hlist_node *node; 2664 struct hlist_node *node;
2665 2665
2666 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2666 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2667 notifier->ops->sched_out(notifier, next); 2667 notifier->ops->sched_out(notifier, next);
2668 } 2668 }
2669 2669
2670 #else /* !CONFIG_PREEMPT_NOTIFIERS */ 2670 #else /* !CONFIG_PREEMPT_NOTIFIERS */
2671 2671
2672 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2672 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2673 { 2673 {
2674 } 2674 }
2675 2675
2676 static void 2676 static void
2677 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2677 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2678 struct task_struct *next) 2678 struct task_struct *next)
2679 { 2679 {
2680 } 2680 }
2681 2681
2682 #endif /* CONFIG_PREEMPT_NOTIFIERS */ 2682 #endif /* CONFIG_PREEMPT_NOTIFIERS */
2683 2683
2684 /** 2684 /**
2685 * prepare_task_switch - prepare to switch tasks 2685 * prepare_task_switch - prepare to switch tasks
2686 * @rq: the runqueue preparing to switch 2686 * @rq: the runqueue preparing to switch
2687 * @prev: the current task that is being switched out 2687 * @prev: the current task that is being switched out
2688 * @next: the task we are going to switch to. 2688 * @next: the task we are going to switch to.
2689 * 2689 *
2690 * This is called with the rq lock held and interrupts off. It must 2690 * This is called with the rq lock held and interrupts off. It must
2691 * be paired with a subsequent finish_task_switch after the context 2691 * be paired with a subsequent finish_task_switch after the context
2692 * switch. 2692 * switch.
2693 * 2693 *
2694 * prepare_task_switch sets up locking and calls architecture specific 2694 * prepare_task_switch sets up locking and calls architecture specific
2695 * hooks. 2695 * hooks.
2696 */ 2696 */
2697 static inline void 2697 static inline void
2698 prepare_task_switch(struct rq *rq, struct task_struct *prev, 2698 prepare_task_switch(struct rq *rq, struct task_struct *prev,
2699 struct task_struct *next) 2699 struct task_struct *next)
2700 { 2700 {
2701 fire_sched_out_preempt_notifiers(prev, next); 2701 fire_sched_out_preempt_notifiers(prev, next);
2702 prepare_lock_switch(rq, next); 2702 prepare_lock_switch(rq, next);
2703 prepare_arch_switch(next); 2703 prepare_arch_switch(next);
2704 } 2704 }
2705 2705
2706 /** 2706 /**
2707 * finish_task_switch - clean up after a task-switch 2707 * finish_task_switch - clean up after a task-switch
2708 * @rq: runqueue associated with task-switch 2708 * @rq: runqueue associated with task-switch
2709 * @prev: the thread we just switched away from. 2709 * @prev: the thread we just switched away from.
2710 * 2710 *
2711 * finish_task_switch must be called after the context switch, paired 2711 * finish_task_switch must be called after the context switch, paired
2712 * with a prepare_task_switch call before the context switch. 2712 * with a prepare_task_switch call before the context switch.
2713 * finish_task_switch will reconcile locking set up by prepare_task_switch, 2713 * finish_task_switch will reconcile locking set up by prepare_task_switch,
2714 * and do any other architecture-specific cleanup actions. 2714 * and do any other architecture-specific cleanup actions.
2715 * 2715 *
2716 * Note that we may have delayed dropping an mm in context_switch(). If 2716 * Note that we may have delayed dropping an mm in context_switch(). If
2717 * so, we finish that here outside of the runqueue lock. (Doing it 2717 * so, we finish that here outside of the runqueue lock. (Doing it
2718 * with the lock held can cause deadlocks; see schedule() for 2718 * with the lock held can cause deadlocks; see schedule() for
2719 * details.) 2719 * details.)
2720 */ 2720 */
2721 static void finish_task_switch(struct rq *rq, struct task_struct *prev) 2721 static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2722 __releases(rq->lock) 2722 __releases(rq->lock)
2723 { 2723 {
2724 struct mm_struct *mm = rq->prev_mm; 2724 struct mm_struct *mm = rq->prev_mm;
2725 long prev_state; 2725 long prev_state;
2726 2726
2727 rq->prev_mm = NULL; 2727 rq->prev_mm = NULL;
2728 2728
2729 /* 2729 /*
2730 * A task struct has one reference for the use as "current". 2730 * A task struct has one reference for the use as "current".
2731 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 2731 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
2732 * schedule one last time. The schedule call will never return, and 2732 * schedule one last time. The schedule call will never return, and
2733 * the scheduled task must drop that reference. 2733 * the scheduled task must drop that reference.
2734 * The test for TASK_DEAD must occur while the runqueue locks are 2734 * The test for TASK_DEAD must occur while the runqueue locks are
2735 * still held, otherwise prev could be scheduled on another cpu, die 2735 * still held, otherwise prev could be scheduled on another cpu, die
2736 * there before we look at prev->state, and then the reference would 2736 * there before we look at prev->state, and then the reference would
2737 * be dropped twice. 2737 * be dropped twice.
2738 * Manfred Spraul <manfred@colorfullife.com> 2738 * Manfred Spraul <manfred@colorfullife.com>
2739 */ 2739 */
2740 prev_state = prev->state; 2740 prev_state = prev->state;
2741 finish_arch_switch(prev); 2741 finish_arch_switch(prev);
2742 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 2742 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2743 local_irq_disable(); 2743 local_irq_disable();
2744 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 2744 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2745 perf_event_task_sched_in(current); 2745 perf_event_task_sched_in(current);
2746 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 2746 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2747 local_irq_enable(); 2747 local_irq_enable();
2748 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 2748 #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2749 finish_lock_switch(rq, prev); 2749 finish_lock_switch(rq, prev);
2750 2750
2751 fire_sched_in_preempt_notifiers(current); 2751 fire_sched_in_preempt_notifiers(current);
2752 if (mm) 2752 if (mm)
2753 mmdrop(mm); 2753 mmdrop(mm);
2754 if (unlikely(prev_state == TASK_DEAD)) { 2754 if (unlikely(prev_state == TASK_DEAD)) {
2755 /* 2755 /*
2756 * Remove function-return probe instances associated with this 2756 * Remove function-return probe instances associated with this
2757 * task and put them back on the free list. 2757 * task and put them back on the free list.
2758 */ 2758 */
2759 kprobe_flush_task(prev); 2759 kprobe_flush_task(prev);
2760 put_task_struct(prev); 2760 put_task_struct(prev);
2761 } 2761 }
2762 } 2762 }
2763 2763
2764 #ifdef CONFIG_SMP 2764 #ifdef CONFIG_SMP
2765 2765
2766 /* assumes rq->lock is held */ 2766 /* assumes rq->lock is held */
2767 static inline void pre_schedule(struct rq *rq, struct task_struct *prev) 2767 static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2768 { 2768 {
2769 if (prev->sched_class->pre_schedule) 2769 if (prev->sched_class->pre_schedule)
2770 prev->sched_class->pre_schedule(rq, prev); 2770 prev->sched_class->pre_schedule(rq, prev);
2771 } 2771 }
2772 2772
2773 /* rq->lock is NOT held, but preemption is disabled */ 2773 /* rq->lock is NOT held, but preemption is disabled */
2774 static inline void post_schedule(struct rq *rq) 2774 static inline void post_schedule(struct rq *rq)
2775 { 2775 {
2776 if (rq->post_schedule) { 2776 if (rq->post_schedule) {
2777 unsigned long flags; 2777 unsigned long flags;
2778 2778
2779 raw_spin_lock_irqsave(&rq->lock, flags); 2779 raw_spin_lock_irqsave(&rq->lock, flags);
2780 if (rq->curr->sched_class->post_schedule) 2780 if (rq->curr->sched_class->post_schedule)
2781 rq->curr->sched_class->post_schedule(rq); 2781 rq->curr->sched_class->post_schedule(rq);
2782 raw_spin_unlock_irqrestore(&rq->lock, flags); 2782 raw_spin_unlock_irqrestore(&rq->lock, flags);
2783 2783
2784 rq->post_schedule = 0; 2784 rq->post_schedule = 0;
2785 } 2785 }
2786 } 2786 }
2787 2787
2788 #else 2788 #else
2789 2789
2790 static inline void pre_schedule(struct rq *rq, struct task_struct *p) 2790 static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2791 { 2791 {
2792 } 2792 }
2793 2793
2794 static inline void post_schedule(struct rq *rq) 2794 static inline void post_schedule(struct rq *rq)
2795 { 2795 {
2796 } 2796 }
2797 2797
2798 #endif 2798 #endif
2799 2799
2800 /** 2800 /**
2801 * schedule_tail - first thing a freshly forked thread must call. 2801 * schedule_tail - first thing a freshly forked thread must call.
2802 * @prev: the thread we just switched away from. 2802 * @prev: the thread we just switched away from.
2803 */ 2803 */
2804 asmlinkage void schedule_tail(struct task_struct *prev) 2804 asmlinkage void schedule_tail(struct task_struct *prev)
2805 __releases(rq->lock) 2805 __releases(rq->lock)
2806 { 2806 {
2807 struct rq *rq = this_rq(); 2807 struct rq *rq = this_rq();
2808 2808
2809 finish_task_switch(rq, prev); 2809 finish_task_switch(rq, prev);
2810 2810
2811 /* 2811 /*
2812 * FIXME: do we need to worry about rq being invalidated by the 2812 * FIXME: do we need to worry about rq being invalidated by the
2813 * task_switch? 2813 * task_switch?
2814 */ 2814 */
2815 post_schedule(rq); 2815 post_schedule(rq);
2816 2816
2817 #ifdef __ARCH_WANT_UNLOCKED_CTXSW 2817 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
2818 /* In this case, finish_task_switch does not reenable preemption */ 2818 /* In this case, finish_task_switch does not reenable preemption */
2819 preempt_enable(); 2819 preempt_enable();
2820 #endif 2820 #endif
2821 if (current->set_child_tid) 2821 if (current->set_child_tid)
2822 put_user(task_pid_vnr(current), current->set_child_tid); 2822 put_user(task_pid_vnr(current), current->set_child_tid);
2823 } 2823 }
2824 2824
2825 /* 2825 /*
2826 * context_switch - switch to the new MM and the new 2826 * context_switch - switch to the new MM and the new
2827 * thread's register state. 2827 * thread's register state.
2828 */ 2828 */
2829 static inline void 2829 static inline void
2830 context_switch(struct rq *rq, struct task_struct *prev, 2830 context_switch(struct rq *rq, struct task_struct *prev,
2831 struct task_struct *next) 2831 struct task_struct *next)
2832 { 2832 {
2833 struct mm_struct *mm, *oldmm; 2833 struct mm_struct *mm, *oldmm;
2834 2834
2835 prepare_task_switch(rq, prev, next); 2835 prepare_task_switch(rq, prev, next);
2836 trace_sched_switch(rq, prev, next); 2836 trace_sched_switch(rq, prev, next);
2837 mm = next->mm; 2837 mm = next->mm;
2838 oldmm = prev->active_mm; 2838 oldmm = prev->active_mm;
2839 /* 2839 /*
2840 * For paravirt, this is coupled with an exit in switch_to to 2840 * For paravirt, this is coupled with an exit in switch_to to
2841 * combine the page table reload and the switch backend into 2841 * combine the page table reload and the switch backend into
2842 * one hypercall. 2842 * one hypercall.
2843 */ 2843 */
2844 arch_start_context_switch(prev); 2844 arch_start_context_switch(prev);
2845 2845
2846 if (likely(!mm)) { 2846 if (likely(!mm)) {
2847 next->active_mm = oldmm; 2847 next->active_mm = oldmm;
2848 atomic_inc(&oldmm->mm_count); 2848 atomic_inc(&oldmm->mm_count);
2849 enter_lazy_tlb(oldmm, next); 2849 enter_lazy_tlb(oldmm, next);
2850 } else 2850 } else
2851 switch_mm(oldmm, mm, next); 2851 switch_mm(oldmm, mm, next);
2852 2852
2853 if (likely(!prev->mm)) { 2853 if (likely(!prev->mm)) {
2854 prev->active_mm = NULL; 2854 prev->active_mm = NULL;
2855 rq->prev_mm = oldmm; 2855 rq->prev_mm = oldmm;
2856 } 2856 }
2857 /* 2857 /*
2858 * Since the runqueue lock will be released by the next 2858 * Since the runqueue lock will be released by the next
2859 * task (which is an invalid locking op but in the case 2859 * task (which is an invalid locking op but in the case
2860 * of the scheduler it's an obvious special-case), so we 2860 * of the scheduler it's an obvious special-case), so we
2861 * do an early lockdep release here: 2861 * do an early lockdep release here:
2862 */ 2862 */
2863 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 2863 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
2864 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2864 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2865 #endif 2865 #endif
2866 2866
2867 /* Here we just switch the register state and the stack. */ 2867 /* Here we just switch the register state and the stack. */
2868 switch_to(prev, next, prev); 2868 switch_to(prev, next, prev);
2869 2869
2870 barrier(); 2870 barrier();
2871 /* 2871 /*
2872 * this_rq must be evaluated again because prev may have moved 2872 * this_rq must be evaluated again because prev may have moved
2873 * CPUs since it called schedule(), thus the 'rq' on its stack 2873 * CPUs since it called schedule(), thus the 'rq' on its stack
2874 * frame will be invalid. 2874 * frame will be invalid.
2875 */ 2875 */
2876 finish_task_switch(this_rq(), prev); 2876 finish_task_switch(this_rq(), prev);
2877 } 2877 }
2878 2878
2879 /* 2879 /*
2880 * nr_running, nr_uninterruptible and nr_context_switches: 2880 * nr_running, nr_uninterruptible and nr_context_switches:
2881 * 2881 *
2882 * externally visible scheduler statistics: current number of runnable 2882 * externally visible scheduler statistics: current number of runnable
2883 * threads, current number of uninterruptible-sleeping threads, total 2883 * threads, current number of uninterruptible-sleeping threads, total
2884 * number of context switches performed since bootup. 2884 * number of context switches performed since bootup.
2885 */ 2885 */
2886 unsigned long nr_running(void) 2886 unsigned long nr_running(void)
2887 { 2887 {
2888 unsigned long i, sum = 0; 2888 unsigned long i, sum = 0;
2889 2889
2890 for_each_online_cpu(i) 2890 for_each_online_cpu(i)
2891 sum += cpu_rq(i)->nr_running; 2891 sum += cpu_rq(i)->nr_running;
2892 2892
2893 return sum; 2893 return sum;
2894 } 2894 }
2895 2895
2896 unsigned long nr_uninterruptible(void) 2896 unsigned long nr_uninterruptible(void)
2897 { 2897 {
2898 unsigned long i, sum = 0; 2898 unsigned long i, sum = 0;
2899 2899
2900 for_each_possible_cpu(i) 2900 for_each_possible_cpu(i)
2901 sum += cpu_rq(i)->nr_uninterruptible; 2901 sum += cpu_rq(i)->nr_uninterruptible;
2902 2902
2903 /* 2903 /*
2904 * Since we read the counters lockless, it might be slightly 2904 * Since we read the counters lockless, it might be slightly
2905 * inaccurate. Do not allow it to go below zero though: 2905 * inaccurate. Do not allow it to go below zero though:
2906 */ 2906 */
2907 if (unlikely((long)sum < 0)) 2907 if (unlikely((long)sum < 0))
2908 sum = 0; 2908 sum = 0;
2909 2909
2910 return sum; 2910 return sum;
2911 } 2911 }
2912 2912
2913 unsigned long long nr_context_switches(void) 2913 unsigned long long nr_context_switches(void)
2914 { 2914 {
2915 int i; 2915 int i;
2916 unsigned long long sum = 0; 2916 unsigned long long sum = 0;
2917 2917
2918 for_each_possible_cpu(i) 2918 for_each_possible_cpu(i)
2919 sum += cpu_rq(i)->nr_switches; 2919 sum += cpu_rq(i)->nr_switches;
2920 2920
2921 return sum; 2921 return sum;
2922 } 2922 }
2923 2923
2924 unsigned long nr_iowait(void) 2924 unsigned long nr_iowait(void)
2925 { 2925 {
2926 unsigned long i, sum = 0; 2926 unsigned long i, sum = 0;
2927 2927
2928 for_each_possible_cpu(i) 2928 for_each_possible_cpu(i)
2929 sum += atomic_read(&cpu_rq(i)->nr_iowait); 2929 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2930 2930
2931 return sum; 2931 return sum;
2932 } 2932 }
2933 2933
2934 unsigned long nr_iowait_cpu(void) 2934 unsigned long nr_iowait_cpu(void)
2935 { 2935 {
2936 struct rq *this = this_rq(); 2936 struct rq *this = this_rq();
2937 return atomic_read(&this->nr_iowait); 2937 return atomic_read(&this->nr_iowait);
2938 } 2938 }
2939 2939
2940 unsigned long this_cpu_load(void) 2940 unsigned long this_cpu_load(void)
2941 { 2941 {
2942 struct rq *this = this_rq(); 2942 struct rq *this = this_rq();
2943 return this->cpu_load[0]; 2943 return this->cpu_load[0];
2944 } 2944 }
2945 2945
2946 2946
2947 /* Variables and functions for calc_load */ 2947 /* Variables and functions for calc_load */
2948 static atomic_long_t calc_load_tasks; 2948 static atomic_long_t calc_load_tasks;
2949 static unsigned long calc_load_update; 2949 static unsigned long calc_load_update;
2950 unsigned long avenrun[3]; 2950 unsigned long avenrun[3];
2951 EXPORT_SYMBOL(avenrun); 2951 EXPORT_SYMBOL(avenrun);
2952 2952
2953 static long calc_load_fold_active(struct rq *this_rq)
2954 {
2955 long nr_active, delta = 0;
2956
2957 nr_active = this_rq->nr_running;
2958 nr_active += (long) this_rq->nr_uninterruptible;
2959
2960 if (nr_active != this_rq->calc_load_active) {
2961 delta = nr_active - this_rq->calc_load_active;
2962 this_rq->calc_load_active = nr_active;
2963 }
2964
2965 return delta;
2966 }
2967
2968 #ifdef CONFIG_NO_HZ
2969 /*
2970 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
2971 *
2972 * When making the ILB scale, we should try to pull this in as well.
2973 */
2974 static atomic_long_t calc_load_tasks_idle;
2975
2976 static void calc_load_account_idle(struct rq *this_rq)
2977 {
2978 long delta;
2979
2980 delta = calc_load_fold_active(this_rq);
2981 if (delta)
2982 atomic_long_add(delta, &calc_load_tasks_idle);
2983 }
2984
2985 static long calc_load_fold_idle(void)
2986 {
2987 long delta = 0;
2988
2989 /*
2990 * It's got a race, but we don't care...
2991 */
2992 if (atomic_long_read(&calc_load_tasks_idle))
2993 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
2994
2995 return delta;
2996 }
2997 #else
2998 static void calc_load_account_idle(struct rq *this_rq)
2999 {
3000 }
3001
3002 static inline long calc_load_fold_idle(void)
3003 {
3004 return 0;
3005 }
3006 #endif
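These two helpers carry the fix: a CPU entering NO_HZ idle parks its delta in calc_load_tasks_idle instead of touching calc_load_tasks mid-period, and the next CPU that reaches its LOAD_FREQ window folds the parked amount in together with its own delta. Below is a single-threaded mock of that bookkeeping, with plain longs standing in for the atomic_long_t counters and illustrative function names; it is not the kernel code paths themselves.

	#include <stdio.h>

	static long calc_load_tasks;		/* global active count sampled later */
	static long calc_load_tasks_idle;	/* deltas parked by CPUs going NO_HZ idle */

	/* A CPU going NO_HZ idle parks its delta instead of applying it now. */
	static void cpu_goes_nohz_idle(long delta)
	{
		calc_load_tasks_idle += delta;
	}

	/* A busy CPU hitting its LOAD_FREQ window folds its delta plus the parked ones. */
	static void cpu_load_freq_sample(long own_delta)
	{
		long delta = own_delta + calc_load_tasks_idle;

		calc_load_tasks_idle = 0;
		calc_load_tasks += delta;
	}

	int main(void)
	{
		calc_load_tasks = 4;		/* 4 tasks currently contributing to load */

		cpu_goes_nohz_idle(-2);		/* CPU1 idles; 2 tasks left its runqueue */
		printf("%ld\n", calc_load_tasks);	/* still 4: no premature decrement */

		cpu_load_freq_sample(+1);	/* CPU0's LOAD_FREQ tick folds everything */
		printf("%ld\n", calc_load_tasks);	/* now 3 */
		return 0;
	}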
3007
2953 /** 3008 /**
2954 * get_avenrun - get the load average array 3009 * get_avenrun - get the load average array
2955 * @loads: pointer to dest load array 3010 * @loads: pointer to dest load array
2956 * @offset: offset to add 3011 * @offset: offset to add
2957 * @shift: shift count to shift the result left 3012 * @shift: shift count to shift the result left
2958 * 3013 *
2959 * These values are estimates at best, so no need for locking. 3014 * These values are estimates at best, so no need for locking.
2960 */ 3015 */
2961 void get_avenrun(unsigned long *loads, unsigned long offset, int shift) 3016 void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2962 { 3017 {
2963 loads[0] = (avenrun[0] + offset) << shift; 3018 loads[0] = (avenrun[0] + offset) << shift;
2964 loads[1] = (avenrun[1] + offset) << shift; 3019 loads[1] = (avenrun[1] + offset) << shift;
2965 loads[2] = (avenrun[2] + offset) << shift; 3020 loads[2] = (avenrun[2] + offset) << shift;
2966 } 3021 }
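avenrun[] is kept in FIXED_1 fixed point (1 << FSHIFT), so callers pass an offset for rounding and a shift for their own scale. As a rough sketch of how a reader such as /proc/loadavg turns these into the familiar two-decimal numbers; the LOAD_INT/LOAD_FRAC helpers are defined locally here for illustration:

	#define LOAD_INT(x)  ((x) >> FSHIFT)
	#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

	unsigned long avnrun[3];

	get_avenrun(avnrun, FIXED_1 / 200, 0);	/* offset rounds to two decimals */
	printk(KERN_INFO "loadavg: %lu.%02lu %lu.%02lu %lu.%02lu\n",
	       LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
	       LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
	       LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]));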
2967 3022
2968 static unsigned long 3023 static unsigned long
2969 calc_load(unsigned long load, unsigned long exp, unsigned long active) 3024 calc_load(unsigned long load, unsigned long exp, unsigned long active)
2970 { 3025 {
2971 load *= exp; 3026 load *= exp;
2972 load += active * (FIXED_1 - exp); 3027 load += active * (FIXED_1 - exp);
2973 return load >> FSHIFT; 3028 return load >> FSHIFT;
2974 } 3029 }
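calc_load() is the classic fixed-point exponentially weighted average: the old value decays by EXP/FIXED_1 per LOAD_FREQ sample and the current active count fills the remainder. A standalone worked example in ordinary C, using the stock EXP_1/FIXED_1 constants (1884 and 2048): with a 1-minute average of 1.00 and two active tasks, one sample moves the average to about 1.08.

	#include <stdio.h>

	#define FSHIFT	11
	#define FIXED_1	(1UL << FSHIFT)		/* 2048 == 1.00 in fixed point */
	#define EXP_1	1884			/* 1/exp(5sec/1min), same scale */

	int main(void)
	{
		unsigned long load = 1 * FIXED_1;	/* current 1-min average: 1.00 */
		unsigned long active = 2 * FIXED_1;	/* two runnable/uninterruptible tasks */

		load = (load * EXP_1 + active * (FIXED_1 - EXP_1)) >> FSHIFT;

		printf("%lu.%02lu\n", load >> FSHIFT,
		       ((load & (FIXED_1 - 1)) * 100) >> FSHIFT);	/* prints 1.08 */
		return 0;
	}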
2975 3030
2976 /* 3031 /*
2977 * calc_global_load - update the avenrun load estimates 10 ticks after the 3032 * calc_global_load - update the avenrun load estimates 10 ticks after the
2978 * CPUs have updated calc_load_tasks. 3033 * CPUs have updated calc_load_tasks.
2979 */ 3034 */
2980 void calc_global_load(void) 3035 void calc_global_load(void)
2981 { 3036 {
2982 unsigned long upd = calc_load_update + 10; 3037 unsigned long upd = calc_load_update + 10;
2983 long active; 3038 long active;
2984 3039
2985 if (time_before(jiffies, upd)) 3040 if (time_before(jiffies, upd))
2986 return; 3041 return;
2987 3042
2988 active = atomic_long_read(&calc_load_tasks); 3043 active = atomic_long_read(&calc_load_tasks);
2989 active = active > 0 ? active * FIXED_1 : 0; 3044 active = active > 0 ? active * FIXED_1 : 0;
2990 3045
2991 avenrun[0] = calc_load(avenrun[0], EXP_1, active); 3046 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2992 avenrun[1] = calc_load(avenrun[1], EXP_5, active); 3047 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2993 avenrun[2] = calc_load(avenrun[2], EXP_15, active); 3048 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2994 3049
2995 calc_load_update += LOAD_FREQ; 3050 calc_load_update += LOAD_FREQ;
2996 } 3051 }
2997 3052
2998 /* 3053 /*
2999 * Either called from update_cpu_load() or from a cpu going idle 3054 * Called from update_cpu_load() to periodically update this CPU's
3055 * active count.
3000 */ 3056 */
3001 static void calc_load_account_active(struct rq *this_rq) 3057 static void calc_load_account_active(struct rq *this_rq)
3002 { 3058 {
3003 long nr_active, delta; 3059 long delta;
3004 3060
3005 nr_active = this_rq->nr_running; 3061 if (time_before(jiffies, this_rq->calc_load_update))
3006 nr_active += (long) this_rq->nr_uninterruptible; 3062 return;
3007 3063
3008 if (nr_active != this_rq->calc_load_active) { 3064 delta = calc_load_fold_active(this_rq);
3009 delta = nr_active - this_rq->calc_load_active; 3065 delta += calc_load_fold_idle();
3010 this_rq->calc_load_active = nr_active; 3066 if (delta)
3011 atomic_long_add(delta, &calc_load_tasks); 3067 atomic_long_add(delta, &calc_load_tasks);
3012 } 3068
3069 this_rq->calc_load_update += LOAD_FREQ;
3013 } 3070 }
3014 3071
3015 /* 3072 /*
3016 * Update rq->cpu_load[] statistics. This function is usually called every 3073 * Update rq->cpu_load[] statistics. This function is usually called every
3017 * scheduler tick (TICK_NSEC). 3074 * scheduler tick (TICK_NSEC).
3018 */ 3075 */
3019 static void update_cpu_load(struct rq *this_rq) 3076 static void update_cpu_load(struct rq *this_rq)
3020 { 3077 {
3021 unsigned long this_load = this_rq->load.weight; 3078 unsigned long this_load = this_rq->load.weight;
3022 int i, scale; 3079 int i, scale;
3023 3080
3024 this_rq->nr_load_updates++; 3081 this_rq->nr_load_updates++;
3025 3082
3026 /* Update our load: */ 3083 /* Update our load: */
3027 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 3084 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3028 unsigned long old_load, new_load; 3085 unsigned long old_load, new_load;
3029 3086
3030 /* scale is effectively 1 << i now, and >> i divides by scale */ 3087 /* scale is effectively 1 << i now, and >> i divides by scale */
3031 3088
3032 old_load = this_rq->cpu_load[i]; 3089 old_load = this_rq->cpu_load[i];
3033 new_load = this_load; 3090 new_load = this_load;
3034 /* 3091 /*
3035 * Round up the averaging division if load is increasing. This 3092 * Round up the averaging division if load is increasing. This
3036 * prevents us from getting stuck on 9 if the load is 10, for 3093 * prevents us from getting stuck on 9 if the load is 10, for
3037 * example. 3094 * example.
3038 */ 3095 */
3039 if (new_load > old_load) 3096 if (new_load > old_load)
3040 new_load += scale-1; 3097 new_load += scale-1;
3041 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3098 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3042 } 3099 }
3043 3100
3044 if (time_after_eq(jiffies, this_rq->calc_load_update)) { 3101 calc_load_account_active(this_rq);
3045 this_rq->calc_load_update += LOAD_FREQ;
3046 calc_load_account_active(this_rq);
3047 }
3048 } 3102 }
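To see what the scale-1 rounding above buys (a worked example, not part of the patch): for i = 2 the update is cpu_load[2] = (old * 3 + new) >> 2.

	/*
	 * old = 9, steady new load of 10:
	 *
	 *   without rounding: (9*3 + 10) >> 2 = 37 >> 2 = 9   (stuck forever)
	 *   with new += 3:    (9*3 + 13) >> 2 = 40 >> 2 = 10  (converges)
	 */

Larger i decays more slowly, so cpu_load[0..4] give the load balancer views of the run-queue weight averaged over progressively longer windows.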
3049 3103
3050 #ifdef CONFIG_SMP 3104 #ifdef CONFIG_SMP
3051 3105
3052 /* 3106 /*
3053 * sched_exec - execve() is a valuable balancing opportunity, because at 3107 * sched_exec - execve() is a valuable balancing opportunity, because at
3054 * this point the task has the smallest effective memory and cache footprint. 3108 * this point the task has the smallest effective memory and cache footprint.
3055 */ 3109 */
3056 void sched_exec(void) 3110 void sched_exec(void)
3057 { 3111 {
3058 struct task_struct *p = current; 3112 struct task_struct *p = current;
3059 struct migration_req req; 3113 struct migration_req req;
3060 unsigned long flags; 3114 unsigned long flags;
3061 struct rq *rq; 3115 struct rq *rq;
3062 int dest_cpu; 3116 int dest_cpu;
3063 3117
3064 rq = task_rq_lock(p, &flags); 3118 rq = task_rq_lock(p, &flags);
3065 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); 3119 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
3066 if (dest_cpu == smp_processor_id()) 3120 if (dest_cpu == smp_processor_id())
3067 goto unlock; 3121 goto unlock;
3068 3122
3069 /* 3123 /*
3070 * select_task_rq() can race against ->cpus_allowed 3124 * select_task_rq() can race against ->cpus_allowed
3071 */ 3125 */
3072 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && 3126 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3073 likely(cpu_active(dest_cpu)) && 3127 likely(cpu_active(dest_cpu)) &&
3074 migrate_task(p, dest_cpu, &req)) { 3128 migrate_task(p, dest_cpu, &req)) {
3075 /* Need to wait for migration thread (might exit: take ref). */ 3129 /* Need to wait for migration thread (might exit: take ref). */
3076 struct task_struct *mt = rq->migration_thread; 3130 struct task_struct *mt = rq->migration_thread;
3077 3131
3078 get_task_struct(mt); 3132 get_task_struct(mt);
3079 task_rq_unlock(rq, &flags); 3133 task_rq_unlock(rq, &flags);
3080 wake_up_process(mt); 3134 wake_up_process(mt);
3081 put_task_struct(mt); 3135 put_task_struct(mt);
3082 wait_for_completion(&req.done); 3136 wait_for_completion(&req.done);
3083 3137
3084 return; 3138 return;
3085 } 3139 }
3086 unlock: 3140 unlock:
3087 task_rq_unlock(rq, &flags); 3141 task_rq_unlock(rq, &flags);
3088 } 3142 }
3089 3143
3090 #endif 3144 #endif
3091 3145
3092 DEFINE_PER_CPU(struct kernel_stat, kstat); 3146 DEFINE_PER_CPU(struct kernel_stat, kstat);
3093 3147
3094 EXPORT_PER_CPU_SYMBOL(kstat); 3148 EXPORT_PER_CPU_SYMBOL(kstat);
3095 3149
3096 /* 3150 /*
3097 * Return any ns on the sched_clock that have not yet been accounted in 3151 * Return any ns on the sched_clock that have not yet been accounted in
3098 * @p in case that task is currently running. 3152 * @p in case that task is currently running.
3099 * 3153 *
3100 * Called with task_rq_lock() held on @rq. 3154 * Called with task_rq_lock() held on @rq.
3101 */ 3155 */
3102 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) 3156 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3103 { 3157 {
3104 u64 ns = 0; 3158 u64 ns = 0;
3105 3159
3106 if (task_current(rq, p)) { 3160 if (task_current(rq, p)) {
3107 update_rq_clock(rq); 3161 update_rq_clock(rq);
3108 ns = rq->clock - p->se.exec_start; 3162 ns = rq->clock - p->se.exec_start;
3109 if ((s64)ns < 0) 3163 if ((s64)ns < 0)
3110 ns = 0; 3164 ns = 0;
3111 } 3165 }
3112 3166
3113 return ns; 3167 return ns;
3114 } 3168 }
3115 3169
3116 unsigned long long task_delta_exec(struct task_struct *p) 3170 unsigned long long task_delta_exec(struct task_struct *p)
3117 { 3171 {
3118 unsigned long flags; 3172 unsigned long flags;
3119 struct rq *rq; 3173 struct rq *rq;
3120 u64 ns = 0; 3174 u64 ns = 0;
3121 3175
3122 rq = task_rq_lock(p, &flags); 3176 rq = task_rq_lock(p, &flags);
3123 ns = do_task_delta_exec(p, rq); 3177 ns = do_task_delta_exec(p, rq);
3124 task_rq_unlock(rq, &flags); 3178 task_rq_unlock(rq, &flags);
3125 3179
3126 return ns; 3180 return ns;
3127 } 3181 }
3128 3182
3129 /* 3183 /*
3130 * Return accounted runtime for the task. 3184 * Return accounted runtime for the task.
3131 * In case the task is currently running, return the runtime plus current's 3185 * In case the task is currently running, return the runtime plus current's
3132 * pending runtime that have not been accounted yet. 3186 * pending runtime that have not been accounted yet.
3133 */ 3187 */
3134 unsigned long long task_sched_runtime(struct task_struct *p) 3188 unsigned long long task_sched_runtime(struct task_struct *p)
3135 { 3189 {
3136 unsigned long flags; 3190 unsigned long flags;
3137 struct rq *rq; 3191 struct rq *rq;
3138 u64 ns = 0; 3192 u64 ns = 0;
3139 3193
3140 rq = task_rq_lock(p, &flags); 3194 rq = task_rq_lock(p, &flags);
3141 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 3195 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3142 task_rq_unlock(rq, &flags); 3196 task_rq_unlock(rq, &flags);
3143 3197
3144 return ns; 3198 return ns;
3145 } 3199 }
3146 3200
3147 /* 3201 /*
3148 * Return sum_exec_runtime for the thread group. 3202 * Return sum_exec_runtime for the thread group.
3149 * In case the task is currently running, return the sum plus current's 3203 * In case the task is currently running, return the sum plus current's
3150 * pending runtime that have not been accounted yet. 3204 * pending runtime that have not been accounted yet.
3151 * 3205 *
3152 * Note that the thread group might have other running tasks as well, 3206 * Note that the thread group might have other running tasks as well,
3153 * so the return value does not include other pending runtime that other 3207 * so the return value does not include other pending runtime that other
3154 * running tasks might have. 3208 * running tasks might have.
3155 */ 3209 */
3156 unsigned long long thread_group_sched_runtime(struct task_struct *p) 3210 unsigned long long thread_group_sched_runtime(struct task_struct *p)
3157 { 3211 {
3158 struct task_cputime totals; 3212 struct task_cputime totals;
3159 unsigned long flags; 3213 unsigned long flags;
3160 struct rq *rq; 3214 struct rq *rq;
3161 u64 ns; 3215 u64 ns;
3162 3216
3163 rq = task_rq_lock(p, &flags); 3217 rq = task_rq_lock(p, &flags);
3164 thread_group_cputime(p, &totals); 3218 thread_group_cputime(p, &totals);
3165 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 3219 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3166 task_rq_unlock(rq, &flags); 3220 task_rq_unlock(rq, &flags);
3167 3221
3168 return ns; 3222 return ns;
3169 } 3223 }
3170 3224
3171 /* 3225 /*
3172 * Account user cpu time to a process. 3226 * Account user cpu time to a process.
3173 * @p: the process that the cpu time gets accounted to 3227 * @p: the process that the cpu time gets accounted to
3174 * @cputime: the cpu time spent in user space since the last update 3228 * @cputime: the cpu time spent in user space since the last update
3175 * @cputime_scaled: cputime scaled by cpu frequency 3229 * @cputime_scaled: cputime scaled by cpu frequency
3176 */ 3230 */
3177 void account_user_time(struct task_struct *p, cputime_t cputime, 3231 void account_user_time(struct task_struct *p, cputime_t cputime,
3178 cputime_t cputime_scaled) 3232 cputime_t cputime_scaled)
3179 { 3233 {
3180 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3234 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3181 cputime64_t tmp; 3235 cputime64_t tmp;
3182 3236
3183 /* Add user time to process. */ 3237 /* Add user time to process. */
3184 p->utime = cputime_add(p->utime, cputime); 3238 p->utime = cputime_add(p->utime, cputime);
3185 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 3239 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3186 account_group_user_time(p, cputime); 3240 account_group_user_time(p, cputime);
3187 3241
3188 /* Add user time to cpustat. */ 3242 /* Add user time to cpustat. */
3189 tmp = cputime_to_cputime64(cputime); 3243 tmp = cputime_to_cputime64(cputime);
3190 if (TASK_NICE(p) > 0) 3244 if (TASK_NICE(p) > 0)
3191 cpustat->nice = cputime64_add(cpustat->nice, tmp); 3245 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3192 else 3246 else
3193 cpustat->user = cputime64_add(cpustat->user, tmp); 3247 cpustat->user = cputime64_add(cpustat->user, tmp);
3194 3248
3195 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); 3249 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
3196 /* Account for user time used */ 3250 /* Account for user time used */
3197 acct_update_integrals(p); 3251 acct_update_integrals(p);
3198 } 3252 }
3199 3253
3200 /* 3254 /*
3201 * Account guest cpu time to a process. 3255 * Account guest cpu time to a process.
3202 * @p: the process that the cpu time gets accounted to 3256 * @p: the process that the cpu time gets accounted to
3203 * @cputime: the cpu time spent in virtual machine since the last update 3257 * @cputime: the cpu time spent in virtual machine since the last update
3204 * @cputime_scaled: cputime scaled by cpu frequency 3258 * @cputime_scaled: cputime scaled by cpu frequency
3205 */ 3259 */
3206 static void account_guest_time(struct task_struct *p, cputime_t cputime, 3260 static void account_guest_time(struct task_struct *p, cputime_t cputime,
3207 cputime_t cputime_scaled) 3261 cputime_t cputime_scaled)
3208 { 3262 {
3209 cputime64_t tmp; 3263 cputime64_t tmp;
3210 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3264 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3211 3265
3212 tmp = cputime_to_cputime64(cputime); 3266 tmp = cputime_to_cputime64(cputime);
3213 3267
3214 /* Add guest time to process. */ 3268 /* Add guest time to process. */
3215 p->utime = cputime_add(p->utime, cputime); 3269 p->utime = cputime_add(p->utime, cputime);
3216 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 3270 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
3217 account_group_user_time(p, cputime); 3271 account_group_user_time(p, cputime);
3218 p->gtime = cputime_add(p->gtime, cputime); 3272 p->gtime = cputime_add(p->gtime, cputime);
3219 3273
3220 /* Add guest time to cpustat. */ 3274 /* Add guest time to cpustat. */
3221 if (TASK_NICE(p) > 0) { 3275 if (TASK_NICE(p) > 0) {
3222 cpustat->nice = cputime64_add(cpustat->nice, tmp); 3276 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3223 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); 3277 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
3224 } else { 3278 } else {
3225 cpustat->user = cputime64_add(cpustat->user, tmp); 3279 cpustat->user = cputime64_add(cpustat->user, tmp);
3226 cpustat->guest = cputime64_add(cpustat->guest, tmp); 3280 cpustat->guest = cputime64_add(cpustat->guest, tmp);
3227 } 3281 }
3228 } 3282 }
3229 3283
3230 /* 3284 /*
3231 * Account system cpu time to a process. 3285 * Account system cpu time to a process.
3232 * @p: the process that the cpu time gets accounted to 3286 * @p: the process that the cpu time gets accounted to
3233 * @hardirq_offset: the offset to subtract from hardirq_count() 3287 * @hardirq_offset: the offset to subtract from hardirq_count()
3234 * @cputime: the cpu time spent in kernel space since the last update 3288 * @cputime: the cpu time spent in kernel space since the last update
3235 * @cputime_scaled: cputime scaled by cpu frequency 3289 * @cputime_scaled: cputime scaled by cpu frequency
3236 */ 3290 */
3237 void account_system_time(struct task_struct *p, int hardirq_offset, 3291 void account_system_time(struct task_struct *p, int hardirq_offset,
3238 cputime_t cputime, cputime_t cputime_scaled) 3292 cputime_t cputime, cputime_t cputime_scaled)
3239 { 3293 {
3240 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3294 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3241 cputime64_t tmp; 3295 cputime64_t tmp;
3242 3296
3243 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 3297 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3244 account_guest_time(p, cputime, cputime_scaled); 3298 account_guest_time(p, cputime, cputime_scaled);
3245 return; 3299 return;
3246 } 3300 }
3247 3301
3248 /* Add system time to process. */ 3302 /* Add system time to process. */
3249 p->stime = cputime_add(p->stime, cputime); 3303 p->stime = cputime_add(p->stime, cputime);
3250 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); 3304 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3251 account_group_system_time(p, cputime); 3305 account_group_system_time(p, cputime);
3252 3306
3253 /* Add system time to cpustat. */ 3307 /* Add system time to cpustat. */
3254 tmp = cputime_to_cputime64(cputime); 3308 tmp = cputime_to_cputime64(cputime);
3255 if (hardirq_count() - hardirq_offset) 3309 if (hardirq_count() - hardirq_offset)
3256 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3310 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3257 else if (softirq_count()) 3311 else if (softirq_count())
3258 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3312 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3259 else 3313 else
3260 cpustat->system = cputime64_add(cpustat->system, tmp); 3314 cpustat->system = cputime64_add(cpustat->system, tmp);
3261 3315
3262 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); 3316 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3263 3317
3264 /* Account for system time used */ 3318 /* Account for system time used */
3265 acct_update_integrals(p); 3319 acct_update_integrals(p);
3266 } 3320 }
3267 3321
3268 /* 3322 /*
3269 * Account for involuntary wait time. 3323 * Account for involuntary wait time.
3270 * @cputime: the cpu time spent in involuntary wait 3324 * @cputime: the cpu time spent in involuntary wait
3271 */ 3325 */
3272 void account_steal_time(cputime_t cputime) 3326 void account_steal_time(cputime_t cputime)
3273 { 3327 {
3274 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3328 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3275 cputime64_t cputime64 = cputime_to_cputime64(cputime); 3329 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3276 3330
3277 cpustat->steal = cputime64_add(cpustat->steal, cputime64); 3331 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
3278 } 3332 }
3279 3333
3280 /* 3334 /*
3281 * Account for idle time. 3335 * Account for idle time.
3282 * @cputime: the cpu time spent in idle wait 3336 * @cputime: the cpu time spent in idle wait
3283 */ 3337 */
3284 void account_idle_time(cputime_t cputime) 3338 void account_idle_time(cputime_t cputime)
3285 { 3339 {
3286 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3340 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3287 cputime64_t cputime64 = cputime_to_cputime64(cputime); 3341 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3288 struct rq *rq = this_rq(); 3342 struct rq *rq = this_rq();
3289 3343
3290 if (atomic_read(&rq->nr_iowait) > 0) 3344 if (atomic_read(&rq->nr_iowait) > 0)
3291 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); 3345 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
3292 else 3346 else
3293 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 3347 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
3294 } 3348 }
3295 3349
3296 #ifndef CONFIG_VIRT_CPU_ACCOUNTING 3350 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
3297 3351
3298 /* 3352 /*
3299 * Account a single tick of cpu time. 3353 * Account a single tick of cpu time.
3300 * @p: the process that the cpu time gets accounted to 3354 * @p: the process that the cpu time gets accounted to
3301 * @user_tick: indicates if the tick is a user or a system tick 3355 * @user_tick: indicates if the tick is a user or a system tick
3302 */ 3356 */
3303 void account_process_tick(struct task_struct *p, int user_tick) 3357 void account_process_tick(struct task_struct *p, int user_tick)
3304 { 3358 {
3305 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 3359 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
3306 struct rq *rq = this_rq(); 3360 struct rq *rq = this_rq();
3307 3361
3308 if (user_tick) 3362 if (user_tick)
3309 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 3363 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
3310 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 3364 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
3311 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, 3365 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
3312 one_jiffy_scaled); 3366 one_jiffy_scaled);
3313 else 3367 else
3314 account_idle_time(cputime_one_jiffy); 3368 account_idle_time(cputime_one_jiffy);
3315 } 3369 }
3316 3370
3317 /* 3371 /*
3318 * Account multiple ticks of steal time. 3372 * Account multiple ticks of steal time.
3319 * @p: the process from which the cpu time has been stolen 3373 * @p: the process from which the cpu time has been stolen
3320 * @ticks: number of stolen ticks 3374 * @ticks: number of stolen ticks
3321 */ 3375 */
3322 void account_steal_ticks(unsigned long ticks) 3376 void account_steal_ticks(unsigned long ticks)
3323 { 3377 {
3324 account_steal_time(jiffies_to_cputime(ticks)); 3378 account_steal_time(jiffies_to_cputime(ticks));
3325 } 3379 }
3326 3380
3327 /* 3381 /*
3328 * Account multiple ticks of idle time. 3382 * Account multiple ticks of idle time.
3329 * @ticks: number of idle ticks 3383 * @ticks: number of idle ticks
3330 */ 3384 */
3331 void account_idle_ticks(unsigned long ticks) 3385 void account_idle_ticks(unsigned long ticks)
3332 { 3386 {
3333 account_idle_time(jiffies_to_cputime(ticks)); 3387 account_idle_time(jiffies_to_cputime(ticks));
3334 } 3388 }
3335 3389
3336 #endif 3390 #endif
3337 3391
3338 /* 3392 /*
3339 * Use precise platform statistics if available: 3393 * Use precise platform statistics if available:
3340 */ 3394 */
3341 #ifdef CONFIG_VIRT_CPU_ACCOUNTING 3395 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
3342 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3396 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3343 { 3397 {
3344 *ut = p->utime; 3398 *ut = p->utime;
3345 *st = p->stime; 3399 *st = p->stime;
3346 } 3400 }
3347 3401
3348 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3402 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3349 { 3403 {
3350 struct task_cputime cputime; 3404 struct task_cputime cputime;
3351 3405
3352 thread_group_cputime(p, &cputime); 3406 thread_group_cputime(p, &cputime);
3353 3407
3354 *ut = cputime.utime; 3408 *ut = cputime.utime;
3355 *st = cputime.stime; 3409 *st = cputime.stime;
3356 } 3410 }
3357 #else 3411 #else
3358 3412
3359 #ifndef nsecs_to_cputime 3413 #ifndef nsecs_to_cputime
3360 # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) 3414 # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
3361 #endif 3415 #endif
3362 3416
3363 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3417 void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3364 { 3418 {
3365 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); 3419 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
3366 3420
3367 /* 3421 /*
3368 * Use CFS's precise accounting: 3422 * Use CFS's precise accounting:
3369 */ 3423 */
3370 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 3424 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3371 3425
3372 if (total) { 3426 if (total) {
3373 u64 temp; 3427 u64 temp;
3374 3428
3375 temp = (u64)(rtime * utime); 3429 temp = (u64)(rtime * utime);
3376 do_div(temp, total); 3430 do_div(temp, total);
3377 utime = (cputime_t)temp; 3431 utime = (cputime_t)temp;
3378 } else 3432 } else
3379 utime = rtime; 3433 utime = rtime;
3380 3434
3381 /* 3435 /*
3382 * Compare with previous values, to keep monotonicity: 3436 * Compare with previous values, to keep monotonicity:
3383 */ 3437 */
3384 p->prev_utime = max(p->prev_utime, utime); 3438 p->prev_utime = max(p->prev_utime, utime);
3385 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); 3439 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
3386 3440
3387 *ut = p->prev_utime; 3441 *ut = p->prev_utime;
3388 *st = p->prev_stime; 3442 *st = p->prev_stime;
3389 } 3443 }
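task_times() above splits the precise CFS runtime rtime in the ratio of the tick-sampled utime:stime, then uses max() so neither component ever moves backwards between calls. A worked example with illustrative numbers only:

	/*
	 * p->utime = 30, p->stime = 90  =>  total = 120
	 * rtime (from sum_exec_runtime) = 100
	 *
	 *   utime = 100 * 30 / 120        = 25
	 *   stime = rtime - prev_utime    = 100 - 25 = 75
	 *
	 * prev_utime/prev_stime only ever grow, so a later call with a
	 * different utime:stime ratio cannot make either value shrink.
	 */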
3390 3444
3391 /* 3445 /*
3392 * Must be called with siglock held. 3446 * Must be called with siglock held.
3393 */ 3447 */
3394 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 3448 void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3395 { 3449 {
3396 struct signal_struct *sig = p->signal; 3450 struct signal_struct *sig = p->signal;
3397 struct task_cputime cputime; 3451 struct task_cputime cputime;
3398 cputime_t rtime, utime, total; 3452 cputime_t rtime, utime, total;
3399 3453
3400 thread_group_cputime(p, &cputime); 3454 thread_group_cputime(p, &cputime);
3401 3455
3402 total = cputime_add(cputime.utime, cputime.stime); 3456 total = cputime_add(cputime.utime, cputime.stime);
3403 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 3457 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3404 3458
3405 if (total) { 3459 if (total) {
3406 u64 temp; 3460 u64 temp;
3407 3461
3408 temp = (u64)(rtime * cputime.utime); 3462 temp = (u64)(rtime * cputime.utime);
3409 do_div(temp, total); 3463 do_div(temp, total);
3410 utime = (cputime_t)temp; 3464 utime = (cputime_t)temp;
3411 } else 3465 } else
3412 utime = rtime; 3466 utime = rtime;
3413 3467
3414 sig->prev_utime = max(sig->prev_utime, utime); 3468 sig->prev_utime = max(sig->prev_utime, utime);
3415 sig->prev_stime = max(sig->prev_stime, 3469 sig->prev_stime = max(sig->prev_stime,
3416 cputime_sub(rtime, sig->prev_utime)); 3470 cputime_sub(rtime, sig->prev_utime));
3417 3471
3418 *ut = sig->prev_utime; 3472 *ut = sig->prev_utime;
3419 *st = sig->prev_stime; 3473 *st = sig->prev_stime;
3420 } 3474 }
3421 #endif 3475 #endif
3422 3476
3423 /* 3477 /*
3424 * This function gets called by the timer code, with HZ frequency. 3478 * This function gets called by the timer code, with HZ frequency.
3425 * We call it with interrupts disabled. 3479 * We call it with interrupts disabled.
3426 * 3480 *
3427 * It also gets called by the fork code, when changing the parent's 3481 * It also gets called by the fork code, when changing the parent's
3428 * timeslices. 3482 * timeslices.
3429 */ 3483 */
3430 void scheduler_tick(void) 3484 void scheduler_tick(void)
3431 { 3485 {
3432 int cpu = smp_processor_id(); 3486 int cpu = smp_processor_id();
3433 struct rq *rq = cpu_rq(cpu); 3487 struct rq *rq = cpu_rq(cpu);
3434 struct task_struct *curr = rq->curr; 3488 struct task_struct *curr = rq->curr;
3435 3489
3436 sched_clock_tick(); 3490 sched_clock_tick();
3437 3491
3438 raw_spin_lock(&rq->lock); 3492 raw_spin_lock(&rq->lock);
3439 update_rq_clock(rq); 3493 update_rq_clock(rq);
3440 update_cpu_load(rq); 3494 update_cpu_load(rq);
3441 curr->sched_class->task_tick(rq, curr, 0); 3495 curr->sched_class->task_tick(rq, curr, 0);
3442 raw_spin_unlock(&rq->lock); 3496 raw_spin_unlock(&rq->lock);
3443 3497
3444 perf_event_task_tick(curr); 3498 perf_event_task_tick(curr);
3445 3499
3446 #ifdef CONFIG_SMP 3500 #ifdef CONFIG_SMP
3447 rq->idle_at_tick = idle_cpu(cpu); 3501 rq->idle_at_tick = idle_cpu(cpu);
3448 trigger_load_balance(rq, cpu); 3502 trigger_load_balance(rq, cpu);
3449 #endif 3503 #endif
3450 } 3504 }
3451 3505
3452 notrace unsigned long get_parent_ip(unsigned long addr) 3506 notrace unsigned long get_parent_ip(unsigned long addr)
3453 { 3507 {
3454 if (in_lock_functions(addr)) { 3508 if (in_lock_functions(addr)) {
3455 addr = CALLER_ADDR2; 3509 addr = CALLER_ADDR2;
3456 if (in_lock_functions(addr)) 3510 if (in_lock_functions(addr))
3457 addr = CALLER_ADDR3; 3511 addr = CALLER_ADDR3;
3458 } 3512 }
3459 return addr; 3513 return addr;
3460 } 3514 }
3461 3515
3462 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 3516 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3463 defined(CONFIG_PREEMPT_TRACER)) 3517 defined(CONFIG_PREEMPT_TRACER))
3464 3518
3465 void __kprobes add_preempt_count(int val) 3519 void __kprobes add_preempt_count(int val)
3466 { 3520 {
3467 #ifdef CONFIG_DEBUG_PREEMPT 3521 #ifdef CONFIG_DEBUG_PREEMPT
3468 /* 3522 /*
3469 * Underflow? 3523 * Underflow?
3470 */ 3524 */
3471 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 3525 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3472 return; 3526 return;
3473 #endif 3527 #endif
3474 preempt_count() += val; 3528 preempt_count() += val;
3475 #ifdef CONFIG_DEBUG_PREEMPT 3529 #ifdef CONFIG_DEBUG_PREEMPT
3476 /* 3530 /*
3477 * Spinlock count overflowing soon? 3531 * Spinlock count overflowing soon?
3478 */ 3532 */
3479 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 3533 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3480 PREEMPT_MASK - 10); 3534 PREEMPT_MASK - 10);
3481 #endif 3535 #endif
3482 if (preempt_count() == val) 3536 if (preempt_count() == val)
3483 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 3537 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3484 } 3538 }
3485 EXPORT_SYMBOL(add_preempt_count); 3539 EXPORT_SYMBOL(add_preempt_count);
3486 3540
3487 void __kprobes sub_preempt_count(int val) 3541 void __kprobes sub_preempt_count(int val)
3488 { 3542 {
3489 #ifdef CONFIG_DEBUG_PREEMPT 3543 #ifdef CONFIG_DEBUG_PREEMPT
3490 /* 3544 /*
3491 * Underflow? 3545 * Underflow?
3492 */ 3546 */
3493 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 3547 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3494 return; 3548 return;
3495 /* 3549 /*
3496 * Is the spinlock portion underflowing? 3550 * Is the spinlock portion underflowing?
3497 */ 3551 */
3498 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 3552 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3499 !(preempt_count() & PREEMPT_MASK))) 3553 !(preempt_count() & PREEMPT_MASK)))
3500 return; 3554 return;
3501 #endif 3555 #endif
3502 3556
3503 if (preempt_count() == val) 3557 if (preempt_count() == val)
3504 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 3558 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
3505 preempt_count() -= val; 3559 preempt_count() -= val;
3506 } 3560 }
3507 EXPORT_SYMBOL(sub_preempt_count); 3561 EXPORT_SYMBOL(sub_preempt_count);
3508 3562
3509 #endif 3563 #endif
3510 3564
3511 /* 3565 /*
3512 * Print scheduling while atomic bug: 3566 * Print scheduling while atomic bug:
3513 */ 3567 */
3514 static noinline void __schedule_bug(struct task_struct *prev) 3568 static noinline void __schedule_bug(struct task_struct *prev)
3515 { 3569 {
3516 struct pt_regs *regs = get_irq_regs(); 3570 struct pt_regs *regs = get_irq_regs();
3517 3571
3518 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 3572 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3519 prev->comm, prev->pid, preempt_count()); 3573 prev->comm, prev->pid, preempt_count());
3520 3574
3521 debug_show_held_locks(prev); 3575 debug_show_held_locks(prev);
3522 print_modules(); 3576 print_modules();
3523 if (irqs_disabled()) 3577 if (irqs_disabled())
3524 print_irqtrace_events(prev); 3578 print_irqtrace_events(prev);
3525 3579
3526 if (regs) 3580 if (regs)
3527 show_regs(regs); 3581 show_regs(regs);
3528 else 3582 else
3529 dump_stack(); 3583 dump_stack();
3530 } 3584 }
3531 3585
3532 /* 3586 /*
3533 * Various schedule()-time debugging checks and statistics: 3587 * Various schedule()-time debugging checks and statistics:
3534 */ 3588 */
3535 static inline void schedule_debug(struct task_struct *prev) 3589 static inline void schedule_debug(struct task_struct *prev)
3536 { 3590 {
3537 /* 3591 /*
3538 * Test if we are atomic. Since do_exit() needs to call into 3592 * Test if we are atomic. Since do_exit() needs to call into
3539 * schedule() atomically, we ignore that path for now. 3593 * schedule() atomically, we ignore that path for now.
3540 * Otherwise, whine if we are scheduling when we should not be. 3594 * Otherwise, whine if we are scheduling when we should not be.
3541 */ 3595 */
3542 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 3596 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
3543 __schedule_bug(prev); 3597 __schedule_bug(prev);
3544 3598
3545 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3599 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3546 3600
3547 schedstat_inc(this_rq(), sched_count); 3601 schedstat_inc(this_rq(), sched_count);
3548 #ifdef CONFIG_SCHEDSTATS 3602 #ifdef CONFIG_SCHEDSTATS
3549 if (unlikely(prev->lock_depth >= 0)) { 3603 if (unlikely(prev->lock_depth >= 0)) {
3550 schedstat_inc(this_rq(), bkl_count); 3604 schedstat_inc(this_rq(), bkl_count);
3551 schedstat_inc(prev, sched_info.bkl_count); 3605 schedstat_inc(prev, sched_info.bkl_count);
3552 } 3606 }
3553 #endif 3607 #endif
3554 } 3608 }
3555 3609
3556 static void put_prev_task(struct rq *rq, struct task_struct *prev) 3610 static void put_prev_task(struct rq *rq, struct task_struct *prev)
3557 { 3611 {
3558 if (prev->se.on_rq) 3612 if (prev->se.on_rq)
3559 update_rq_clock(rq); 3613 update_rq_clock(rq);
3560 rq->skip_clock_update = 0; 3614 rq->skip_clock_update = 0;
3561 prev->sched_class->put_prev_task(rq, prev); 3615 prev->sched_class->put_prev_task(rq, prev);
3562 } 3616 }
3563 3617
3564 /* 3618 /*
3565 * Pick up the highest-prio task: 3619 * Pick up the highest-prio task:
3566 */ 3620 */
3567 static inline struct task_struct * 3621 static inline struct task_struct *
3568 pick_next_task(struct rq *rq) 3622 pick_next_task(struct rq *rq)
3569 { 3623 {
3570 const struct sched_class *class; 3624 const struct sched_class *class;
3571 struct task_struct *p; 3625 struct task_struct *p;
3572 3626
3573 /* 3627 /*
3574 * Optimization: we know that if all tasks are in 3628 * Optimization: we know that if all tasks are in
3575 * the fair class we can call that function directly: 3629 * the fair class we can call that function directly:
3576 */ 3630 */
3577 if (likely(rq->nr_running == rq->cfs.nr_running)) { 3631 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3578 p = fair_sched_class.pick_next_task(rq); 3632 p = fair_sched_class.pick_next_task(rq);
3579 if (likely(p)) 3633 if (likely(p))
3580 return p; 3634 return p;
3581 } 3635 }
3582 3636
3583 class = sched_class_highest; 3637 class = sched_class_highest;
3584 for ( ; ; ) { 3638 for ( ; ; ) {
3585 p = class->pick_next_task(rq); 3639 p = class->pick_next_task(rq);
3586 if (p) 3640 if (p)
3587 return p; 3641 return p;
3588 /* 3642 /*
3589 * Will never be NULL as the idle class always 3643 * Will never be NULL as the idle class always
3590 * returns a non-NULL p: 3644 * returns a non-NULL p:
3591 */ 3645 */
3592 class = class->next; 3646 class = class->next;
3593 } 3647 }
3594 } 3648 }
3595 3649
3596 /* 3650 /*
3597 * schedule() is the main scheduler function. 3651 * schedule() is the main scheduler function.
3598 */ 3652 */
3599 asmlinkage void __sched schedule(void) 3653 asmlinkage void __sched schedule(void)
3600 { 3654 {
3601 struct task_struct *prev, *next; 3655 struct task_struct *prev, *next;
3602 unsigned long *switch_count; 3656 unsigned long *switch_count;
3603 struct rq *rq; 3657 struct rq *rq;
3604 int cpu; 3658 int cpu;
3605 3659
3606 need_resched: 3660 need_resched:
3607 preempt_disable(); 3661 preempt_disable();
3608 cpu = smp_processor_id(); 3662 cpu = smp_processor_id();
3609 rq = cpu_rq(cpu); 3663 rq = cpu_rq(cpu);
3610 rcu_sched_qs(cpu); 3664 rcu_sched_qs(cpu);
3611 prev = rq->curr; 3665 prev = rq->curr;
3612 switch_count = &prev->nivcsw; 3666 switch_count = &prev->nivcsw;
3613 3667
3614 release_kernel_lock(prev); 3668 release_kernel_lock(prev);
3615 need_resched_nonpreemptible: 3669 need_resched_nonpreemptible:
3616 3670
3617 schedule_debug(prev); 3671 schedule_debug(prev);
3618 3672
3619 if (sched_feat(HRTICK)) 3673 if (sched_feat(HRTICK))
3620 hrtick_clear(rq); 3674 hrtick_clear(rq);
3621 3675
3622 raw_spin_lock_irq(&rq->lock); 3676 raw_spin_lock_irq(&rq->lock);
3623 clear_tsk_need_resched(prev); 3677 clear_tsk_need_resched(prev);
3624 3678
3625 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3679 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3626 if (unlikely(signal_pending_state(prev->state, prev))) 3680 if (unlikely(signal_pending_state(prev->state, prev)))
3627 prev->state = TASK_RUNNING; 3681 prev->state = TASK_RUNNING;
3628 else 3682 else
3629 deactivate_task(rq, prev, DEQUEUE_SLEEP); 3683 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3630 switch_count = &prev->nvcsw; 3684 switch_count = &prev->nvcsw;
3631 } 3685 }
3632 3686
3633 pre_schedule(rq, prev); 3687 pre_schedule(rq, prev);
3634 3688
3635 if (unlikely(!rq->nr_running)) 3689 if (unlikely(!rq->nr_running))
3636 idle_balance(cpu, rq); 3690 idle_balance(cpu, rq);
3637 3691
3638 put_prev_task(rq, prev); 3692 put_prev_task(rq, prev);
3639 next = pick_next_task(rq); 3693 next = pick_next_task(rq);
3640 3694
3641 if (likely(prev != next)) { 3695 if (likely(prev != next)) {
3642 sched_info_switch(prev, next); 3696 sched_info_switch(prev, next);
3643 perf_event_task_sched_out(prev, next); 3697 perf_event_task_sched_out(prev, next);
3644 3698
3645 rq->nr_switches++; 3699 rq->nr_switches++;
3646 rq->curr = next; 3700 rq->curr = next;
3647 ++*switch_count; 3701 ++*switch_count;
3648 3702
3649 context_switch(rq, prev, next); /* unlocks the rq */ 3703 context_switch(rq, prev, next); /* unlocks the rq */
3650 /* 3704 /*
3651 * the context switch might have flipped the stack from under 3705 * the context switch might have flipped the stack from under
3652 * us, hence refresh the local variables. 3706 * us, hence refresh the local variables.
3653 */ 3707 */
3654 cpu = smp_processor_id(); 3708 cpu = smp_processor_id();
3655 rq = cpu_rq(cpu); 3709 rq = cpu_rq(cpu);
3656 } else 3710 } else
3657 raw_spin_unlock_irq(&rq->lock); 3711 raw_spin_unlock_irq(&rq->lock);
3658 3712
3659 post_schedule(rq); 3713 post_schedule(rq);
3660 3714
3661 if (unlikely(reacquire_kernel_lock(current) < 0)) { 3715 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3662 prev = rq->curr; 3716 prev = rq->curr;
3663 switch_count = &prev->nivcsw; 3717 switch_count = &prev->nivcsw;
3664 goto need_resched_nonpreemptible; 3718 goto need_resched_nonpreemptible;
3665 } 3719 }
3666 3720
3667 preempt_enable_no_resched(); 3721 preempt_enable_no_resched();
3668 if (need_resched()) 3722 if (need_resched())
3669 goto need_resched; 3723 goto need_resched;
3670 } 3724 }
3671 EXPORT_SYMBOL(schedule); 3725 EXPORT_SYMBOL(schedule);
3672 3726
3673 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER 3727 #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
3674 /* 3728 /*
3675 * Look out! "owner" is an entirely speculative pointer 3729 * Look out! "owner" is an entirely speculative pointer
3676 * access and not reliable. 3730 * access and not reliable.
3677 */ 3731 */
3678 int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) 3732 int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3679 { 3733 {
3680 unsigned int cpu; 3734 unsigned int cpu;
3681 struct rq *rq; 3735 struct rq *rq;
3682 3736
3683 if (!sched_feat(OWNER_SPIN)) 3737 if (!sched_feat(OWNER_SPIN))
3684 return 0; 3738 return 0;
3685 3739
3686 #ifdef CONFIG_DEBUG_PAGEALLOC 3740 #ifdef CONFIG_DEBUG_PAGEALLOC
3687 /* 3741 /*
3688 * Need to access the cpu field knowing that 3742 * Need to access the cpu field knowing that
3689 * DEBUG_PAGEALLOC could have unmapped it if 3743 * DEBUG_PAGEALLOC could have unmapped it if
3690 * the mutex owner just released it and exited. 3744 * the mutex owner just released it and exited.
3691 */ 3745 */
3692 if (probe_kernel_address(&owner->cpu, cpu)) 3746 if (probe_kernel_address(&owner->cpu, cpu))
3693 goto out; 3747 goto out;
3694 #else 3748 #else
3695 cpu = owner->cpu; 3749 cpu = owner->cpu;
3696 #endif 3750 #endif
3697 3751
3698 /* 3752 /*
3699 * Even if the access succeeded (likely case), 3753 * Even if the access succeeded (likely case),
3700 * the cpu field may no longer be valid. 3754 * the cpu field may no longer be valid.
3701 */ 3755 */
3702 if (cpu >= nr_cpumask_bits) 3756 if (cpu >= nr_cpumask_bits)
3703 goto out; 3757 goto out;
3704 3758
3705 /* 3759 /*
3706 * We need to validate that we can do a 3760 * We need to validate that we can do a
3707 * get_cpu() and that we have the percpu area. 3761 * get_cpu() and that we have the percpu area.
3708 */ 3762 */
3709 if (!cpu_online(cpu)) 3763 if (!cpu_online(cpu))
3710 goto out; 3764 goto out;
3711 3765
3712 rq = cpu_rq(cpu); 3766 rq = cpu_rq(cpu);
3713 3767
3714 for (;;) { 3768 for (;;) {
3715 /* 3769 /*
3716 * Owner changed, break to re-assess state. 3770 * Owner changed, break to re-assess state.
3717 */ 3771 */
3718 if (lock->owner != owner) 3772 if (lock->owner != owner)
3719 break; 3773 break;
3720 3774
3721 /* 3775 /*
3722 * Is that owner really running on that cpu? 3776 * Is that owner really running on that cpu?
3723 */ 3777 */
3724 if (task_thread_info(rq->curr) != owner || need_resched()) 3778 if (task_thread_info(rq->curr) != owner || need_resched())
3725 return 0; 3779 return 0;
3726 3780
3727 cpu_relax(); 3781 cpu_relax();
3728 } 3782 }
3729 out: 3783 out:
3730 return 1; 3784 return 1;
3731 } 3785 }
3732 #endif 3786 #endif
3733 3787
3734 #ifdef CONFIG_PREEMPT 3788 #ifdef CONFIG_PREEMPT
3735 /* 3789 /*
3736 * this is the entry point to schedule() from in-kernel preemption 3790 * this is the entry point to schedule() from in-kernel preemption
3737 * off of preempt_enable. Kernel preemptions off return from interrupt 3791 * off of preempt_enable. Kernel preemptions off return from interrupt
3738 * occur there and call schedule directly. 3792 * occur there and call schedule directly.
3739 */ 3793 */
3740 asmlinkage void __sched preempt_schedule(void) 3794 asmlinkage void __sched preempt_schedule(void)
3741 { 3795 {
3742 struct thread_info *ti = current_thread_info(); 3796 struct thread_info *ti = current_thread_info();
3743 3797
3744 /* 3798 /*
3745 * If there is a non-zero preempt_count or interrupts are disabled, 3799 * If there is a non-zero preempt_count or interrupts are disabled,
3746 * we do not want to preempt the current task. Just return.. 3800 * we do not want to preempt the current task. Just return..
3747 */ 3801 */
3748 if (likely(ti->preempt_count || irqs_disabled())) 3802 if (likely(ti->preempt_count || irqs_disabled()))
3749 return; 3803 return;
3750 3804
3751 do { 3805 do {
3752 add_preempt_count(PREEMPT_ACTIVE); 3806 add_preempt_count(PREEMPT_ACTIVE);
3753 schedule(); 3807 schedule();
3754 sub_preempt_count(PREEMPT_ACTIVE); 3808 sub_preempt_count(PREEMPT_ACTIVE);
3755 3809
3756 /* 3810 /*
3757 * Check again in case we missed a preemption opportunity 3811 * Check again in case we missed a preemption opportunity
3758 * between schedule and now. 3812 * between schedule and now.
3759 */ 3813 */
3760 barrier(); 3814 barrier();
3761 } while (need_resched()); 3815 } while (need_resched());
3762 } 3816 }
3763 EXPORT_SYMBOL(preempt_schedule); 3817 EXPORT_SYMBOL(preempt_schedule);
3764 3818
3765 /* 3819 /*
3766 * this is the entry point to schedule() from kernel preemption 3820 * this is the entry point to schedule() from kernel preemption
3767 * off of irq context. 3821 * off of irq context.
3768 * Note that this is called and returns with irqs disabled. This will 3822 * Note that this is called and returns with irqs disabled. This will
3769 * protect us against recursive calling from irq. 3823 * protect us against recursive calling from irq.
3770 */ 3824 */
3771 asmlinkage void __sched preempt_schedule_irq(void) 3825 asmlinkage void __sched preempt_schedule_irq(void)
3772 { 3826 {
3773 struct thread_info *ti = current_thread_info(); 3827 struct thread_info *ti = current_thread_info();
3774 3828
3775 /* Catch callers which need to be fixed */ 3829 /* Catch callers which need to be fixed */
3776 BUG_ON(ti->preempt_count || !irqs_disabled()); 3830 BUG_ON(ti->preempt_count || !irqs_disabled());
3777 3831
3778 do { 3832 do {
3779 add_preempt_count(PREEMPT_ACTIVE); 3833 add_preempt_count(PREEMPT_ACTIVE);
3780 local_irq_enable(); 3834 local_irq_enable();
3781 schedule(); 3835 schedule();
3782 local_irq_disable(); 3836 local_irq_disable();
3783 sub_preempt_count(PREEMPT_ACTIVE); 3837 sub_preempt_count(PREEMPT_ACTIVE);
3784 3838
3785 /* 3839 /*
3786 * Check again in case we missed a preemption opportunity 3840 * Check again in case we missed a preemption opportunity
3787 * between schedule and now. 3841 * between schedule and now.
3788 */ 3842 */
3789 barrier(); 3843 barrier();
3790 } while (need_resched()); 3844 } while (need_resched());
3791 } 3845 }
3792 3846
3793 #endif /* CONFIG_PREEMPT */ 3847 #endif /* CONFIG_PREEMPT */
3794 3848
3795 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 3849 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
3796 void *key) 3850 void *key)
3797 { 3851 {
3798 return try_to_wake_up(curr->private, mode, wake_flags); 3852 return try_to_wake_up(curr->private, mode, wake_flags);
3799 } 3853 }
3800 EXPORT_SYMBOL(default_wake_function); 3854 EXPORT_SYMBOL(default_wake_function);
3801 3855
3802 /* 3856 /*
3803 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 3857 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3804 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 3858 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3805 * number) then we wake all the non-exclusive tasks and one exclusive task. 3859 * number) then we wake all the non-exclusive tasks and one exclusive task.
3806 * 3860 *
3807 * There are circumstances in which we can try to wake a task which has already 3861 * There are circumstances in which we can try to wake a task which has already
3808 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 3862 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3809 * zero in this (rare) case, and we handle it by continuing to scan the queue. 3863 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3810 */ 3864 */
3811 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 3865 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3812 int nr_exclusive, int wake_flags, void *key) 3866 int nr_exclusive, int wake_flags, void *key)
3813 { 3867 {
3814 wait_queue_t *curr, *next; 3868 wait_queue_t *curr, *next;
3815 3869
3816 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 3870 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
3817 unsigned flags = curr->flags; 3871 unsigned flags = curr->flags;
3818 3872
3819 if (curr->func(curr, mode, wake_flags, key) && 3873 if (curr->func(curr, mode, wake_flags, key) &&
3820 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 3874 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3821 break; 3875 break;
3822 } 3876 }
3823 } 3877 }
3824 3878
3825 /** 3879 /**
3826 * __wake_up - wake up threads blocked on a waitqueue. 3880 * __wake_up - wake up threads blocked on a waitqueue.
3827 * @q: the waitqueue 3881 * @q: the waitqueue
3828 * @mode: which threads 3882 * @mode: which threads
3829 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3883 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3830 * @key: is directly passed to the wakeup function 3884 * @key: is directly passed to the wakeup function
3831 * 3885 *
3832 * It may be assumed that this function implies a write memory barrier before 3886 * It may be assumed that this function implies a write memory barrier before
3833 * changing the task state if and only if any tasks are woken up. 3887 * changing the task state if and only if any tasks are woken up.
3834 */ 3888 */
3835 void __wake_up(wait_queue_head_t *q, unsigned int mode, 3889 void __wake_up(wait_queue_head_t *q, unsigned int mode,
3836 int nr_exclusive, void *key) 3890 int nr_exclusive, void *key)
3837 { 3891 {
3838 unsigned long flags; 3892 unsigned long flags;
3839 3893
3840 spin_lock_irqsave(&q->lock, flags); 3894 spin_lock_irqsave(&q->lock, flags);
3841 __wake_up_common(q, mode, nr_exclusive, 0, key); 3895 __wake_up_common(q, mode, nr_exclusive, 0, key);
3842 spin_unlock_irqrestore(&q->lock, flags); 3896 spin_unlock_irqrestore(&q->lock, flags);
3843 } 3897 }
3844 EXPORT_SYMBOL(__wake_up); 3898 EXPORT_SYMBOL(__wake_up);
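A minimal usage sketch of the exclusive-wakeup behaviour described above (my_wq and condition are hypothetical, and my_wq is assumed to have been set up with init_waitqueue_head()); wake_up(&my_wq) expands to __wake_up(&my_wq, TASK_NORMAL, 1, NULL), so exactly one exclusive waiter is woken per call:

	static wait_queue_head_t my_wq;		/* hypothetical */
	static int condition;			/* hypothetical */

	static void wait_for_it(void)
	{
		DEFINE_WAIT(wait);

		for (;;) {
			prepare_to_wait_exclusive(&my_wq, &wait,
						  TASK_UNINTERRUPTIBLE);
			if (condition)
				break;
			schedule();
		}
		finish_wait(&my_wq, &wait);
	}

	static void make_it_so(void)
	{
		condition = 1;
		wake_up(&my_wq);	/* wakes one exclusive waiter */
	}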
3845 3899
3846 /* 3900 /*
3847 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 3901 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3848 */ 3902 */
3849 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 3903 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3850 { 3904 {
3851 __wake_up_common(q, mode, 1, 0, NULL); 3905 __wake_up_common(q, mode, 1, 0, NULL);
3852 } 3906 }
3853 3907
3854 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 3908 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
3855 { 3909 {
3856 __wake_up_common(q, mode, 1, 0, key); 3910 __wake_up_common(q, mode, 1, 0, key);
3857 } 3911 }
3858 3912
3859 /** 3913 /**
3860 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 3914 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
3861 * @q: the waitqueue 3915 * @q: the waitqueue
3862 * @mode: which threads 3916 * @mode: which threads
3863 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3917 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3864 * @key: opaque value to be passed to wakeup targets 3918 * @key: opaque value to be passed to wakeup targets
3865 * 3919 *
3866 * The sync wakeup differs in that the waker knows that it will schedule 3920 * The sync wakeup differs in that the waker knows that it will schedule
3867 * away soon, so while the target thread will be woken up, it will not 3921 * away soon, so while the target thread will be woken up, it will not
3868 * be migrated to another CPU - ie. the two threads are 'synchronized' 3922 * be migrated to another CPU - ie. the two threads are 'synchronized'
3869 * with each other. This can prevent needless bouncing between CPUs. 3923 * with each other. This can prevent needless bouncing between CPUs.
3870 * 3924 *
3871 * On UP it can prevent extra preemption. 3925 * On UP it can prevent extra preemption.
3872 * 3926 *
3873 * It may be assumed that this function implies a write memory barrier before 3927 * It may be assumed that this function implies a write memory barrier before
3874 * changing the task state if and only if any tasks are woken up. 3928 * changing the task state if and only if any tasks are woken up.
3875 */ 3929 */
3876 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, 3930 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
3877 int nr_exclusive, void *key) 3931 int nr_exclusive, void *key)
3878 { 3932 {
3879 unsigned long flags; 3933 unsigned long flags;
3880 int wake_flags = WF_SYNC; 3934 int wake_flags = WF_SYNC;
3881 3935
3882 if (unlikely(!q)) 3936 if (unlikely(!q))
3883 return; 3937 return;
3884 3938
3885 if (unlikely(!nr_exclusive)) 3939 if (unlikely(!nr_exclusive))
3886 wake_flags = 0; 3940 wake_flags = 0;
3887 3941
3888 spin_lock_irqsave(&q->lock, flags); 3942 spin_lock_irqsave(&q->lock, flags);
3889 __wake_up_common(q, mode, nr_exclusive, wake_flags, key); 3943 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
3890 spin_unlock_irqrestore(&q->lock, flags); 3944 spin_unlock_irqrestore(&q->lock, flags);
3891 } 3945 }
3892 EXPORT_SYMBOL_GPL(__wake_up_sync_key); 3946 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
3893 3947
3894 /* 3948 /*
3895 * __wake_up_sync - see __wake_up_sync_key() 3949 * __wake_up_sync - see __wake_up_sync_key()
3896 */ 3950 */
3897 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 3951 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3898 { 3952 {
3899 __wake_up_sync_key(q, mode, nr_exclusive, NULL); 3953 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
3900 } 3954 }
3901 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 3955 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3902 3956
3903 /** 3957 /**
3904 * complete: - signals a single thread waiting on this completion 3958 * complete: - signals a single thread waiting on this completion
3905 * @x: holds the state of this particular completion 3959 * @x: holds the state of this particular completion
3906 * 3960 *
3907 * This will wake up a single thread waiting on this completion. Threads will be 3961 * This will wake up a single thread waiting on this completion. Threads will be
3908 * awakened in the same order in which they were queued. 3962 * awakened in the same order in which they were queued.
3909 * 3963 *
3910 * See also complete_all(), wait_for_completion() and related routines. 3964 * See also complete_all(), wait_for_completion() and related routines.
3911 * 3965 *
3912 * It may be assumed that this function implies a write memory barrier before 3966 * It may be assumed that this function implies a write memory barrier before
3913 * changing the task state if and only if any tasks are woken up. 3967 * changing the task state if and only if any tasks are woken up.
3914 */ 3968 */
3915 void complete(struct completion *x) 3969 void complete(struct completion *x)
3916 { 3970 {
3917 unsigned long flags; 3971 unsigned long flags;
3918 3972
3919 spin_lock_irqsave(&x->wait.lock, flags); 3973 spin_lock_irqsave(&x->wait.lock, flags);
3920 x->done++; 3974 x->done++;
3921 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); 3975 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
3922 spin_unlock_irqrestore(&x->wait.lock, flags); 3976 spin_unlock_irqrestore(&x->wait.lock, flags);
3923 } 3977 }
3924 EXPORT_SYMBOL(complete); 3978 EXPORT_SYMBOL(complete);
3925 3979
3926 /** 3980 /**
3927 * complete_all: - signals all threads waiting on this completion 3981 * complete_all: - signals all threads waiting on this completion
3928 * @x: holds the state of this particular completion 3982 * @x: holds the state of this particular completion
3929 * 3983 *
3930 * This will wake up all threads waiting on this particular completion event. 3984 * This will wake up all threads waiting on this particular completion event.
3931 * 3985 *
3932 * It may be assumed that this function implies a write memory barrier before 3986 * It may be assumed that this function implies a write memory barrier before
3933 * changing the task state if and only if any tasks are woken up. 3987 * changing the task state if and only if any tasks are woken up.
3934 */ 3988 */
3935 void complete_all(struct completion *x) 3989 void complete_all(struct completion *x)
3936 { 3990 {
3937 unsigned long flags; 3991 unsigned long flags;
3938 3992
3939 spin_lock_irqsave(&x->wait.lock, flags); 3993 spin_lock_irqsave(&x->wait.lock, flags);
3940 x->done += UINT_MAX/2; 3994 x->done += UINT_MAX/2;
3941 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); 3995 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
3942 spin_unlock_irqrestore(&x->wait.lock, flags); 3996 spin_unlock_irqrestore(&x->wait.lock, flags);
3943 } 3997 }
3944 EXPORT_SYMBOL(complete_all); 3998 EXPORT_SYMBOL(complete_all);
3945 3999
3946 static inline long __sched 4000 static inline long __sched
3947 do_wait_for_common(struct completion *x, long timeout, int state) 4001 do_wait_for_common(struct completion *x, long timeout, int state)
3948 { 4002 {
3949 if (!x->done) { 4003 if (!x->done) {
3950 DECLARE_WAITQUEUE(wait, current); 4004 DECLARE_WAITQUEUE(wait, current);
3951 4005
3952 wait.flags |= WQ_FLAG_EXCLUSIVE; 4006 wait.flags |= WQ_FLAG_EXCLUSIVE;
3953 __add_wait_queue_tail(&x->wait, &wait); 4007 __add_wait_queue_tail(&x->wait, &wait);
3954 do { 4008 do {
3955 if (signal_pending_state(state, current)) { 4009 if (signal_pending_state(state, current)) {
3956 timeout = -ERESTARTSYS; 4010 timeout = -ERESTARTSYS;
3957 break; 4011 break;
3958 } 4012 }
3959 __set_current_state(state); 4013 __set_current_state(state);
3960 spin_unlock_irq(&x->wait.lock); 4014 spin_unlock_irq(&x->wait.lock);
3961 timeout = schedule_timeout(timeout); 4015 timeout = schedule_timeout(timeout);
3962 spin_lock_irq(&x->wait.lock); 4016 spin_lock_irq(&x->wait.lock);
3963 } while (!x->done && timeout); 4017 } while (!x->done && timeout);
3964 __remove_wait_queue(&x->wait, &wait); 4018 __remove_wait_queue(&x->wait, &wait);
3965 if (!x->done) 4019 if (!x->done)
3966 return timeout; 4020 return timeout;
3967 } 4021 }
3968 x->done--; 4022 x->done--;
3969 return timeout ?: 1; 4023 return timeout ?: 1;
3970 } 4024 }
3971 4025
3972 static long __sched 4026 static long __sched
3973 wait_for_common(struct completion *x, long timeout, int state) 4027 wait_for_common(struct completion *x, long timeout, int state)
3974 { 4028 {
3975 might_sleep(); 4029 might_sleep();
3976 4030
3977 spin_lock_irq(&x->wait.lock); 4031 spin_lock_irq(&x->wait.lock);
3978 timeout = do_wait_for_common(x, timeout, state); 4032 timeout = do_wait_for_common(x, timeout, state);
3979 spin_unlock_irq(&x->wait.lock); 4033 spin_unlock_irq(&x->wait.lock);
3980 return timeout; 4034 return timeout;
3981 } 4035 }
3982 4036
3983 /** 4037 /**
3984 * wait_for_completion: - waits for completion of a task 4038 * wait_for_completion: - waits for completion of a task
3985 * @x: holds the state of this particular completion 4039 * @x: holds the state of this particular completion
3986 * 4040 *
3987 * This waits to be signaled for completion of a specific task. It is NOT 4041 * This waits to be signaled for completion of a specific task. It is NOT
3988 * interruptible and there is no timeout. 4042 * interruptible and there is no timeout.
3989 * 4043 *
3990 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout 4044 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
3991 * and interrupt capability. Also see complete(). 4045 * and interrupt capability. Also see complete().
3992 */ 4046 */
3993 void __sched wait_for_completion(struct completion *x) 4047 void __sched wait_for_completion(struct completion *x)
3994 { 4048 {
3995 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 4049 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
3996 } 4050 }
3997 EXPORT_SYMBOL(wait_for_completion); 4051 EXPORT_SYMBOL(wait_for_completion);
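A minimal usage sketch of the completion API documented above (setup_done and the function names are hypothetical):

	static DECLARE_COMPLETION(setup_done);	/* or init_completion() at runtime */

	static int waiter_thread(void *unused)
	{
		/* Sleeps in TASK_UNINTERRUPTIBLE until complete() is called. */
		wait_for_completion(&setup_done);
		return 0;
	}

	static void finish_setup(void)
	{
		/* ... work the waiter depends on ... */
		complete(&setup_done);	/* wakes exactly one waiter, in queue order */
	}

complete_all() would instead release every waiter currently queued on setup_done.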
3998 4052
3999 /** 4053 /**
4000 * wait_for_completion_timeout: - waits for completion of a task (w/timeout) 4054 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
4001 * @x: holds the state of this particular completion 4055 * @x: holds the state of this particular completion
4002 * @timeout: timeout value in jiffies 4056 * @timeout: timeout value in jiffies
4003 * 4057 *
4004 * This waits for either a completion of a specific task to be signaled or for a 4058 * This waits for either a completion of a specific task to be signaled or for a
4005 * specified timeout to expire. The timeout is in jiffies. It is not 4059 * specified timeout to expire. The timeout is in jiffies. It is not
4006 * interruptible. 4060 * interruptible.
4007 */ 4061 */
4008 unsigned long __sched 4062 unsigned long __sched
4009 wait_for_completion_timeout(struct completion *x, unsigned long timeout) 4063 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4010 { 4064 {
4011 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); 4065 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
4012 } 4066 }
4013 EXPORT_SYMBOL(wait_for_completion_timeout); 4067 EXPORT_SYMBOL(wait_for_completion_timeout);
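/*
 * Editorial example (not part of this diff): handling the return value of
 * wait_for_completion_timeout(). The helper and its 500ms budget are
 * hypothetical; a return of 0 means the timeout expired, a non-zero return
 * is the number of jiffies that were still left when complete() ran.
 */
#include <linux/completion.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

static int wait_up_to_500ms(struct completion *done)
{
        unsigned long left;

        left = wait_for_completion_timeout(done, msecs_to_jiffies(500));
        if (!left)
                return -ETIMEDOUT;      /* timer expired before complete() */

        return 0;                       /* signalled with 'left' jiffies to spare */
}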
4014 4068
4015 /** 4069 /**
4016 * wait_for_completion_interruptible: - waits for completion of a task (w/intr) 4070 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
4017 * @x: holds the state of this particular completion 4071 * @x: holds the state of this particular completion
4018 * 4072 *
4019 * This waits for completion of a specific task to be signaled. It is 4073 * This waits for completion of a specific task to be signaled. It is
4020 * interruptible. 4074 * interruptible.
4021 */ 4075 */
4022 int __sched wait_for_completion_interruptible(struct completion *x) 4076 int __sched wait_for_completion_interruptible(struct completion *x)
4023 { 4077 {
4024 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 4078 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
4025 if (t == -ERESTARTSYS) 4079 if (t == -ERESTARTSYS)
4026 return t; 4080 return t;
4027 return 0; 4081 return 0;
4028 } 4082 }
4029 EXPORT_SYMBOL(wait_for_completion_interruptible); 4083 EXPORT_SYMBOL(wait_for_completion_interruptible);
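/*
 * Editorial example (not part of this diff): a hypothetical helper showing
 * the -ERESTARTSYS check callers of the interruptible variant need.
 */
#include <linux/completion.h>
#include <linux/errno.h>

static int wait_or_signal(struct completion *done)
{
        int ret = wait_for_completion_interruptible(done);

        if (ret == -ERESTARTSYS)
                return ret;     /* a signal interrupted the wait */

        return 0;               /* the completion was signalled */
}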
4030 4084
4031 /** 4085 /**
4032 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) 4086 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
4033 * @x: holds the state of this particular completion 4087 * @x: holds the state of this particular completion
4034 * @timeout: timeout value in jiffies 4088 * @timeout: timeout value in jiffies
4035 * 4089 *
4036 * This waits for either a completion of a specific task to be signaled or for a 4090 * This waits for either a completion of a specific task to be signaled or for a
4037 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4091 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4038 */ 4092 */
4039 unsigned long __sched 4093 unsigned long __sched
4040 wait_for_completion_interruptible_timeout(struct completion *x, 4094 wait_for_completion_interruptible_timeout(struct completion *x,
4041 unsigned long timeout) 4095 unsigned long timeout)
4042 { 4096 {
4043 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); 4097 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
4044 } 4098 }
4045 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 4099 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4046 4100
4047 /** 4101 /**
4048 * wait_for_completion_killable: - waits for completion of a task (killable) 4102 * wait_for_completion_killable: - waits for completion of a task (killable)
4049 * @x: holds the state of this particular completion 4103 * @x: holds the state of this particular completion
4050 * 4104 *
4051 * This waits to be signaled for completion of a specific task. It can be 4105 * This waits to be signaled for completion of a specific task. It can be
4052 * interrupted by a kill signal. 4106 * interrupted by a kill signal.
4053 */ 4107 */
4054 int __sched wait_for_completion_killable(struct completion *x) 4108 int __sched wait_for_completion_killable(struct completion *x)
4055 { 4109 {
4056 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 4110 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
4057 if (t == -ERESTARTSYS) 4111 if (t == -ERESTARTSYS)
4058 return t; 4112 return t;
4059 return 0; 4113 return 0;
4060 } 4114 }
4061 EXPORT_SYMBOL(wait_for_completion_killable); 4115 EXPORT_SYMBOL(wait_for_completion_killable);
4062 4116
4063 /** 4117 /**
4064 * try_wait_for_completion - try to decrement a completion without blocking 4118 * try_wait_for_completion - try to decrement a completion without blocking
4065 * @x: completion structure 4119 * @x: completion structure
4066 * 4120 *
4067 * Returns: 0 if a decrement cannot be done without blocking 4121 * Returns: 0 if a decrement cannot be done without blocking
4068 * 1 if a decrement succeeded. 4122 * 1 if a decrement succeeded.
4069 * 4123 *
4070 * If a completion is being used as a counting completion, 4124 * If a completion is being used as a counting completion,
4071 * attempt to decrement the counter without blocking. This 4125 * attempt to decrement the counter without blocking. This
4072 * enables us to avoid waiting if the resource the completion 4126 * enables us to avoid waiting if the resource the completion
4073 * is protecting is not available. 4127 * is protecting is not available.
4074 */ 4128 */
4075 bool try_wait_for_completion(struct completion *x) 4129 bool try_wait_for_completion(struct completion *x)
4076 { 4130 {
4077 unsigned long flags; 4131 unsigned long flags;
4078 int ret = 1; 4132 int ret = 1;
4079 4133
4080 spin_lock_irqsave(&x->wait.lock, flags); 4134 spin_lock_irqsave(&x->wait.lock, flags);
4081 if (!x->done) 4135 if (!x->done)
4082 ret = 0; 4136 ret = 0;
4083 else 4137 else
4084 x->done--; 4138 x->done--;
4085 spin_unlock_irqrestore(&x->wait.lock, flags); 4139 spin_unlock_irqrestore(&x->wait.lock, flags);
4086 return ret; 4140 return ret;
4087 } 4141 }
4088 EXPORT_SYMBOL(try_wait_for_completion); 4142 EXPORT_SYMBOL(try_wait_for_completion);
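/*
 * Editorial example (not part of this diff): a hypothetical fast path that
 * consumes one unit of a counting completion only if that cannot block.
 */
#include <linux/completion.h>

static bool fast_path_acquire(struct completion *resource_ready)
{
        if (try_wait_for_completion(resource_ready))
                return true;    /* done counter was > 0, one unit consumed */

        return false;           /* would have blocked; take the slow path */
}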
4089 4143
4090 /** 4144 /**
4091 * completion_done - Test to see if a completion has any waiters 4145 * completion_done - Test to see if a completion has any waiters
4092 * @x: completion structure 4146 * @x: completion structure
4093 * 4147 *
4094 * Returns: 0 if there are waiters (wait_for_completion() in progress) 4148 * Returns: 0 if there are waiters (wait_for_completion() in progress)
4095 * 1 if there are no waiters. 4149 * 1 if there are no waiters.
4096 * 4150 *
4097 */ 4151 */
4098 bool completion_done(struct completion *x) 4152 bool completion_done(struct completion *x)
4099 { 4153 {
4100 unsigned long flags; 4154 unsigned long flags;
4101 int ret = 1; 4155 int ret = 1;
4102 4156
4103 spin_lock_irqsave(&x->wait.lock, flags); 4157 spin_lock_irqsave(&x->wait.lock, flags);
4104 if (!x->done) 4158 if (!x->done)
4105 ret = 0; 4159 ret = 0;
4106 spin_unlock_irqrestore(&x->wait.lock, flags); 4160 spin_unlock_irqrestore(&x->wait.lock, flags);
4107 return ret; 4161 return ret;
4108 } 4162 }
4109 EXPORT_SYMBOL(completion_done); 4163 EXPORT_SYMBOL(completion_done);
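/*
 * Editorial example (not part of this diff): a hypothetical teardown check.
 * completion_done() only samples x->done under the wait-queue lock; it does
 * not keep new waiters from arriving afterwards.
 */
#include <linux/completion.h>

static bool safe_to_teardown(struct completion *x)
{
        return completion_done(x);      /* true when no waiter is pending */
}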
4110 4164
4111 static long __sched 4165 static long __sched
4112 sleep_on_common(wait_queue_head_t *q, int state, long timeout) 4166 sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4113 { 4167 {
4114 unsigned long flags; 4168 unsigned long flags;
4115 wait_queue_t wait; 4169 wait_queue_t wait;
4116 4170
4117 init_waitqueue_entry(&wait, current); 4171 init_waitqueue_entry(&wait, current);
4118 4172
4119 __set_current_state(state); 4173 __set_current_state(state);
4120 4174
4121 spin_lock_irqsave(&q->lock, flags); 4175 spin_lock_irqsave(&q->lock, flags);
4122 __add_wait_queue(q, &wait); 4176 __add_wait_queue(q, &wait);
4123 spin_unlock(&q->lock); 4177 spin_unlock(&q->lock);
4124 timeout = schedule_timeout(timeout); 4178 timeout = schedule_timeout(timeout);
4125 spin_lock_irq(&q->lock); 4179 spin_lock_irq(&q->lock);
4126 __remove_wait_queue(q, &wait); 4180 __remove_wait_queue(q, &wait);
4127 spin_unlock_irqrestore(&q->lock, flags); 4181 spin_unlock_irqrestore(&q->lock, flags);
4128 4182
4129 return timeout; 4183 return timeout;
4130 } 4184 }
4131 4185
4132 void __sched interruptible_sleep_on(wait_queue_head_t *q) 4186 void __sched interruptible_sleep_on(wait_queue_head_t *q)
4133 { 4187 {
4134 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 4188 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4135 } 4189 }
4136 EXPORT_SYMBOL(interruptible_sleep_on); 4190 EXPORT_SYMBOL(interruptible_sleep_on);
4137 4191
4138 long __sched 4192 long __sched
4139 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 4193 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
4140 { 4194 {
4141 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); 4195 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
4142 } 4196 }
4143 EXPORT_SYMBOL(interruptible_sleep_on_timeout); 4197 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4144 4198
4145 void __sched sleep_on(wait_queue_head_t *q) 4199 void __sched sleep_on(wait_queue_head_t *q)
4146 { 4200 {
4147 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 4201 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
4148 } 4202 }
4149 EXPORT_SYMBOL(sleep_on); 4203 EXPORT_SYMBOL(sleep_on);
4150 4204
4151 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 4205 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
4152 { 4206 {
4153 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); 4207 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
4154 } 4208 }
4155 EXPORT_SYMBOL(sleep_on_timeout); 4209 EXPORT_SYMBOL(sleep_on_timeout);
4156 4210
4157 #ifdef CONFIG_RT_MUTEXES 4211 #ifdef CONFIG_RT_MUTEXES
4158 4212
4159 /* 4213 /*
4160 * rt_mutex_setprio - set the current priority of a task 4214 * rt_mutex_setprio - set the current priority of a task
4161 * @p: task 4215 * @p: task
4162 * @prio: prio value (kernel-internal form) 4216 * @prio: prio value (kernel-internal form)
4163 * 4217 *
4164 * This function changes the 'effective' priority of a task. It does 4218 * This function changes the 'effective' priority of a task. It does
4165 * not touch ->normal_prio like __setscheduler(). 4219 * not touch ->normal_prio like __setscheduler().
4166 * 4220 *
4167 * Used by the rt_mutex code to implement priority inheritance logic. 4221 * Used by the rt_mutex code to implement priority inheritance logic.
4168 */ 4222 */
4169 void rt_mutex_setprio(struct task_struct *p, int prio) 4223 void rt_mutex_setprio(struct task_struct *p, int prio)
4170 { 4224 {
4171 unsigned long flags; 4225 unsigned long flags;
4172 int oldprio, on_rq, running; 4226 int oldprio, on_rq, running;
4173 struct rq *rq; 4227 struct rq *rq;
4174 const struct sched_class *prev_class; 4228 const struct sched_class *prev_class;
4175 4229
4176 BUG_ON(prio < 0 || prio > MAX_PRIO); 4230 BUG_ON(prio < 0 || prio > MAX_PRIO);
4177 4231
4178 rq = task_rq_lock(p, &flags); 4232 rq = task_rq_lock(p, &flags);
4179 4233
4180 oldprio = p->prio; 4234 oldprio = p->prio;
4181 prev_class = p->sched_class; 4235 prev_class = p->sched_class;
4182 on_rq = p->se.on_rq; 4236 on_rq = p->se.on_rq;
4183 running = task_current(rq, p); 4237 running = task_current(rq, p);
4184 if (on_rq) 4238 if (on_rq)
4185 dequeue_task(rq, p, 0); 4239 dequeue_task(rq, p, 0);
4186 if (running) 4240 if (running)
4187 p->sched_class->put_prev_task(rq, p); 4241 p->sched_class->put_prev_task(rq, p);
4188 4242
4189 if (rt_prio(prio)) 4243 if (rt_prio(prio))
4190 p->sched_class = &rt_sched_class; 4244 p->sched_class = &rt_sched_class;
4191 else 4245 else
4192 p->sched_class = &fair_sched_class; 4246 p->sched_class = &fair_sched_class;
4193 4247
4194 p->prio = prio; 4248 p->prio = prio;
4195 4249
4196 if (running) 4250 if (running)
4197 p->sched_class->set_curr_task(rq); 4251 p->sched_class->set_curr_task(rq);
4198 if (on_rq) { 4252 if (on_rq) {
4199 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4253 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4200 4254
4201 check_class_changed(rq, p, prev_class, oldprio, running); 4255 check_class_changed(rq, p, prev_class, oldprio, running);
4202 } 4256 }
4203 task_rq_unlock(rq, &flags); 4257 task_rq_unlock(rq, &flags);
4204 } 4258 }
4205 4259
4206 #endif 4260 #endif
4207 4261
4208 void set_user_nice(struct task_struct *p, long nice) 4262 void set_user_nice(struct task_struct *p, long nice)
4209 { 4263 {
4210 int old_prio, delta, on_rq; 4264 int old_prio, delta, on_rq;
4211 unsigned long flags; 4265 unsigned long flags;
4212 struct rq *rq; 4266 struct rq *rq;
4213 4267
4214 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 4268 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
4215 return; 4269 return;
4216 /* 4270 /*
4217 * We have to be careful, if called from sys_setpriority(), 4271 * We have to be careful, if called from sys_setpriority(),
4218 * the task might be in the middle of scheduling on another CPU. 4272 * the task might be in the middle of scheduling on another CPU.
4219 */ 4273 */
4220 rq = task_rq_lock(p, &flags); 4274 rq = task_rq_lock(p, &flags);
4221 /* 4275 /*
4222 * The RT priorities are set via sched_setscheduler(), but we still 4276 * The RT priorities are set via sched_setscheduler(), but we still
4223 * allow the 'normal' nice value to be set - but as expected 4277 * allow the 'normal' nice value to be set - but as expected
4224 * it won't have any effect on scheduling until the task is 4278 * it won't have any effect on scheduling until the task is
4225 * SCHED_FIFO/SCHED_RR: 4279 * SCHED_FIFO/SCHED_RR:
4226 */ 4280 */
4227 if (task_has_rt_policy(p)) { 4281 if (task_has_rt_policy(p)) {
4228 p->static_prio = NICE_TO_PRIO(nice); 4282 p->static_prio = NICE_TO_PRIO(nice);
4229 goto out_unlock; 4283 goto out_unlock;
4230 } 4284 }
4231 on_rq = p->se.on_rq; 4285 on_rq = p->se.on_rq;
4232 if (on_rq) 4286 if (on_rq)
4233 dequeue_task(rq, p, 0); 4287 dequeue_task(rq, p, 0);
4234 4288
4235 p->static_prio = NICE_TO_PRIO(nice); 4289 p->static_prio = NICE_TO_PRIO(nice);
4236 set_load_weight(p); 4290 set_load_weight(p);
4237 old_prio = p->prio; 4291 old_prio = p->prio;
4238 p->prio = effective_prio(p); 4292 p->prio = effective_prio(p);
4239 delta = p->prio - old_prio; 4293 delta = p->prio - old_prio;
4240 4294
4241 if (on_rq) { 4295 if (on_rq) {
4242 enqueue_task(rq, p, 0); 4296 enqueue_task(rq, p, 0);
4243 /* 4297 /*
4244 * If the task increased its priority or is running and 4298 * If the task increased its priority or is running and
4245 * lowered its priority, then reschedule its CPU: 4299 * lowered its priority, then reschedule its CPU:
4246 */ 4300 */
4247 if (delta < 0 || (delta > 0 && task_running(rq, p))) 4301 if (delta < 0 || (delta > 0 && task_running(rq, p)))
4248 resched_task(rq->curr); 4302 resched_task(rq->curr);
4249 } 4303 }
4250 out_unlock: 4304 out_unlock:
4251 task_rq_unlock(rq, &flags); 4305 task_rq_unlock(rq, &flags);
4252 } 4306 }
4253 EXPORT_SYMBOL(set_user_nice); 4307 EXPORT_SYMBOL(set_user_nice);
4254 4308
4255 /* 4309 /*
4256 * can_nice - check if a task can reduce its nice value 4310 * can_nice - check if a task can reduce its nice value
4257 * @p: task 4311 * @p: task
4258 * @nice: nice value 4312 * @nice: nice value
4259 */ 4313 */
4260 int can_nice(const struct task_struct *p, const int nice) 4314 int can_nice(const struct task_struct *p, const int nice)
4261 { 4315 {
4262 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4316 /* convert nice value [19,-20] to rlimit style value [1,40] */
4263 int nice_rlim = 20 - nice; 4317 int nice_rlim = 20 - nice;
4264 4318
4265 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || 4319 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
4266 capable(CAP_SYS_NICE)); 4320 capable(CAP_SYS_NICE));
4267 } 4321 }
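/*
 * Editorial worked example (not part of this diff): the mapping used in
 * can_nice() is rlim = 20 - nice, so nice 19 needs RLIMIT_NICE >= 1,
 * nice 0 needs >= 20 and nice -20 needs >= 40 (or CAP_SYS_NICE). The helper
 * below is hypothetical and only restates that arithmetic.
 */
static inline int nice_to_rlimit_example(int nice)
{
        return 20 - nice;       /* e.g. nice -10 requires RLIMIT_NICE >= 30 */
}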
4268 4322
4269 #ifdef __ARCH_WANT_SYS_NICE 4323 #ifdef __ARCH_WANT_SYS_NICE
4270 4324
4271 /* 4325 /*
4272 * sys_nice - change the priority of the current process. 4326 * sys_nice - change the priority of the current process.
4273 * @increment: priority increment 4327 * @increment: priority increment
4274 * 4328 *
4275 * sys_setpriority is a more generic, but much slower function that 4329 * sys_setpriority is a more generic, but much slower function that
4276 * does similar things. 4330 * does similar things.
4277 */ 4331 */
4278 SYSCALL_DEFINE1(nice, int, increment) 4332 SYSCALL_DEFINE1(nice, int, increment)
4279 { 4333 {
4280 long nice, retval; 4334 long nice, retval;
4281 4335
4282 /* 4336 /*
4283 * Setpriority might change our priority at the same moment. 4337 * Setpriority might change our priority at the same moment.
4284 * We don't have to worry. Conceptually one call occurs first 4338 * We don't have to worry. Conceptually one call occurs first
4285 * and we have a single winner. 4339 * and we have a single winner.
4286 */ 4340 */
4287 if (increment < -40) 4341 if (increment < -40)
4288 increment = -40; 4342 increment = -40;
4289 if (increment > 40) 4343 if (increment > 40)
4290 increment = 40; 4344 increment = 40;
4291 4345
4292 nice = TASK_NICE(current) + increment; 4346 nice = TASK_NICE(current) + increment;
4293 if (nice < -20) 4347 if (nice < -20)
4294 nice = -20; 4348 nice = -20;
4295 if (nice > 19) 4349 if (nice > 19)
4296 nice = 19; 4350 nice = 19;
4297 4351
4298 if (increment < 0 && !can_nice(current, nice)) 4352 if (increment < 0 && !can_nice(current, nice))
4299 return -EPERM; 4353 return -EPERM;
4300 4354
4301 retval = security_task_setnice(current, nice); 4355 retval = security_task_setnice(current, nice);
4302 if (retval) 4356 if (retval)
4303 return retval; 4357 return retval;
4304 4358
4305 set_user_nice(current, nice); 4359 set_user_nice(current, nice);
4306 return 0; 4360 return 0;
4307 } 4361 }
4308 4362
4309 #endif 4363 #endif
4310 4364
4311 /** 4365 /**
4312 * task_prio - return the priority value of a given task. 4366 * task_prio - return the priority value of a given task.
4313 * @p: the task in question. 4367 * @p: the task in question.
4314 * 4368 *
4315 * This is the priority value as seen by users in /proc. 4369 * This is the priority value as seen by users in /proc.
4316 * RT tasks are offset by -200. Normal tasks are centered 4370 * RT tasks are offset by -200. Normal tasks are centered
4317 * around 0, value goes from -16 to +15. 4371 * around 0, value goes from -16 to +15.
4318 */ 4372 */
4319 int task_prio(const struct task_struct *p) 4373 int task_prio(const struct task_struct *p)
4320 { 4374 {
4321 return p->prio - MAX_RT_PRIO; 4375 return p->prio - MAX_RT_PRIO;
4322 } 4376 }
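/*
 * Editorial worked example (not part of this diff), following the code above
 * rather than the older comment text: with MAX_RT_PRIO == 100 and
 * NICE_TO_PRIO(n) == 120 + n,
 *
 *   SCHED_NORMAL, nice   0    ->  p->prio 120  ->  task_prio() ==  20
 *   SCHED_NORMAL, nice -20    ->  p->prio 100  ->  task_prio() ==   0
 *   SCHED_FIFO, rt_priority 1 ->  p->prio  98  ->  task_prio() ==  -2
 */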
4323 4377
4324 /** 4378 /**
4325 * task_nice - return the nice value of a given task. 4379 * task_nice - return the nice value of a given task.
4326 * @p: the task in question. 4380 * @p: the task in question.
4327 */ 4381 */
4328 int task_nice(const struct task_struct *p) 4382 int task_nice(const struct task_struct *p)
4329 { 4383 {
4330 return TASK_NICE(p); 4384 return TASK_NICE(p);
4331 } 4385 }
4332 EXPORT_SYMBOL(task_nice); 4386 EXPORT_SYMBOL(task_nice);
4333 4387
4334 /** 4388 /**
4335 * idle_cpu - is a given cpu idle currently? 4389 * idle_cpu - is a given cpu idle currently?
4336 * @cpu: the processor in question. 4390 * @cpu: the processor in question.
4337 */ 4391 */
4338 int idle_cpu(int cpu) 4392 int idle_cpu(int cpu)
4339 { 4393 {
4340 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 4394 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4341 } 4395 }
4342 4396
4343 /** 4397 /**
4344 * idle_task - return the idle task for a given cpu. 4398 * idle_task - return the idle task for a given cpu.
4345 * @cpu: the processor in question. 4399 * @cpu: the processor in question.
4346 */ 4400 */
4347 struct task_struct *idle_task(int cpu) 4401 struct task_struct *idle_task(int cpu)
4348 { 4402 {
4349 return cpu_rq(cpu)->idle; 4403 return cpu_rq(cpu)->idle;
4350 } 4404 }
4351 4405
4352 /** 4406 /**
4353 * find_process_by_pid - find a process with a matching PID value. 4407 * find_process_by_pid - find a process with a matching PID value.
4354 * @pid: the pid in question. 4408 * @pid: the pid in question.
4355 */ 4409 */
4356 static struct task_struct *find_process_by_pid(pid_t pid) 4410 static struct task_struct *find_process_by_pid(pid_t pid)
4357 { 4411 {
4358 return pid ? find_task_by_vpid(pid) : current; 4412 return pid ? find_task_by_vpid(pid) : current;
4359 } 4413 }
4360 4414
4361 /* Actually do priority change: must hold rq lock. */ 4415 /* Actually do priority change: must hold rq lock. */
4362 static void 4416 static void
4363 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 4417 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4364 { 4418 {
4365 BUG_ON(p->se.on_rq); 4419 BUG_ON(p->se.on_rq);
4366 4420
4367 p->policy = policy; 4421 p->policy = policy;
4368 p->rt_priority = prio; 4422 p->rt_priority = prio;
4369 p->normal_prio = normal_prio(p); 4423 p->normal_prio = normal_prio(p);
4370 /* we are holding p->pi_lock already */ 4424 /* we are holding p->pi_lock already */
4371 p->prio = rt_mutex_getprio(p); 4425 p->prio = rt_mutex_getprio(p);
4372 if (rt_prio(p->prio)) 4426 if (rt_prio(p->prio))
4373 p->sched_class = &rt_sched_class; 4427 p->sched_class = &rt_sched_class;
4374 else 4428 else
4375 p->sched_class = &fair_sched_class; 4429 p->sched_class = &fair_sched_class;
4376 set_load_weight(p); 4430 set_load_weight(p);
4377 } 4431 }
4378 4432
4379 /* 4433 /*
4380 * check the target process has a UID that matches the current process's 4434 * check the target process has a UID that matches the current process's
4381 */ 4435 */
4382 static bool check_same_owner(struct task_struct *p) 4436 static bool check_same_owner(struct task_struct *p)
4383 { 4437 {
4384 const struct cred *cred = current_cred(), *pcred; 4438 const struct cred *cred = current_cred(), *pcred;
4385 bool match; 4439 bool match;
4386 4440
4387 rcu_read_lock(); 4441 rcu_read_lock();
4388 pcred = __task_cred(p); 4442 pcred = __task_cred(p);
4389 match = (cred->euid == pcred->euid || 4443 match = (cred->euid == pcred->euid ||
4390 cred->euid == pcred->uid); 4444 cred->euid == pcred->uid);
4391 rcu_read_unlock(); 4445 rcu_read_unlock();
4392 return match; 4446 return match;
4393 } 4447 }
4394 4448
4395 static int __sched_setscheduler(struct task_struct *p, int policy, 4449 static int __sched_setscheduler(struct task_struct *p, int policy,
4396 struct sched_param *param, bool user) 4450 struct sched_param *param, bool user)
4397 { 4451 {
4398 int retval, oldprio, oldpolicy = -1, on_rq, running; 4452 int retval, oldprio, oldpolicy = -1, on_rq, running;
4399 unsigned long flags; 4453 unsigned long flags;
4400 const struct sched_class *prev_class; 4454 const struct sched_class *prev_class;
4401 struct rq *rq; 4455 struct rq *rq;
4402 int reset_on_fork; 4456 int reset_on_fork;
4403 4457
4404 /* may grab non-irq protected spin_locks */ 4458 /* may grab non-irq protected spin_locks */
4405 BUG_ON(in_interrupt()); 4459 BUG_ON(in_interrupt());
4406 recheck: 4460 recheck:
4407 /* double check policy once rq lock held */ 4461 /* double check policy once rq lock held */
4408 if (policy < 0) { 4462 if (policy < 0) {
4409 reset_on_fork = p->sched_reset_on_fork; 4463 reset_on_fork = p->sched_reset_on_fork;
4410 policy = oldpolicy = p->policy; 4464 policy = oldpolicy = p->policy;
4411 } else { 4465 } else {
4412 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); 4466 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
4413 policy &= ~SCHED_RESET_ON_FORK; 4467 policy &= ~SCHED_RESET_ON_FORK;
4414 4468
4415 if (policy != SCHED_FIFO && policy != SCHED_RR && 4469 if (policy != SCHED_FIFO && policy != SCHED_RR &&
4416 policy != SCHED_NORMAL && policy != SCHED_BATCH && 4470 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4417 policy != SCHED_IDLE) 4471 policy != SCHED_IDLE)
4418 return -EINVAL; 4472 return -EINVAL;
4419 } 4473 }
4420 4474
4421 /* 4475 /*
4422 * Valid priorities for SCHED_FIFO and SCHED_RR are 4476 * Valid priorities for SCHED_FIFO and SCHED_RR are
4423 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 4477 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4424 * SCHED_BATCH and SCHED_IDLE is 0. 4478 * SCHED_BATCH and SCHED_IDLE is 0.
4425 */ 4479 */
4426 if (param->sched_priority < 0 || 4480 if (param->sched_priority < 0 ||
4427 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 4481 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4428 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 4482 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4429 return -EINVAL; 4483 return -EINVAL;
4430 if (rt_policy(policy) != (param->sched_priority != 0)) 4484 if (rt_policy(policy) != (param->sched_priority != 0))
4431 return -EINVAL; 4485 return -EINVAL;
4432 4486
4433 /* 4487 /*
4434 * Allow unprivileged RT tasks to decrease priority: 4488 * Allow unprivileged RT tasks to decrease priority:
4435 */ 4489 */
4436 if (user && !capable(CAP_SYS_NICE)) { 4490 if (user && !capable(CAP_SYS_NICE)) {
4437 if (rt_policy(policy)) { 4491 if (rt_policy(policy)) {
4438 unsigned long rlim_rtprio; 4492 unsigned long rlim_rtprio;
4439 4493
4440 if (!lock_task_sighand(p, &flags)) 4494 if (!lock_task_sighand(p, &flags))
4441 return -ESRCH; 4495 return -ESRCH;
4442 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); 4496 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
4443 unlock_task_sighand(p, &flags); 4497 unlock_task_sighand(p, &flags);
4444 4498
4445 /* can't set/change the rt policy */ 4499 /* can't set/change the rt policy */
4446 if (policy != p->policy && !rlim_rtprio) 4500 if (policy != p->policy && !rlim_rtprio)
4447 return -EPERM; 4501 return -EPERM;
4448 4502
4449 /* can't increase priority */ 4503 /* can't increase priority */
4450 if (param->sched_priority > p->rt_priority && 4504 if (param->sched_priority > p->rt_priority &&
4451 param->sched_priority > rlim_rtprio) 4505 param->sched_priority > rlim_rtprio)
4452 return -EPERM; 4506 return -EPERM;
4453 } 4507 }
4454 /* 4508 /*
4455 * Like positive nice levels, don't allow tasks to 4509 * Like positive nice levels, don't allow tasks to
4456 * move out of SCHED_IDLE either: 4510 * move out of SCHED_IDLE either:
4457 */ 4511 */
4458 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) 4512 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4459 return -EPERM; 4513 return -EPERM;
4460 4514
4461 /* can't change other user's priorities */ 4515 /* can't change other user's priorities */
4462 if (!check_same_owner(p)) 4516 if (!check_same_owner(p))
4463 return -EPERM; 4517 return -EPERM;
4464 4518
4465 /* Normal users shall not reset the sched_reset_on_fork flag */ 4519 /* Normal users shall not reset the sched_reset_on_fork flag */
4466 if (p->sched_reset_on_fork && !reset_on_fork) 4520 if (p->sched_reset_on_fork && !reset_on_fork)
4467 return -EPERM; 4521 return -EPERM;
4468 } 4522 }
4469 4523
4470 if (user) { 4524 if (user) {
4471 #ifdef CONFIG_RT_GROUP_SCHED 4525 #ifdef CONFIG_RT_GROUP_SCHED
4472 /* 4526 /*
4473 * Do not allow realtime tasks into groups that have no runtime 4527 * Do not allow realtime tasks into groups that have no runtime
4474 * assigned. 4528 * assigned.
4475 */ 4529 */
4476 if (rt_bandwidth_enabled() && rt_policy(policy) && 4530 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4477 task_group(p)->rt_bandwidth.rt_runtime == 0) 4531 task_group(p)->rt_bandwidth.rt_runtime == 0)
4478 return -EPERM; 4532 return -EPERM;
4479 #endif 4533 #endif
4480 4534
4481 retval = security_task_setscheduler(p, policy, param); 4535 retval = security_task_setscheduler(p, policy, param);
4482 if (retval) 4536 if (retval)
4483 return retval; 4537 return retval;
4484 } 4538 }
4485 4539
4486 /* 4540 /*
4487 * make sure no PI-waiters arrive (or leave) while we are 4541 * make sure no PI-waiters arrive (or leave) while we are
4488 * changing the priority of the task: 4542 * changing the priority of the task:
4489 */ 4543 */
4490 raw_spin_lock_irqsave(&p->pi_lock, flags); 4544 raw_spin_lock_irqsave(&p->pi_lock, flags);
4491 /* 4545 /*
4492 * To be able to change p->policy safely, the appropriate 4546 * To be able to change p->policy safely, the appropriate
4493 * runqueue lock must be held. 4547 * runqueue lock must be held.
4494 */ 4548 */
4495 rq = __task_rq_lock(p); 4549 rq = __task_rq_lock(p);
4496 /* recheck policy now with rq lock held */ 4550 /* recheck policy now with rq lock held */
4497 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4551 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4498 policy = oldpolicy = -1; 4552 policy = oldpolicy = -1;
4499 __task_rq_unlock(rq); 4553 __task_rq_unlock(rq);
4500 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4554 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4501 goto recheck; 4555 goto recheck;
4502 } 4556 }
4503 on_rq = p->se.on_rq; 4557 on_rq = p->se.on_rq;
4504 running = task_current(rq, p); 4558 running = task_current(rq, p);
4505 if (on_rq) 4559 if (on_rq)
4506 deactivate_task(rq, p, 0); 4560 deactivate_task(rq, p, 0);
4507 if (running) 4561 if (running)
4508 p->sched_class->put_prev_task(rq, p); 4562 p->sched_class->put_prev_task(rq, p);
4509 4563
4510 p->sched_reset_on_fork = reset_on_fork; 4564 p->sched_reset_on_fork = reset_on_fork;
4511 4565
4512 oldprio = p->prio; 4566 oldprio = p->prio;
4513 prev_class = p->sched_class; 4567 prev_class = p->sched_class;
4514 __setscheduler(rq, p, policy, param->sched_priority); 4568 __setscheduler(rq, p, policy, param->sched_priority);
4515 4569
4516 if (running) 4570 if (running)
4517 p->sched_class->set_curr_task(rq); 4571 p->sched_class->set_curr_task(rq);
4518 if (on_rq) { 4572 if (on_rq) {
4519 activate_task(rq, p, 0); 4573 activate_task(rq, p, 0);
4520 4574
4521 check_class_changed(rq, p, prev_class, oldprio, running); 4575 check_class_changed(rq, p, prev_class, oldprio, running);
4522 } 4576 }
4523 __task_rq_unlock(rq); 4577 __task_rq_unlock(rq);
4524 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4578 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4525 4579
4526 rt_mutex_adjust_pi(p); 4580 rt_mutex_adjust_pi(p);
4527 4581
4528 return 0; 4582 return 0;
4529 } 4583 }
4530 4584
4531 /** 4585 /**
4532 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 4586 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4533 * @p: the task in question. 4587 * @p: the task in question.
4534 * @policy: new policy. 4588 * @policy: new policy.
4535 * @param: structure containing the new RT priority. 4589 * @param: structure containing the new RT priority.
4536 * 4590 *
4537 * NOTE that the task may be already dead. 4591 * NOTE that the task may be already dead.
4538 */ 4592 */
4539 int sched_setscheduler(struct task_struct *p, int policy, 4593 int sched_setscheduler(struct task_struct *p, int policy,
4540 struct sched_param *param) 4594 struct sched_param *param)
4541 { 4595 {
4542 return __sched_setscheduler(p, policy, param, true); 4596 return __sched_setscheduler(p, policy, param, true);
4543 } 4597 }
4544 EXPORT_SYMBOL_GPL(sched_setscheduler); 4598 EXPORT_SYMBOL_GPL(sched_setscheduler);
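/*
 * Editorial example (not part of this diff): the common in-kernel pattern for
 * promoting a kernel thread to SCHED_FIFO. The priority value 50 and the
 * helper name are arbitrary choices for illustration.
 */
#include <linux/sched.h>

static int make_thread_rt(struct task_struct *tsk)
{
        struct sched_param param = { .sched_priority = 50 };

        return sched_setscheduler(tsk, SCHED_FIFO, &param);
}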
4545 4599
4546 /** 4600 /**
4547 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 4601 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
4548 * @p: the task in question. 4602 * @p: the task in question.
4549 * @policy: new policy. 4603 * @policy: new policy.
4550 * @param: structure containing the new RT priority. 4604 * @param: structure containing the new RT priority.
4551 * 4605 *
4552 * Just like sched_setscheduler, only don't bother checking if the 4606 * Just like sched_setscheduler, only don't bother checking if the
4553 * current context has permission. For example, this is needed in 4607 * current context has permission. For example, this is needed in
4554 * stop_machine(): we create temporary high priority worker threads, 4608 * stop_machine(): we create temporary high priority worker threads,
4555 * but our caller might not have that capability. 4609 * but our caller might not have that capability.
4556 */ 4610 */
4557 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 4611 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4558 struct sched_param *param) 4612 struct sched_param *param)
4559 { 4613 {
4560 return __sched_setscheduler(p, policy, param, false); 4614 return __sched_setscheduler(p, policy, param, false);
4561 } 4615 }
4562 4616
4563 static int 4617 static int
4564 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 4618 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4565 { 4619 {
4566 struct sched_param lparam; 4620 struct sched_param lparam;
4567 struct task_struct *p; 4621 struct task_struct *p;
4568 int retval; 4622 int retval;
4569 4623
4570 if (!param || pid < 0) 4624 if (!param || pid < 0)
4571 return -EINVAL; 4625 return -EINVAL;
4572 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 4626 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4573 return -EFAULT; 4627 return -EFAULT;
4574 4628
4575 rcu_read_lock(); 4629 rcu_read_lock();
4576 retval = -ESRCH; 4630 retval = -ESRCH;
4577 p = find_process_by_pid(pid); 4631 p = find_process_by_pid(pid);
4578 if (p != NULL) 4632 if (p != NULL)
4579 retval = sched_setscheduler(p, policy, &lparam); 4633 retval = sched_setscheduler(p, policy, &lparam);
4580 rcu_read_unlock(); 4634 rcu_read_unlock();
4581 4635
4582 return retval; 4636 return retval;
4583 } 4637 }
4584 4638
4585 /** 4639 /**
4586 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 4640 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4587 * @pid: the pid in question. 4641 * @pid: the pid in question.
4588 * @policy: new policy. 4642 * @policy: new policy.
4589 * @param: structure containing the new RT priority. 4643 * @param: structure containing the new RT priority.
4590 */ 4644 */
4591 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 4645 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
4592 struct sched_param __user *, param) 4646 struct sched_param __user *, param)
4593 { 4647 {
4594 /* negative values for policy are not valid */ 4648 /* negative values for policy are not valid */
4595 if (policy < 0) 4649 if (policy < 0)
4596 return -EINVAL; 4650 return -EINVAL;
4597 4651
4598 return do_sched_setscheduler(pid, policy, param); 4652 return do_sched_setscheduler(pid, policy, param);
4599 } 4653 }
4600 4654
4601 /** 4655 /**
4602 * sys_sched_setparam - set/change the RT priority of a thread 4656 * sys_sched_setparam - set/change the RT priority of a thread
4603 * @pid: the pid in question. 4657 * @pid: the pid in question.
4604 * @param: structure containing the new RT priority. 4658 * @param: structure containing the new RT priority.
4605 */ 4659 */
4606 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 4660 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4607 { 4661 {
4608 return do_sched_setscheduler(pid, -1, param); 4662 return do_sched_setscheduler(pid, -1, param);
4609 } 4663 }
4610 4664
4611 /** 4665 /**
4612 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 4666 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4613 * @pid: the pid in question. 4667 * @pid: the pid in question.
4614 */ 4668 */
4615 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 4669 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4616 { 4670 {
4617 struct task_struct *p; 4671 struct task_struct *p;
4618 int retval; 4672 int retval;
4619 4673
4620 if (pid < 0) 4674 if (pid < 0)
4621 return -EINVAL; 4675 return -EINVAL;
4622 4676
4623 retval = -ESRCH; 4677 retval = -ESRCH;
4624 rcu_read_lock(); 4678 rcu_read_lock();
4625 p = find_process_by_pid(pid); 4679 p = find_process_by_pid(pid);
4626 if (p) { 4680 if (p) {
4627 retval = security_task_getscheduler(p); 4681 retval = security_task_getscheduler(p);
4628 if (!retval) 4682 if (!retval)
4629 retval = p->policy 4683 retval = p->policy
4630 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 4684 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
4631 } 4685 }
4632 rcu_read_unlock(); 4686 rcu_read_unlock();
4633 return retval; 4687 return retval;
4634 } 4688 }
4635 4689
4636 /** 4690 /**
4637 * sys_sched_getparam - get the RT priority of a thread 4691 * sys_sched_getparam - get the RT priority of a thread
4638 * @pid: the pid in question. 4692 * @pid: the pid in question.
4639 * @param: structure containing the RT priority. 4693 * @param: structure containing the RT priority.
4640 */ 4694 */
4641 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 4695 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4642 { 4696 {
4643 struct sched_param lp; 4697 struct sched_param lp;
4644 struct task_struct *p; 4698 struct task_struct *p;
4645 int retval; 4699 int retval;
4646 4700
4647 if (!param || pid < 0) 4701 if (!param || pid < 0)
4648 return -EINVAL; 4702 return -EINVAL;
4649 4703
4650 rcu_read_lock(); 4704 rcu_read_lock();
4651 p = find_process_by_pid(pid); 4705 p = find_process_by_pid(pid);
4652 retval = -ESRCH; 4706 retval = -ESRCH;
4653 if (!p) 4707 if (!p)
4654 goto out_unlock; 4708 goto out_unlock;
4655 4709
4656 retval = security_task_getscheduler(p); 4710 retval = security_task_getscheduler(p);
4657 if (retval) 4711 if (retval)
4658 goto out_unlock; 4712 goto out_unlock;
4659 4713
4660 lp.sched_priority = p->rt_priority; 4714 lp.sched_priority = p->rt_priority;
4661 rcu_read_unlock(); 4715 rcu_read_unlock();
4662 4716
4663 /* 4717 /*
4664 * This one might sleep, we cannot do it with a spinlock held ... 4718 * This one might sleep, we cannot do it with a spinlock held ...
4665 */ 4719 */
4666 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 4720 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4667 4721
4668 return retval; 4722 return retval;
4669 4723
4670 out_unlock: 4724 out_unlock:
4671 rcu_read_unlock(); 4725 rcu_read_unlock();
4672 return retval; 4726 return retval;
4673 } 4727 }
4674 4728
4675 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 4729 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4676 { 4730 {
4677 cpumask_var_t cpus_allowed, new_mask; 4731 cpumask_var_t cpus_allowed, new_mask;
4678 struct task_struct *p; 4732 struct task_struct *p;
4679 int retval; 4733 int retval;
4680 4734
4681 get_online_cpus(); 4735 get_online_cpus();
4682 rcu_read_lock(); 4736 rcu_read_lock();
4683 4737
4684 p = find_process_by_pid(pid); 4738 p = find_process_by_pid(pid);
4685 if (!p) { 4739 if (!p) {
4686 rcu_read_unlock(); 4740 rcu_read_unlock();
4687 put_online_cpus(); 4741 put_online_cpus();
4688 return -ESRCH; 4742 return -ESRCH;
4689 } 4743 }
4690 4744
4691 /* Prevent p going away */ 4745 /* Prevent p going away */
4692 get_task_struct(p); 4746 get_task_struct(p);
4693 rcu_read_unlock(); 4747 rcu_read_unlock();
4694 4748
4695 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 4749 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4696 retval = -ENOMEM; 4750 retval = -ENOMEM;
4697 goto out_put_task; 4751 goto out_put_task;
4698 } 4752 }
4699 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 4753 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4700 retval = -ENOMEM; 4754 retval = -ENOMEM;
4701 goto out_free_cpus_allowed; 4755 goto out_free_cpus_allowed;
4702 } 4756 }
4703 retval = -EPERM; 4757 retval = -EPERM;
4704 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 4758 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
4705 goto out_unlock; 4759 goto out_unlock;
4706 4760
4707 retval = security_task_setscheduler(p, 0, NULL); 4761 retval = security_task_setscheduler(p, 0, NULL);
4708 if (retval) 4762 if (retval)
4709 goto out_unlock; 4763 goto out_unlock;
4710 4764
4711 cpuset_cpus_allowed(p, cpus_allowed); 4765 cpuset_cpus_allowed(p, cpus_allowed);
4712 cpumask_and(new_mask, in_mask, cpus_allowed); 4766 cpumask_and(new_mask, in_mask, cpus_allowed);
4713 again: 4767 again:
4714 retval = set_cpus_allowed_ptr(p, new_mask); 4768 retval = set_cpus_allowed_ptr(p, new_mask);
4715 4769
4716 if (!retval) { 4770 if (!retval) {
4717 cpuset_cpus_allowed(p, cpus_allowed); 4771 cpuset_cpus_allowed(p, cpus_allowed);
4718 if (!cpumask_subset(new_mask, cpus_allowed)) { 4772 if (!cpumask_subset(new_mask, cpus_allowed)) {
4719 /* 4773 /*
4720 * We must have raced with a concurrent cpuset 4774 * We must have raced with a concurrent cpuset
4721 * update. Just reset the cpus_allowed to the 4775 * update. Just reset the cpus_allowed to the
4722 * cpuset's cpus_allowed 4776 * cpuset's cpus_allowed
4723 */ 4777 */
4724 cpumask_copy(new_mask, cpus_allowed); 4778 cpumask_copy(new_mask, cpus_allowed);
4725 goto again; 4779 goto again;
4726 } 4780 }
4727 } 4781 }
4728 out_unlock: 4782 out_unlock:
4729 free_cpumask_var(new_mask); 4783 free_cpumask_var(new_mask);
4730 out_free_cpus_allowed: 4784 out_free_cpus_allowed:
4731 free_cpumask_var(cpus_allowed); 4785 free_cpumask_var(cpus_allowed);
4732 out_put_task: 4786 out_put_task:
4733 put_task_struct(p); 4787 put_task_struct(p);
4734 put_online_cpus(); 4788 put_online_cpus();
4735 return retval; 4789 return retval;
4736 } 4790 }
4737 4791
4738 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 4792 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4739 struct cpumask *new_mask) 4793 struct cpumask *new_mask)
4740 { 4794 {
4741 if (len < cpumask_size()) 4795 if (len < cpumask_size())
4742 cpumask_clear(new_mask); 4796 cpumask_clear(new_mask);
4743 else if (len > cpumask_size()) 4797 else if (len > cpumask_size())
4744 len = cpumask_size(); 4798 len = cpumask_size();
4745 4799
4746 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 4800 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4747 } 4801 }
4748 4802
4749 /** 4803 /**
4750 * sys_sched_setaffinity - set the cpu affinity of a process 4804 * sys_sched_setaffinity - set the cpu affinity of a process
4751 * @pid: pid of the process 4805 * @pid: pid of the process
4752 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4806 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4753 * @user_mask_ptr: user-space pointer to the new cpu mask 4807 * @user_mask_ptr: user-space pointer to the new cpu mask
4754 */ 4808 */
4755 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 4809 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4756 unsigned long __user *, user_mask_ptr) 4810 unsigned long __user *, user_mask_ptr)
4757 { 4811 {
4758 cpumask_var_t new_mask; 4812 cpumask_var_t new_mask;
4759 int retval; 4813 int retval;
4760 4814
4761 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 4815 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4762 return -ENOMEM; 4816 return -ENOMEM;
4763 4817
4764 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 4818 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4765 if (retval == 0) 4819 if (retval == 0)
4766 retval = sched_setaffinity(pid, new_mask); 4820 retval = sched_setaffinity(pid, new_mask);
4767 free_cpumask_var(new_mask); 4821 free_cpumask_var(new_mask);
4768 return retval; 4822 return retval;
4769 } 4823 }
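/*
 * Editorial example (not part of this diff): pinning the calling process to
 * CPU 0 from userspace via the glibc wrapper around this syscall.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(0, &set);

        if (sched_setaffinity(0, sizeof(set), &set) == -1) {
                perror("sched_setaffinity");
                return 1;
        }
        return 0;
}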
4770 4824
4771 long sched_getaffinity(pid_t pid, struct cpumask *mask) 4825 long sched_getaffinity(pid_t pid, struct cpumask *mask)
4772 { 4826 {
4773 struct task_struct *p; 4827 struct task_struct *p;
4774 unsigned long flags; 4828 unsigned long flags;
4775 struct rq *rq; 4829 struct rq *rq;
4776 int retval; 4830 int retval;
4777 4831
4778 get_online_cpus(); 4832 get_online_cpus();
4779 rcu_read_lock(); 4833 rcu_read_lock();
4780 4834
4781 retval = -ESRCH; 4835 retval = -ESRCH;
4782 p = find_process_by_pid(pid); 4836 p = find_process_by_pid(pid);
4783 if (!p) 4837 if (!p)
4784 goto out_unlock; 4838 goto out_unlock;
4785 4839
4786 retval = security_task_getscheduler(p); 4840 retval = security_task_getscheduler(p);
4787 if (retval) 4841 if (retval)
4788 goto out_unlock; 4842 goto out_unlock;
4789 4843
4790 rq = task_rq_lock(p, &flags); 4844 rq = task_rq_lock(p, &flags);
4791 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 4845 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
4792 task_rq_unlock(rq, &flags); 4846 task_rq_unlock(rq, &flags);
4793 4847
4794 out_unlock: 4848 out_unlock:
4795 rcu_read_unlock(); 4849 rcu_read_unlock();
4796 put_online_cpus(); 4850 put_online_cpus();
4797 4851
4798 return retval; 4852 return retval;
4799 } 4853 }
4800 4854
4801 /** 4855 /**
4802 * sys_sched_getaffinity - get the cpu affinity of a process 4856 * sys_sched_getaffinity - get the cpu affinity of a process
4803 * @pid: pid of the process 4857 * @pid: pid of the process
4804 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4858 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4805 * @user_mask_ptr: user-space pointer to hold the current cpu mask 4859 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4806 */ 4860 */
4807 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 4861 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4808 unsigned long __user *, user_mask_ptr) 4862 unsigned long __user *, user_mask_ptr)
4809 { 4863 {
4810 int ret; 4864 int ret;
4811 cpumask_var_t mask; 4865 cpumask_var_t mask;
4812 4866
4813 if ((len * BITS_PER_BYTE) < nr_cpu_ids) 4867 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4814 return -EINVAL; 4868 return -EINVAL;
4815 if (len & (sizeof(unsigned long)-1)) 4869 if (len & (sizeof(unsigned long)-1))
4816 return -EINVAL; 4870 return -EINVAL;
4817 4871
4818 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 4872 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4819 return -ENOMEM; 4873 return -ENOMEM;
4820 4874
4821 ret = sched_getaffinity(pid, mask); 4875 ret = sched_getaffinity(pid, mask);
4822 if (ret == 0) { 4876 if (ret == 0) {
4823 size_t retlen = min_t(size_t, len, cpumask_size()); 4877 size_t retlen = min_t(size_t, len, cpumask_size());
4824 4878
4825 if (copy_to_user(user_mask_ptr, mask, retlen)) 4879 if (copy_to_user(user_mask_ptr, mask, retlen))
4826 ret = -EFAULT; 4880 ret = -EFAULT;
4827 else 4881 else
4828 ret = retlen; 4882 ret = retlen;
4829 } 4883 }
4830 free_cpumask_var(mask); 4884 free_cpumask_var(mask);
4831 4885
4832 return ret; 4886 return ret;
4833 } 4887 }
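/*
 * Editorial example (not part of this diff): reading the affinity mask back
 * from userspace. The glibc wrapper hides the retlen value returned by the
 * raw syscall and simply reports success or failure.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;
        int cpu;

        if (sched_getaffinity(0, sizeof(set), &set) == -1) {
                perror("sched_getaffinity");
                return 1;
        }
        for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
                if (CPU_ISSET(cpu, &set))
                        printf("allowed: cpu %d\n", cpu);
        return 0;
}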
4834 4888
4835 /** 4889 /**
4836 * sys_sched_yield - yield the current processor to other threads. 4890 * sys_sched_yield - yield the current processor to other threads.
4837 * 4891 *
4838 * This function yields the current CPU to other tasks. If there are no 4892 * This function yields the current CPU to other tasks. If there are no
4839 * other threads running on this CPU then this function will return. 4893 * other threads running on this CPU then this function will return.
4840 */ 4894 */
4841 SYSCALL_DEFINE0(sched_yield) 4895 SYSCALL_DEFINE0(sched_yield)
4842 { 4896 {
4843 struct rq *rq = this_rq_lock(); 4897 struct rq *rq = this_rq_lock();
4844 4898
4845 schedstat_inc(rq, yld_count); 4899 schedstat_inc(rq, yld_count);
4846 current->sched_class->yield_task(rq); 4900 current->sched_class->yield_task(rq);
4847 4901
4848 /* 4902 /*
4849 * Since we are going to call schedule() anyway, there's 4903 * Since we are going to call schedule() anyway, there's
4850 * no need to preempt or enable interrupts: 4904 * no need to preempt or enable interrupts:
4851 */ 4905 */
4852 __release(rq->lock); 4906 __release(rq->lock);
4853 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 4907 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4854 do_raw_spin_unlock(&rq->lock); 4908 do_raw_spin_unlock(&rq->lock);
4855 preempt_enable_no_resched(); 4909 preempt_enable_no_resched();
4856 4910
4857 schedule(); 4911 schedule();
4858 4912
4859 return 0; 4913 return 0;
4860 } 4914 }
4861 4915
4862 static inline int should_resched(void) 4916 static inline int should_resched(void)
4863 { 4917 {
4864 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); 4918 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
4865 } 4919 }
4866 4920
4867 static void __cond_resched(void) 4921 static void __cond_resched(void)
4868 { 4922 {
4869 add_preempt_count(PREEMPT_ACTIVE); 4923 add_preempt_count(PREEMPT_ACTIVE);
4870 schedule(); 4924 schedule();
4871 sub_preempt_count(PREEMPT_ACTIVE); 4925 sub_preempt_count(PREEMPT_ACTIVE);
4872 } 4926 }
4873 4927
4874 int __sched _cond_resched(void) 4928 int __sched _cond_resched(void)
4875 { 4929 {
4876 if (should_resched()) { 4930 if (should_resched()) {
4877 __cond_resched(); 4931 __cond_resched();
4878 return 1; 4932 return 1;
4879 } 4933 }
4880 return 0; 4934 return 0;
4881 } 4935 }
4882 EXPORT_SYMBOL(_cond_resched); 4936 EXPORT_SYMBOL(_cond_resched);
4883 4937
4884 /* 4938 /*
4885 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 4939 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
4886 * call schedule, and on return reacquire the lock. 4940 * call schedule, and on return reacquire the lock.
4887 * 4941 *
4888 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 4942 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4889 * operations here to prevent schedule() from being called twice (once via 4943 * operations here to prevent schedule() from being called twice (once via
4890 * spin_unlock(), once by hand). 4944 * spin_unlock(), once by hand).
4891 */ 4945 */
4892 int __cond_resched_lock(spinlock_t *lock) 4946 int __cond_resched_lock(spinlock_t *lock)
4893 { 4947 {
4894 int resched = should_resched(); 4948 int resched = should_resched();
4895 int ret = 0; 4949 int ret = 0;
4896 4950
4897 lockdep_assert_held(lock); 4951 lockdep_assert_held(lock);
4898 4952
4899 if (spin_needbreak(lock) || resched) { 4953 if (spin_needbreak(lock) || resched) {
4900 spin_unlock(lock); 4954 spin_unlock(lock);
4901 if (resched) 4955 if (resched)
4902 __cond_resched(); 4956 __cond_resched();
4903 else 4957 else
4904 cpu_relax(); 4958 cpu_relax();
4905 ret = 1; 4959 ret = 1;
4906 spin_lock(lock); 4960 spin_lock(lock);
4907 } 4961 }
4908 return ret; 4962 return ret;
4909 } 4963 }
4910 EXPORT_SYMBOL(__cond_resched_lock); 4964 EXPORT_SYMBOL(__cond_resched_lock);
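/*
 * Editorial example (not part of this diff): the typical pattern for a long
 * scan under a spinlock, using the public cond_resched_lock() wrapper. The
 * table being scanned is hypothetical and assumed stable across the brief
 * unlock window.
 */
#include <linux/sched.h>
#include <linux/spinlock.h>

static void scan_table(spinlock_t *lock, int nr_entries)
{
        int i;

        spin_lock(lock);
        for (i = 0; i < nr_entries; i++) {
                /* ... examine entry i ... */

                /* Drop the lock and reschedule if needed or contended. */
                cond_resched_lock(lock);
        }
        spin_unlock(lock);
}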
4911 4965
4912 int __sched __cond_resched_softirq(void) 4966 int __sched __cond_resched_softirq(void)
4913 { 4967 {
4914 BUG_ON(!in_softirq()); 4968 BUG_ON(!in_softirq());
4915 4969
4916 if (should_resched()) { 4970 if (should_resched()) {
4917 local_bh_enable(); 4971 local_bh_enable();
4918 __cond_resched(); 4972 __cond_resched();
4919 local_bh_disable(); 4973 local_bh_disable();
4920 return 1; 4974 return 1;
4921 } 4975 }
4922 return 0; 4976 return 0;
4923 } 4977 }
4924 EXPORT_SYMBOL(__cond_resched_softirq); 4978 EXPORT_SYMBOL(__cond_resched_softirq);
4925 4979
4926 /** 4980 /**
4927 * yield - yield the current processor to other threads. 4981 * yield - yield the current processor to other threads.
4928 * 4982 *
4929 * This is a shortcut for kernel-space yielding - it marks the 4983 * This is a shortcut for kernel-space yielding - it marks the
4930 * thread runnable and calls sys_sched_yield(). 4984 * thread runnable and calls sys_sched_yield().
4931 */ 4985 */
4932 void __sched yield(void) 4986 void __sched yield(void)
4933 { 4987 {
4934 set_current_state(TASK_RUNNING); 4988 set_current_state(TASK_RUNNING);
4935 sys_sched_yield(); 4989 sys_sched_yield();
4936 } 4990 }
4937 EXPORT_SYMBOL(yield); 4991 EXPORT_SYMBOL(yield);
4938 4992
4939 /* 4993 /*
4940 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 4994 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4941 * that process accounting knows that this is a task in IO wait state. 4995 * that process accounting knows that this is a task in IO wait state.
4942 */ 4996 */
4943 void __sched io_schedule(void) 4997 void __sched io_schedule(void)
4944 { 4998 {
4945 struct rq *rq = raw_rq(); 4999 struct rq *rq = raw_rq();
4946 5000
4947 delayacct_blkio_start(); 5001 delayacct_blkio_start();
4948 atomic_inc(&rq->nr_iowait); 5002 atomic_inc(&rq->nr_iowait);
4949 current->in_iowait = 1; 5003 current->in_iowait = 1;
4950 schedule(); 5004 schedule();
4951 current->in_iowait = 0; 5005 current->in_iowait = 0;
4952 atomic_dec(&rq->nr_iowait); 5006 atomic_dec(&rq->nr_iowait);
4953 delayacct_blkio_end(); 5007 delayacct_blkio_end();
4954 } 5008 }
4955 EXPORT_SYMBOL(io_schedule); 5009 EXPORT_SYMBOL(io_schedule);
4956 5010
4957 long __sched io_schedule_timeout(long timeout) 5011 long __sched io_schedule_timeout(long timeout)
4958 { 5012 {
4959 struct rq *rq = raw_rq(); 5013 struct rq *rq = raw_rq();
4960 long ret; 5014 long ret;
4961 5015
4962 delayacct_blkio_start(); 5016 delayacct_blkio_start();
4963 atomic_inc(&rq->nr_iowait); 5017 atomic_inc(&rq->nr_iowait);
4964 current->in_iowait = 1; 5018 current->in_iowait = 1;
4965 ret = schedule_timeout(timeout); 5019 ret = schedule_timeout(timeout);
4966 current->in_iowait = 0; 5020 current->in_iowait = 0;
4967 atomic_dec(&rq->nr_iowait); 5021 atomic_dec(&rq->nr_iowait);
4968 delayacct_blkio_end(); 5022 delayacct_blkio_end();
4969 return ret; 5023 return ret;
4970 } 5024 }
4971 5025
4972 /** 5026 /**
4973 * sys_sched_get_priority_max - return maximum RT priority. 5027 * sys_sched_get_priority_max - return maximum RT priority.
4974 * @policy: scheduling class. 5028 * @policy: scheduling class.
4975 * 5029 *
4976 * this syscall returns the maximum rt_priority that can be used 5030 * this syscall returns the maximum rt_priority that can be used
4977 * by a given scheduling class. 5031 * by a given scheduling class.
4978 */ 5032 */
4979 SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 5033 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
4980 { 5034 {
4981 int ret = -EINVAL; 5035 int ret = -EINVAL;
4982 5036
4983 switch (policy) { 5037 switch (policy) {
4984 case SCHED_FIFO: 5038 case SCHED_FIFO:
4985 case SCHED_RR: 5039 case SCHED_RR:
4986 ret = MAX_USER_RT_PRIO-1; 5040 ret = MAX_USER_RT_PRIO-1;
4987 break; 5041 break;
4988 case SCHED_NORMAL: 5042 case SCHED_NORMAL:
4989 case SCHED_BATCH: 5043 case SCHED_BATCH:
4990 case SCHED_IDLE: 5044 case SCHED_IDLE:
4991 ret = 0; 5045 ret = 0;
4992 break; 5046 break;
4993 } 5047 }
4994 return ret; 5048 return ret;
4995 } 5049 }
4996 5050
4997 /** 5051 /**
4998 * sys_sched_get_priority_min - return minimum RT priority. 5052 * sys_sched_get_priority_min - return minimum RT priority.
4999 * @policy: scheduling class. 5053 * @policy: scheduling class.
5000 * 5054 *
5001 * this syscall returns the minimum rt_priority that can be used 5055 * this syscall returns the minimum rt_priority that can be used
5002 * by a given scheduling class. 5056 * by a given scheduling class.
5003 */ 5057 */
5004 SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 5058 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5005 { 5059 {
5006 int ret = -EINVAL; 5060 int ret = -EINVAL;
5007 5061
5008 switch (policy) { 5062 switch (policy) {
5009 case SCHED_FIFO: 5063 case SCHED_FIFO:
5010 case SCHED_RR: 5064 case SCHED_RR:
5011 ret = 1; 5065 ret = 1;
5012 break; 5066 break;
5013 case SCHED_NORMAL: 5067 case SCHED_NORMAL:
5014 case SCHED_BATCH: 5068 case SCHED_BATCH:
5015 case SCHED_IDLE: 5069 case SCHED_IDLE:
5016 ret = 0; 5070 ret = 0;
5017 } 5071 }
5018 return ret; 5072 return ret;
5019 } 5073 }
5020 5074
5021 /** 5075 /**
5022 * sys_sched_rr_get_interval - return the default timeslice of a process. 5076 * sys_sched_rr_get_interval - return the default timeslice of a process.
5023 * @pid: pid of the process. 5077 * @pid: pid of the process.
5024 * @interval: userspace pointer to the timeslice value. 5078 * @interval: userspace pointer to the timeslice value.
5025 * 5079 *
5026 * this syscall writes the default timeslice value of a given process 5080 * this syscall writes the default timeslice value of a given process
5027 * into the user-space timespec buffer. A value of '0' means infinity. 5081 * into the user-space timespec buffer. A value of '0' means infinity.
5028 */ 5082 */
5029 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 5083 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5030 struct timespec __user *, interval) 5084 struct timespec __user *, interval)
5031 { 5085 {
5032 struct task_struct *p; 5086 struct task_struct *p;
5033 unsigned int time_slice; 5087 unsigned int time_slice;
5034 unsigned long flags; 5088 unsigned long flags;
5035 struct rq *rq; 5089 struct rq *rq;
5036 int retval; 5090 int retval;
5037 struct timespec t; 5091 struct timespec t;
5038 5092
5039 if (pid < 0) 5093 if (pid < 0)
5040 return -EINVAL; 5094 return -EINVAL;
5041 5095
5042 retval = -ESRCH; 5096 retval = -ESRCH;
5043 rcu_read_lock(); 5097 rcu_read_lock();
5044 p = find_process_by_pid(pid); 5098 p = find_process_by_pid(pid);
5045 if (!p) 5099 if (!p)
5046 goto out_unlock; 5100 goto out_unlock;
5047 5101
5048 retval = security_task_getscheduler(p); 5102 retval = security_task_getscheduler(p);
5049 if (retval) 5103 if (retval)
5050 goto out_unlock; 5104 goto out_unlock;
5051 5105
5052 rq = task_rq_lock(p, &flags); 5106 rq = task_rq_lock(p, &flags);
5053 time_slice = p->sched_class->get_rr_interval(rq, p); 5107 time_slice = p->sched_class->get_rr_interval(rq, p);
5054 task_rq_unlock(rq, &flags); 5108 task_rq_unlock(rq, &flags);
5055 5109
5056 rcu_read_unlock(); 5110 rcu_read_unlock();
5057 jiffies_to_timespec(time_slice, &t); 5111 jiffies_to_timespec(time_slice, &t);
5058 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 5112 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5059 return retval; 5113 return retval;
5060 5114
5061 out_unlock: 5115 out_unlock:
5062 rcu_read_unlock(); 5116 rcu_read_unlock();
5063 return retval; 5117 return retval;
5064 } 5118 }
5065 5119
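A matching userspace sketch for sys_sched_rr_get_interval(); pid 0 means the calling task, and per the comment above a 0/0 result is to be read as "infinity" (illustrative only, not from this patch):

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	/* pid 0 queries the calling task */
	if (sched_rr_get_interval(0, &ts) != 0) {
		perror("sched_rr_get_interval");
		return 1;
	}
	printf("reported timeslice: %ld.%09ld s\n",
	       (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}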
5066 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 5120 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5067 5121
5068 void sched_show_task(struct task_struct *p) 5122 void sched_show_task(struct task_struct *p)
5069 { 5123 {
5070 unsigned long free = 0; 5124 unsigned long free = 0;
5071 unsigned state; 5125 unsigned state;
5072 5126
5073 state = p->state ? __ffs(p->state) + 1 : 0; 5127 state = p->state ? __ffs(p->state) + 1 : 0;
5074 printk(KERN_INFO "%-13.13s %c", p->comm, 5128 printk(KERN_INFO "%-13.13s %c", p->comm,
5075 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5129 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5076 #if BITS_PER_LONG == 32 5130 #if BITS_PER_LONG == 32
5077 if (state == TASK_RUNNING) 5131 if (state == TASK_RUNNING)
5078 printk(KERN_CONT " running "); 5132 printk(KERN_CONT " running ");
5079 else 5133 else
5080 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 5134 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5081 #else 5135 #else
5082 if (state == TASK_RUNNING) 5136 if (state == TASK_RUNNING)
5083 printk(KERN_CONT " running task "); 5137 printk(KERN_CONT " running task ");
5084 else 5138 else
5085 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 5139 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5086 #endif 5140 #endif
5087 #ifdef CONFIG_DEBUG_STACK_USAGE 5141 #ifdef CONFIG_DEBUG_STACK_USAGE
5088 free = stack_not_used(p); 5142 free = stack_not_used(p);
5089 #endif 5143 #endif
5090 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 5144 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5091 task_pid_nr(p), task_pid_nr(p->real_parent), 5145 task_pid_nr(p), task_pid_nr(p->real_parent),
5092 (unsigned long)task_thread_info(p)->flags); 5146 (unsigned long)task_thread_info(p)->flags);
5093 5147
5094 show_stack(p, NULL); 5148 show_stack(p, NULL);
5095 } 5149 }
5096 5150
5097 void show_state_filter(unsigned long state_filter) 5151 void show_state_filter(unsigned long state_filter)
5098 { 5152 {
5099 struct task_struct *g, *p; 5153 struct task_struct *g, *p;
5100 5154
5101 #if BITS_PER_LONG == 32 5155 #if BITS_PER_LONG == 32
5102 printk(KERN_INFO 5156 printk(KERN_INFO
5103 " task PC stack pid father\n"); 5157 " task PC stack pid father\n");
5104 #else 5158 #else
5105 printk(KERN_INFO 5159 printk(KERN_INFO
5106 " task PC stack pid father\n"); 5160 " task PC stack pid father\n");
5107 #endif 5161 #endif
5108 read_lock(&tasklist_lock); 5162 read_lock(&tasklist_lock);
5109 do_each_thread(g, p) { 5163 do_each_thread(g, p) {
5110 /* 5164 /*
5111 * reset the NMI-timeout, listing all tasks on a slow 5165 * reset the NMI-timeout, listing all tasks on a slow
5112 * console might take a lot of time: 5166 * console might take a lot of time:
5113 */ 5167 */
5114 touch_nmi_watchdog(); 5168 touch_nmi_watchdog();
5115 if (!state_filter || (p->state & state_filter)) 5169 if (!state_filter || (p->state & state_filter))
5116 sched_show_task(p); 5170 sched_show_task(p);
5117 } while_each_thread(g, p); 5171 } while_each_thread(g, p);
5118 5172
5119 touch_all_softlockup_watchdogs(); 5173 touch_all_softlockup_watchdogs();
5120 5174
5121 #ifdef CONFIG_SCHED_DEBUG 5175 #ifdef CONFIG_SCHED_DEBUG
5122 sysrq_sched_debug_show(); 5176 sysrq_sched_debug_show();
5123 #endif 5177 #endif
5124 read_unlock(&tasklist_lock); 5178 read_unlock(&tasklist_lock);
5125 /* 5179 /*
5126 * Only show locks if all tasks are dumped: 5180 * Only show locks if all tasks are dumped:
5127 */ 5181 */
5128 if (!state_filter) 5182 if (!state_filter)
5129 debug_show_all_locks(); 5183 debug_show_all_locks();
5130 } 5184 }
5131 5185
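show_state_filter() is the workhorse behind the sysrq task dumps: show_state() in sched.h passes a 0 filter (dump everything, as sysrq-t does) and the sysrq-w handler passes TASK_UNINTERRUPTIBLE. A hedged sketch of such a built-in debug call site (the symbol is not expected to be exported to modules):

#include <linux/sched.h>

/*
 * Dump only tasks blocked in uninterruptible (D) state, the same thing
 * the sysrq-w handler does.  Illustrative built-in debug code only; the
 * output goes to the console via the printk()s in sched_show_task().
 */
static void dump_blocked_tasks(void)
{
	show_state_filter(TASK_UNINTERRUPTIBLE);
}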
5132 void __cpuinit init_idle_bootup_task(struct task_struct *idle) 5186 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
5133 { 5187 {
5134 idle->sched_class = &idle_sched_class; 5188 idle->sched_class = &idle_sched_class;
5135 } 5189 }
5136 5190
5137 /** 5191 /**
5138 * init_idle - set up an idle thread for a given CPU 5192 * init_idle - set up an idle thread for a given CPU
5139 * @idle: task in question 5193 * @idle: task in question
5140 * @cpu: cpu the idle task belongs to 5194 * @cpu: cpu the idle task belongs to
5141 * 5195 *
5142 * NOTE: this function does not set the idle thread's NEED_RESCHED 5196 * NOTE: this function does not set the idle thread's NEED_RESCHED
5143 * flag, to make booting more robust. 5197 * flag, to make booting more robust.
5144 */ 5198 */
5145 void __cpuinit init_idle(struct task_struct *idle, int cpu) 5199 void __cpuinit init_idle(struct task_struct *idle, int cpu)
5146 { 5200 {
5147 struct rq *rq = cpu_rq(cpu); 5201 struct rq *rq = cpu_rq(cpu);
5148 unsigned long flags; 5202 unsigned long flags;
5149 5203
5150 raw_spin_lock_irqsave(&rq->lock, flags); 5204 raw_spin_lock_irqsave(&rq->lock, flags);
5151 5205
5152 __sched_fork(idle); 5206 __sched_fork(idle);
5153 idle->state = TASK_RUNNING; 5207 idle->state = TASK_RUNNING;
5154 idle->se.exec_start = sched_clock(); 5208 idle->se.exec_start = sched_clock();
5155 5209
5156 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 5210 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
5157 __set_task_cpu(idle, cpu); 5211 __set_task_cpu(idle, cpu);
5158 5212
5159 rq->curr = rq->idle = idle; 5213 rq->curr = rq->idle = idle;
5160 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5214 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
5161 idle->oncpu = 1; 5215 idle->oncpu = 1;
5162 #endif 5216 #endif
5163 raw_spin_unlock_irqrestore(&rq->lock, flags); 5217 raw_spin_unlock_irqrestore(&rq->lock, flags);
5164 5218
5165 /* Set the preempt count _outside_ the spinlocks! */ 5219 /* Set the preempt count _outside_ the spinlocks! */
5166 #if defined(CONFIG_PREEMPT) 5220 #if defined(CONFIG_PREEMPT)
5167 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); 5221 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5168 #else 5222 #else
5169 task_thread_info(idle)->preempt_count = 0; 5223 task_thread_info(idle)->preempt_count = 0;
5170 #endif 5224 #endif
5171 /* 5225 /*
5172 * The idle tasks have their own, simple scheduling class: 5226 * The idle tasks have their own, simple scheduling class:
5173 */ 5227 */
5174 idle->sched_class = &idle_sched_class; 5228 idle->sched_class = &idle_sched_class;
5175 ftrace_graph_init_task(idle); 5229 ftrace_graph_init_task(idle);
5176 } 5230 }
5177 5231
5178 /* 5232 /*
5179 * In a system that switches off the HZ timer nohz_cpu_mask 5233 * In a system that switches off the HZ timer nohz_cpu_mask
5180 * indicates which cpus entered this state. This is used 5234 * indicates which cpus entered this state. This is used
5181 * in the rcu update to wait only for active cpus. For systems 5235 * in the rcu update to wait only for active cpus. For systems
5182 * which do not switch off the HZ timer nohz_cpu_mask should 5236 * which do not switch off the HZ timer nohz_cpu_mask should
5183 * always be CPU_BITS_NONE. 5237 * always be CPU_BITS_NONE.
5184 */ 5238 */
5185 cpumask_var_t nohz_cpu_mask; 5239 cpumask_var_t nohz_cpu_mask;
5186 5240
5187 /* 5241 /*
5188 * Increase the granularity value when there are more CPUs, 5242 * Increase the granularity value when there are more CPUs,
5189 * because with more CPUs the 'effective latency' as visible 5243 * because with more CPUs the 'effective latency' as visible
5190 * to users decreases. But the relationship is not linear, 5244 * to users decreases. But the relationship is not linear,
5191 * so pick a second-best guess by going with the log2 of the 5245 * so pick a second-best guess by going with the log2 of the
5192 * number of CPUs. 5246 * number of CPUs.
5193 * 5247 *
5194 * This idea comes from the SD scheduler of Con Kolivas: 5248 * This idea comes from the SD scheduler of Con Kolivas:
5195 */ 5249 */
5196 static int get_update_sysctl_factor(void) 5250 static int get_update_sysctl_factor(void)
5197 { 5251 {
5198 unsigned int cpus = min_t(int, num_online_cpus(), 8); 5252 unsigned int cpus = min_t(int, num_online_cpus(), 8);
5199 unsigned int factor; 5253 unsigned int factor;
5200 5254
5201 switch (sysctl_sched_tunable_scaling) { 5255 switch (sysctl_sched_tunable_scaling) {
5202 case SCHED_TUNABLESCALING_NONE: 5256 case SCHED_TUNABLESCALING_NONE:
5203 factor = 1; 5257 factor = 1;
5204 break; 5258 break;
5205 case SCHED_TUNABLESCALING_LINEAR: 5259 case SCHED_TUNABLESCALING_LINEAR:
5206 factor = cpus; 5260 factor = cpus;
5207 break; 5261 break;
5208 case SCHED_TUNABLESCALING_LOG: 5262 case SCHED_TUNABLESCALING_LOG:
5209 default: 5263 default:
5210 factor = 1 + ilog2(cpus); 5264 factor = 1 + ilog2(cpus);
5211 break; 5265 break;
5212 } 5266 }
5213 5267
5214 return factor; 5268 return factor;
5215 } 5269 }
5216 5270
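To make the log2 scaling concrete: with the default SCHED_TUNABLESCALING_LOG policy the factor is 1 + ilog2(min(nr_online_cpus, 8)), i.e. 1, 2, 3, 4 for 1, 2, 4 and 8+ CPUs, and each normalized tunable is multiplied by it. A userspace sketch of the same arithmetic; the 6 ms normalized latency is an assumed example value, not read from this file:

#include <stdio.h>

/* Mirrors get_update_sysctl_factor() for the LOG policy: cap at 8 CPUs, */
/* then use 1 + floor(log2(cpus)).  Standalone illustration, not kernel code. */
static unsigned int factor_log(unsigned int cpus)
{
	unsigned int ilog2 = 0;

	if (cpus > 8)
		cpus = 8;
	while (cpus >>= 1)
		ilog2++;
	return 1 + ilog2;
}

int main(void)
{
	const unsigned long normalized_latency_ns = 6000000; /* assumed 6 ms */
	unsigned int cpus;

	for (cpus = 1; cpus <= 16; cpus *= 2)
		printf("%2u cpus -> factor %u -> latency %lu ns\n", cpus,
		       factor_log(cpus), factor_log(cpus) * normalized_latency_ns);
	return 0;
}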
5217 static void update_sysctl(void) 5271 static void update_sysctl(void)
5218 { 5272 {
5219 unsigned int factor = get_update_sysctl_factor(); 5273 unsigned int factor = get_update_sysctl_factor();
5220 5274
5221 #define SET_SYSCTL(name) \ 5275 #define SET_SYSCTL(name) \
5222 (sysctl_##name = (factor) * normalized_sysctl_##name) 5276 (sysctl_##name = (factor) * normalized_sysctl_##name)
5223 SET_SYSCTL(sched_min_granularity); 5277 SET_SYSCTL(sched_min_granularity);
5224 SET_SYSCTL(sched_latency); 5278 SET_SYSCTL(sched_latency);
5225 SET_SYSCTL(sched_wakeup_granularity); 5279 SET_SYSCTL(sched_wakeup_granularity);
5226 SET_SYSCTL(sched_shares_ratelimit); 5280 SET_SYSCTL(sched_shares_ratelimit);
5227 #undef SET_SYSCTL 5281 #undef SET_SYSCTL
5228 } 5282 }
5229 5283
5230 static inline void sched_init_granularity(void) 5284 static inline void sched_init_granularity(void)
5231 { 5285 {
5232 update_sysctl(); 5286 update_sysctl();
5233 } 5287 }
5234 5288
5235 #ifdef CONFIG_SMP 5289 #ifdef CONFIG_SMP
5236 /* 5290 /*
5237 * This is how migration works: 5291 * This is how migration works:
5238 * 5292 *
5239 * 1) we queue a struct migration_req structure in the source CPU's 5293 * 1) we queue a struct migration_req structure in the source CPU's
5240 * runqueue and wake up that CPU's migration thread. 5294 * runqueue and wake up that CPU's migration thread.
5241 * 2) we down() the locked semaphore => thread blocks. 5295 * 2) we down() the locked semaphore => thread blocks.
5242 * 3) migration thread wakes up (implicitly it forces the migrated 5296 * 3) migration thread wakes up (implicitly it forces the migrated
5243 * thread off the CPU) 5297 * thread off the CPU)
5244 * 4) it gets the migration request and checks whether the migrated 5298 * 4) it gets the migration request and checks whether the migrated
5245 * task is still in the wrong runqueue. 5299 * task is still in the wrong runqueue.
5246 * 5) if it's in the wrong runqueue then the migration thread removes 5300 * 5) if it's in the wrong runqueue then the migration thread removes
5247 * it and puts it into the right queue. 5301 * it and puts it into the right queue.
5248 * 6) migration thread up()s the semaphore. 5302 * 6) migration thread up()s the semaphore.
5249 * 7) we wake up and the migration is done. 5303 * 7) we wake up and the migration is done.
5250 */ 5304 */
5251 5305
5252 /* 5306 /*
5253 * Change a given task's CPU affinity. Migrate the thread to a 5307 * Change a given task's CPU affinity. Migrate the thread to a
5254 * proper CPU and schedule it away if the CPU it's executing on 5308 * proper CPU and schedule it away if the CPU it's executing on
5255 * is removed from the allowed bitmask. 5309 * is removed from the allowed bitmask.
5256 * 5310 *
5257 * NOTE: the caller must have a valid reference to the task, the 5311 * NOTE: the caller must have a valid reference to the task, the
5258 * task must not exit() & deallocate itself prematurely. The 5312 * task must not exit() & deallocate itself prematurely. The
5259 * call is not atomic; no spinlocks may be held. 5313 * call is not atomic; no spinlocks may be held.
5260 */ 5314 */
5261 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 5315 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5262 { 5316 {
5263 struct migration_req req; 5317 struct migration_req req;
5264 unsigned long flags; 5318 unsigned long flags;
5265 struct rq *rq; 5319 struct rq *rq;
5266 int ret = 0; 5320 int ret = 0;
5267 5321
5268 /* 5322 /*
5269 * Serialize against TASK_WAKING so that ttwu() and wake_up_new_task() can 5323 * Serialize against TASK_WAKING so that ttwu() and wake_up_new_task() can
5270 * drop the rq->lock and still rely on ->cpus_allowed. 5324 * drop the rq->lock and still rely on ->cpus_allowed.
5271 */ 5325 */
5272 again: 5326 again:
5273 while (task_is_waking(p)) 5327 while (task_is_waking(p))
5274 cpu_relax(); 5328 cpu_relax();
5275 rq = task_rq_lock(p, &flags); 5329 rq = task_rq_lock(p, &flags);
5276 if (task_is_waking(p)) { 5330 if (task_is_waking(p)) {
5277 task_rq_unlock(rq, &flags); 5331 task_rq_unlock(rq, &flags);
5278 goto again; 5332 goto again;
5279 } 5333 }
5280 5334
5281 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5335 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5282 ret = -EINVAL; 5336 ret = -EINVAL;
5283 goto out; 5337 goto out;
5284 } 5338 }
5285 5339
5286 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 5340 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5287 !cpumask_equal(&p->cpus_allowed, new_mask))) { 5341 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5288 ret = -EINVAL; 5342 ret = -EINVAL;
5289 goto out; 5343 goto out;
5290 } 5344 }
5291 5345
5292 if (p->sched_class->set_cpus_allowed) 5346 if (p->sched_class->set_cpus_allowed)
5293 p->sched_class->set_cpus_allowed(p, new_mask); 5347 p->sched_class->set_cpus_allowed(p, new_mask);
5294 else { 5348 else {
5295 cpumask_copy(&p->cpus_allowed, new_mask); 5349 cpumask_copy(&p->cpus_allowed, new_mask);
5296 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 5350 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5297 } 5351 }
5298 5352
5299 /* Can the task run on the task's current CPU? If so, we're done */ 5353 /* Can the task run on the task's current CPU? If so, we're done */
5300 if (cpumask_test_cpu(task_cpu(p), new_mask)) 5354 if (cpumask_test_cpu(task_cpu(p), new_mask))
5301 goto out; 5355 goto out;
5302 5356
5303 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { 5357 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
5304 /* Need help from migration thread: drop lock and wait. */ 5358 /* Need help from migration thread: drop lock and wait. */
5305 struct task_struct *mt = rq->migration_thread; 5359 struct task_struct *mt = rq->migration_thread;
5306 5360
5307 get_task_struct(mt); 5361 get_task_struct(mt);
5308 task_rq_unlock(rq, &flags); 5362 task_rq_unlock(rq, &flags);
5309 wake_up_process(mt); 5363 wake_up_process(mt);
5310 put_task_struct(mt); 5364 put_task_struct(mt);
5311 wait_for_completion(&req.done); 5365 wait_for_completion(&req.done);
5312 tlb_migrate_finish(p->mm); 5366 tlb_migrate_finish(p->mm);
5313 return 0; 5367 return 0;
5314 } 5368 }
5315 out: 5369 out:
5316 task_rq_unlock(rq, &flags); 5370 task_rq_unlock(rq, &flags);
5317 5371
5318 return ret; 5372 return ret;
5319 } 5373 }
5320 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 5374 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
5321 5375
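set_cpus_allowed_ptr() is the exported entry point other kernel code uses to pin a task; a hedged sketch of a typical caller (for binding a kthread before it runs, kthread_bind() as used later in this file is the preferred helper, so this is illustration only):

#include <linux/cpumask.h>
#include <linux/sched.h>

/*
 * Illustrative only: restrict @tsk to CPU 0.  Fails with -EINVAL if CPU 0
 * is not in cpu_active_mask, exactly as the checks above show.
 */
static int pin_task_to_cpu0(struct task_struct *tsk)
{
	return set_cpus_allowed_ptr(tsk, cpumask_of(0));
}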
5322 /* 5376 /*
5323 * Move (not current) task off this cpu, onto dest cpu. We're doing 5377 * Move (not current) task off this cpu, onto dest cpu. We're doing
5324 * this because either it can't run here any more (set_cpus_allowed() 5378 * this because either it can't run here any more (set_cpus_allowed()
5325 * away from this CPU, or CPU going down), or because we're 5379 * away from this CPU, or CPU going down), or because we're
5326 * attempting to rebalance this task on exec (sched_exec). 5380 * attempting to rebalance this task on exec (sched_exec).
5327 * 5381 *
5328 * So we race with normal scheduler movements, but that's OK, as long 5382 * So we race with normal scheduler movements, but that's OK, as long
5329 * as the task is no longer on this CPU. 5383 * as the task is no longer on this CPU.
5330 * 5384 *
5331 * Returns non-zero if task was successfully migrated. 5385 * Returns non-zero if task was successfully migrated.
5332 */ 5386 */
5333 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 5387 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5334 { 5388 {
5335 struct rq *rq_dest, *rq_src; 5389 struct rq *rq_dest, *rq_src;
5336 int ret = 0; 5390 int ret = 0;
5337 5391
5338 if (unlikely(!cpu_active(dest_cpu))) 5392 if (unlikely(!cpu_active(dest_cpu)))
5339 return ret; 5393 return ret;
5340 5394
5341 rq_src = cpu_rq(src_cpu); 5395 rq_src = cpu_rq(src_cpu);
5342 rq_dest = cpu_rq(dest_cpu); 5396 rq_dest = cpu_rq(dest_cpu);
5343 5397
5344 double_rq_lock(rq_src, rq_dest); 5398 double_rq_lock(rq_src, rq_dest);
5345 /* Already moved. */ 5399 /* Already moved. */
5346 if (task_cpu(p) != src_cpu) 5400 if (task_cpu(p) != src_cpu)
5347 goto done; 5401 goto done;
5348 /* Affinity changed (again). */ 5402 /* Affinity changed (again). */
5349 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 5403 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
5350 goto fail; 5404 goto fail;
5351 5405
5352 /* 5406 /*
5353 * If we're not on a rq, the next wake-up will ensure we're 5407 * If we're not on a rq, the next wake-up will ensure we're
5354 * placed properly. 5408 * placed properly.
5355 */ 5409 */
5356 if (p->se.on_rq) { 5410 if (p->se.on_rq) {
5357 deactivate_task(rq_src, p, 0); 5411 deactivate_task(rq_src, p, 0);
5358 set_task_cpu(p, dest_cpu); 5412 set_task_cpu(p, dest_cpu);
5359 activate_task(rq_dest, p, 0); 5413 activate_task(rq_dest, p, 0);
5360 check_preempt_curr(rq_dest, p, 0); 5414 check_preempt_curr(rq_dest, p, 0);
5361 } 5415 }
5362 done: 5416 done:
5363 ret = 1; 5417 ret = 1;
5364 fail: 5418 fail:
5365 double_rq_unlock(rq_src, rq_dest); 5419 double_rq_unlock(rq_src, rq_dest);
5366 return ret; 5420 return ret;
5367 } 5421 }
5368 5422
5369 #define RCU_MIGRATION_IDLE 0 5423 #define RCU_MIGRATION_IDLE 0
5370 #define RCU_MIGRATION_NEED_QS 1 5424 #define RCU_MIGRATION_NEED_QS 1
5371 #define RCU_MIGRATION_GOT_QS 2 5425 #define RCU_MIGRATION_GOT_QS 2
5372 #define RCU_MIGRATION_MUST_SYNC 3 5426 #define RCU_MIGRATION_MUST_SYNC 3
5373 5427
5374 /* 5428 /*
5375 * migration_thread - this is a highprio system thread that performs 5429 * migration_thread - this is a highprio system thread that performs
5376 * thread migration by bumping thread off CPU then 'pushing' onto 5430 * thread migration by bumping thread off CPU then 'pushing' onto
5377 * another runqueue. 5431 * another runqueue.
5378 */ 5432 */
5379 static int migration_thread(void *data) 5433 static int migration_thread(void *data)
5380 { 5434 {
5381 int badcpu; 5435 int badcpu;
5382 int cpu = (long)data; 5436 int cpu = (long)data;
5383 struct rq *rq; 5437 struct rq *rq;
5384 5438
5385 rq = cpu_rq(cpu); 5439 rq = cpu_rq(cpu);
5386 BUG_ON(rq->migration_thread != current); 5440 BUG_ON(rq->migration_thread != current);
5387 5441
5388 set_current_state(TASK_INTERRUPTIBLE); 5442 set_current_state(TASK_INTERRUPTIBLE);
5389 while (!kthread_should_stop()) { 5443 while (!kthread_should_stop()) {
5390 struct migration_req *req; 5444 struct migration_req *req;
5391 struct list_head *head; 5445 struct list_head *head;
5392 5446
5393 raw_spin_lock_irq(&rq->lock); 5447 raw_spin_lock_irq(&rq->lock);
5394 5448
5395 if (cpu_is_offline(cpu)) { 5449 if (cpu_is_offline(cpu)) {
5396 raw_spin_unlock_irq(&rq->lock); 5450 raw_spin_unlock_irq(&rq->lock);
5397 break; 5451 break;
5398 } 5452 }
5399 5453
5400 if (rq->active_balance) { 5454 if (rq->active_balance) {
5401 active_load_balance(rq, cpu); 5455 active_load_balance(rq, cpu);
5402 rq->active_balance = 0; 5456 rq->active_balance = 0;
5403 } 5457 }
5404 5458
5405 head = &rq->migration_queue; 5459 head = &rq->migration_queue;
5406 5460
5407 if (list_empty(head)) { 5461 if (list_empty(head)) {
5408 raw_spin_unlock_irq(&rq->lock); 5462 raw_spin_unlock_irq(&rq->lock);
5409 schedule(); 5463 schedule();
5410 set_current_state(TASK_INTERRUPTIBLE); 5464 set_current_state(TASK_INTERRUPTIBLE);
5411 continue; 5465 continue;
5412 } 5466 }
5413 req = list_entry(head->next, struct migration_req, list); 5467 req = list_entry(head->next, struct migration_req, list);
5414 list_del_init(head->next); 5468 list_del_init(head->next);
5415 5469
5416 if (req->task != NULL) { 5470 if (req->task != NULL) {
5417 raw_spin_unlock(&rq->lock); 5471 raw_spin_unlock(&rq->lock);
5418 __migrate_task(req->task, cpu, req->dest_cpu); 5472 __migrate_task(req->task, cpu, req->dest_cpu);
5419 } else if (likely(cpu == (badcpu = smp_processor_id()))) { 5473 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
5420 req->dest_cpu = RCU_MIGRATION_GOT_QS; 5474 req->dest_cpu = RCU_MIGRATION_GOT_QS;
5421 raw_spin_unlock(&rq->lock); 5475 raw_spin_unlock(&rq->lock);
5422 } else { 5476 } else {
5423 req->dest_cpu = RCU_MIGRATION_MUST_SYNC; 5477 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
5424 raw_spin_unlock(&rq->lock); 5478 raw_spin_unlock(&rq->lock);
5425 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); 5479 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
5426 } 5480 }
5427 local_irq_enable(); 5481 local_irq_enable();
5428 5482
5429 complete(&req->done); 5483 complete(&req->done);
5430 } 5484 }
5431 __set_current_state(TASK_RUNNING); 5485 __set_current_state(TASK_RUNNING);
5432 5486
5433 return 0; 5487 return 0;
5434 } 5488 }
5435 5489
5436 #ifdef CONFIG_HOTPLUG_CPU 5490 #ifdef CONFIG_HOTPLUG_CPU
5437 /* 5491 /*
5438 * Figure out where task on dead CPU should go, use force if necessary. 5492 * Figure out where task on dead CPU should go, use force if necessary.
5439 */ 5493 */
5440 void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5494 void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5441 { 5495 {
5442 struct rq *rq = cpu_rq(dead_cpu); 5496 struct rq *rq = cpu_rq(dead_cpu);
5443 int needs_cpu, uninitialized_var(dest_cpu); 5497 int needs_cpu, uninitialized_var(dest_cpu);
5444 unsigned long flags; 5498 unsigned long flags;
5445 5499
5446 local_irq_save(flags); 5500 local_irq_save(flags);
5447 5501
5448 raw_spin_lock(&rq->lock); 5502 raw_spin_lock(&rq->lock);
5449 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); 5503 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
5450 if (needs_cpu) 5504 if (needs_cpu)
5451 dest_cpu = select_fallback_rq(dead_cpu, p); 5505 dest_cpu = select_fallback_rq(dead_cpu, p);
5452 raw_spin_unlock(&rq->lock); 5506 raw_spin_unlock(&rq->lock);
5453 /* 5507 /*
5454 * It can only fail if we race with set_cpus_allowed(), 5508 * It can only fail if we race with set_cpus_allowed(),
5455 * in which case the racer should migrate the task anyway. 5509 * in which case the racer should migrate the task anyway.
5456 */ 5510 */
5457 if (needs_cpu) 5511 if (needs_cpu)
5458 __migrate_task(p, dead_cpu, dest_cpu); 5512 __migrate_task(p, dead_cpu, dest_cpu);
5459 local_irq_restore(flags); 5513 local_irq_restore(flags);
5460 } 5514 }
5461 5515
5462 /* 5516 /*
5463 * While a dead CPU has no uninterruptible tasks queued at this point, 5517 * While a dead CPU has no uninterruptible tasks queued at this point,
5464 * it might still have a nonzero ->nr_uninterruptible counter, because 5518 * it might still have a nonzero ->nr_uninterruptible counter, because
5465 * for performance reasons the counter is not strictly tracking tasks to 5519 * for performance reasons the counter is not strictly tracking tasks to
5466 * their home CPUs. So we just add the counter to another CPU's counter, 5520 * their home CPUs. So we just add the counter to another CPU's counter,
5467 * to keep the global sum constant after CPU-down: 5521 * to keep the global sum constant after CPU-down:
5468 */ 5522 */
5469 static void migrate_nr_uninterruptible(struct rq *rq_src) 5523 static void migrate_nr_uninterruptible(struct rq *rq_src)
5470 { 5524 {
5471 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 5525 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5472 unsigned long flags; 5526 unsigned long flags;
5473 5527
5474 local_irq_save(flags); 5528 local_irq_save(flags);
5475 double_rq_lock(rq_src, rq_dest); 5529 double_rq_lock(rq_src, rq_dest);
5476 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5530 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5477 rq_src->nr_uninterruptible = 0; 5531 rq_src->nr_uninterruptible = 0;
5478 double_rq_unlock(rq_src, rq_dest); 5532 double_rq_unlock(rq_src, rq_dest);
5479 local_irq_restore(flags); 5533 local_irq_restore(flags);
5480 } 5534 }
5481 5535
5482 /* Run through task list and migrate tasks from the dead cpu. */ 5536 /* Run through task list and migrate tasks from the dead cpu. */
5483 static void migrate_live_tasks(int src_cpu) 5537 static void migrate_live_tasks(int src_cpu)
5484 { 5538 {
5485 struct task_struct *p, *t; 5539 struct task_struct *p, *t;
5486 5540
5487 read_lock(&tasklist_lock); 5541 read_lock(&tasklist_lock);
5488 5542
5489 do_each_thread(t, p) { 5543 do_each_thread(t, p) {
5490 if (p == current) 5544 if (p == current)
5491 continue; 5545 continue;
5492 5546
5493 if (task_cpu(p) == src_cpu) 5547 if (task_cpu(p) == src_cpu)
5494 move_task_off_dead_cpu(src_cpu, p); 5548 move_task_off_dead_cpu(src_cpu, p);
5495 } while_each_thread(t, p); 5549 } while_each_thread(t, p);
5496 5550
5497 read_unlock(&tasklist_lock); 5551 read_unlock(&tasklist_lock);
5498 } 5552 }
5499 5553
5500 /* 5554 /*
5501 * Schedules idle task to be the next runnable task on current CPU. 5555 * Schedules idle task to be the next runnable task on current CPU.
5502 * It does so by boosting its priority to the highest possible value. 5556 * It does so by boosting its priority to the highest possible value.
5503 * Used by CPU offline code. 5557 * Used by CPU offline code.
5504 */ 5558 */
5505 void sched_idle_next(void) 5559 void sched_idle_next(void)
5506 { 5560 {
5507 int this_cpu = smp_processor_id(); 5561 int this_cpu = smp_processor_id();
5508 struct rq *rq = cpu_rq(this_cpu); 5562 struct rq *rq = cpu_rq(this_cpu);
5509 struct task_struct *p = rq->idle; 5563 struct task_struct *p = rq->idle;
5510 unsigned long flags; 5564 unsigned long flags;
5511 5565
5512 /* cpu has to be offline */ 5566 /* cpu has to be offline */
5513 BUG_ON(cpu_online(this_cpu)); 5567 BUG_ON(cpu_online(this_cpu));
5514 5568
5515 /* 5569 /*
5516 * Strictly not necessary since rest of the CPUs are stopped by now 5570 * Strictly not necessary since rest of the CPUs are stopped by now
5517 * and interrupts disabled on the current cpu. 5571 * and interrupts disabled on the current cpu.
5518 */ 5572 */
5519 raw_spin_lock_irqsave(&rq->lock, flags); 5573 raw_spin_lock_irqsave(&rq->lock, flags);
5520 5574
5521 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5575 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5522 5576
5523 activate_task(rq, p, 0); 5577 activate_task(rq, p, 0);
5524 5578
5525 raw_spin_unlock_irqrestore(&rq->lock, flags); 5579 raw_spin_unlock_irqrestore(&rq->lock, flags);
5526 } 5580 }
5527 5581
5528 /* 5582 /*
5529 * Ensures that the idle task is using init_mm right before its cpu goes 5583 * Ensures that the idle task is using init_mm right before its cpu goes
5530 * offline. 5584 * offline.
5531 */ 5585 */
5532 void idle_task_exit(void) 5586 void idle_task_exit(void)
5533 { 5587 {
5534 struct mm_struct *mm = current->active_mm; 5588 struct mm_struct *mm = current->active_mm;
5535 5589
5536 BUG_ON(cpu_online(smp_processor_id())); 5590 BUG_ON(cpu_online(smp_processor_id()));
5537 5591
5538 if (mm != &init_mm) 5592 if (mm != &init_mm)
5539 switch_mm(mm, &init_mm, current); 5593 switch_mm(mm, &init_mm, current);
5540 mmdrop(mm); 5594 mmdrop(mm);
5541 } 5595 }
5542 5596
5543 /* called under rq->lock with disabled interrupts */ 5597 /* called under rq->lock with disabled interrupts */
5544 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) 5598 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5545 { 5599 {
5546 struct rq *rq = cpu_rq(dead_cpu); 5600 struct rq *rq = cpu_rq(dead_cpu);
5547 5601
5548 /* Must be exiting, otherwise would be on tasklist. */ 5602 /* Must be exiting, otherwise would be on tasklist. */
5549 BUG_ON(!p->exit_state); 5603 BUG_ON(!p->exit_state);
5550 5604
5551 /* Cannot have done final schedule yet: would have vanished. */ 5605 /* Cannot have done final schedule yet: would have vanished. */
5552 BUG_ON(p->state == TASK_DEAD); 5606 BUG_ON(p->state == TASK_DEAD);
5553 5607
5554 get_task_struct(p); 5608 get_task_struct(p);
5555 5609
5556 /* 5610 /*
5557 * Drop lock around migration; if someone else moves it, 5611 * Drop lock around migration; if someone else moves it,
5558 * that's OK. No task can be added to this CPU, so iteration is 5612 * that's OK. No task can be added to this CPU, so iteration is
5559 * fine. 5613 * fine.
5560 */ 5614 */
5561 raw_spin_unlock_irq(&rq->lock); 5615 raw_spin_unlock_irq(&rq->lock);
5562 move_task_off_dead_cpu(dead_cpu, p); 5616 move_task_off_dead_cpu(dead_cpu, p);
5563 raw_spin_lock_irq(&rq->lock); 5617 raw_spin_lock_irq(&rq->lock);
5564 5618
5565 put_task_struct(p); 5619 put_task_struct(p);
5566 } 5620 }
5567 5621
5568 /* release_task() removes task from tasklist, so we won't find dead tasks. */ 5622 /* release_task() removes task from tasklist, so we won't find dead tasks. */
5569 static void migrate_dead_tasks(unsigned int dead_cpu) 5623 static void migrate_dead_tasks(unsigned int dead_cpu)
5570 { 5624 {
5571 struct rq *rq = cpu_rq(dead_cpu); 5625 struct rq *rq = cpu_rq(dead_cpu);
5572 struct task_struct *next; 5626 struct task_struct *next;
5573 5627
5574 for ( ; ; ) { 5628 for ( ; ; ) {
5575 if (!rq->nr_running) 5629 if (!rq->nr_running)
5576 break; 5630 break;
5577 next = pick_next_task(rq); 5631 next = pick_next_task(rq);
5578 if (!next) 5632 if (!next)
5579 break; 5633 break;
5580 next->sched_class->put_prev_task(rq, next); 5634 next->sched_class->put_prev_task(rq, next);
5581 migrate_dead(dead_cpu, next); 5635 migrate_dead(dead_cpu, next);
5582 5636
5583 } 5637 }
5584 } 5638 }
5585 5639
5586 /* 5640 /*
5587 * remove the tasks which were accounted by rq from calc_load_tasks. 5641 * remove the tasks which were accounted by rq from calc_load_tasks.
5588 */ 5642 */
5589 static void calc_global_load_remove(struct rq *rq) 5643 static void calc_global_load_remove(struct rq *rq)
5590 { 5644 {
5591 atomic_long_sub(rq->calc_load_active, &calc_load_tasks); 5645 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5592 rq->calc_load_active = 0; 5646 rq->calc_load_active = 0;
5593 } 5647 }
5594 #endif /* CONFIG_HOTPLUG_CPU */ 5648 #endif /* CONFIG_HOTPLUG_CPU */
5595 5649
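calc_global_load_remove() keeps calc_load_tasks consistent over CPU-down; that counter is what the global load average samples every LOAD_FREQ interval, and skewed samples of it are exactly what this commit cures. A hedged userspace re-creation of the fixed-point averaging step, using the long-standing FSHIFT/EXP_1 constants; the active-task count of 3 is an arbitrary example:

#include <stdio.h>

#define FSHIFT	11			/* bits of fractional precision */
#define FIXED_1	(1 << FSHIFT)		/* 1.0 in fixed point */
#define EXP_1	1884			/* 1/exp(5s/1min) in fixed point */

/* Same shape as the kernel's calc_load(): new = old*e + active*(1 - e). */
static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun = 0;		/* 1-minute average, fixed point */
	unsigned long active = 3 * FIXED_1;	/* assume 3 runnable tasks */
	int tick;

	for (tick = 1; tick <= 12; tick++) {	/* 12 x 5s = one minute */
		avenrun = calc_load(avenrun, EXP_1, active);
		printf("sample %2d: load %lu.%02lu\n", tick,
		       avenrun >> FSHIFT,
		       ((avenrun & (FIXED_1 - 1)) * 100) >> FSHIFT);
	}
	return 0;
}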
5596 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5650 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
5597 5651
5598 static struct ctl_table sd_ctl_dir[] = { 5652 static struct ctl_table sd_ctl_dir[] = {
5599 { 5653 {
5600 .procname = "sched_domain", 5654 .procname = "sched_domain",
5601 .mode = 0555, 5655 .mode = 0555,
5602 }, 5656 },
5603 {} 5657 {}
5604 }; 5658 };
5605 5659
5606 static struct ctl_table sd_ctl_root[] = { 5660 static struct ctl_table sd_ctl_root[] = {
5607 { 5661 {
5608 .procname = "kernel", 5662 .procname = "kernel",
5609 .mode = 0555, 5663 .mode = 0555,
5610 .child = sd_ctl_dir, 5664 .child = sd_ctl_dir,
5611 }, 5665 },
5612 {} 5666 {}
5613 }; 5667 };
5614 5668
5615 static struct ctl_table *sd_alloc_ctl_entry(int n) 5669 static struct ctl_table *sd_alloc_ctl_entry(int n)
5616 { 5670 {
5617 struct ctl_table *entry = 5671 struct ctl_table *entry =
5618 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 5672 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
5619 5673
5620 return entry; 5674 return entry;
5621 } 5675 }
5622 5676
5623 static void sd_free_ctl_entry(struct ctl_table **tablep) 5677 static void sd_free_ctl_entry(struct ctl_table **tablep)
5624 { 5678 {
5625 struct ctl_table *entry; 5679 struct ctl_table *entry;
5626 5680
5627 /* 5681 /*
5628 * In the intermediate directories, both the child directory and 5682 * In the intermediate directories, both the child directory and
5629 * procname are dynamically allocated and could fail but the mode 5683 * procname are dynamically allocated and could fail but the mode
5630 * will always be set. In the lowest directory the names are 5684 * will always be set. In the lowest directory the names are
5631 * static strings and all have proc handlers. 5685 * static strings and all have proc handlers.
5632 */ 5686 */
5633 for (entry = *tablep; entry->mode; entry++) { 5687 for (entry = *tablep; entry->mode; entry++) {
5634 if (entry->child) 5688 if (entry->child)
5635 sd_free_ctl_entry(&entry->child); 5689 sd_free_ctl_entry(&entry->child);
5636 if (entry->proc_handler == NULL) 5690 if (entry->proc_handler == NULL)
5637 kfree(entry->procname); 5691 kfree(entry->procname);
5638 } 5692 }
5639 5693
5640 kfree(*tablep); 5694 kfree(*tablep);
5641 *tablep = NULL; 5695 *tablep = NULL;
5642 } 5696 }
5643 5697
5644 static void 5698 static void
5645 set_table_entry(struct ctl_table *entry, 5699 set_table_entry(struct ctl_table *entry,
5646 const char *procname, void *data, int maxlen, 5700 const char *procname, void *data, int maxlen,
5647 mode_t mode, proc_handler *proc_handler) 5701 mode_t mode, proc_handler *proc_handler)
5648 { 5702 {
5649 entry->procname = procname; 5703 entry->procname = procname;
5650 entry->data = data; 5704 entry->data = data;
5651 entry->maxlen = maxlen; 5705 entry->maxlen = maxlen;
5652 entry->mode = mode; 5706 entry->mode = mode;
5653 entry->proc_handler = proc_handler; 5707 entry->proc_handler = proc_handler;
5654 } 5708 }
5655 5709
5656 static struct ctl_table * 5710 static struct ctl_table *
5657 sd_alloc_ctl_domain_table(struct sched_domain *sd) 5711 sd_alloc_ctl_domain_table(struct sched_domain *sd)
5658 { 5712 {
5659 struct ctl_table *table = sd_alloc_ctl_entry(13); 5713 struct ctl_table *table = sd_alloc_ctl_entry(13);
5660 5714
5661 if (table == NULL) 5715 if (table == NULL)
5662 return NULL; 5716 return NULL;
5663 5717
5664 set_table_entry(&table[0], "min_interval", &sd->min_interval, 5718 set_table_entry(&table[0], "min_interval", &sd->min_interval,
5665 sizeof(long), 0644, proc_doulongvec_minmax); 5719 sizeof(long), 0644, proc_doulongvec_minmax);
5666 set_table_entry(&table[1], "max_interval", &sd->max_interval, 5720 set_table_entry(&table[1], "max_interval", &sd->max_interval,
5667 sizeof(long), 0644, proc_doulongvec_minmax); 5721 sizeof(long), 0644, proc_doulongvec_minmax);
5668 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 5722 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
5669 sizeof(int), 0644, proc_dointvec_minmax); 5723 sizeof(int), 0644, proc_dointvec_minmax);
5670 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 5724 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
5671 sizeof(int), 0644, proc_dointvec_minmax); 5725 sizeof(int), 0644, proc_dointvec_minmax);
5672 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 5726 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
5673 sizeof(int), 0644, proc_dointvec_minmax); 5727 sizeof(int), 0644, proc_dointvec_minmax);
5674 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 5728 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
5675 sizeof(int), 0644, proc_dointvec_minmax); 5729 sizeof(int), 0644, proc_dointvec_minmax);
5676 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 5730 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
5677 sizeof(int), 0644, proc_dointvec_minmax); 5731 sizeof(int), 0644, proc_dointvec_minmax);
5678 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 5732 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
5679 sizeof(int), 0644, proc_dointvec_minmax); 5733 sizeof(int), 0644, proc_dointvec_minmax);
5680 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 5734 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
5681 sizeof(int), 0644, proc_dointvec_minmax); 5735 sizeof(int), 0644, proc_dointvec_minmax);
5682 set_table_entry(&table[9], "cache_nice_tries", 5736 set_table_entry(&table[9], "cache_nice_tries",
5683 &sd->cache_nice_tries, 5737 &sd->cache_nice_tries,
5684 sizeof(int), 0644, proc_dointvec_minmax); 5738 sizeof(int), 0644, proc_dointvec_minmax);
5685 set_table_entry(&table[10], "flags", &sd->flags, 5739 set_table_entry(&table[10], "flags", &sd->flags,
5686 sizeof(int), 0644, proc_dointvec_minmax); 5740 sizeof(int), 0644, proc_dointvec_minmax);
5687 set_table_entry(&table[11], "name", sd->name, 5741 set_table_entry(&table[11], "name", sd->name,
5688 CORENAME_MAX_SIZE, 0444, proc_dostring); 5742 CORENAME_MAX_SIZE, 0444, proc_dostring);
5689 /* &table[12] is terminator */ 5743 /* &table[12] is terminator */
5690 5744
5691 return table; 5745 return table;
5692 } 5746 }
5693 5747
5694 static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 5748 static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5695 { 5749 {
5696 struct ctl_table *entry, *table; 5750 struct ctl_table *entry, *table;
5697 struct sched_domain *sd; 5751 struct sched_domain *sd;
5698 int domain_num = 0, i; 5752 int domain_num = 0, i;
5699 char buf[32]; 5753 char buf[32];
5700 5754
5701 for_each_domain(cpu, sd) 5755 for_each_domain(cpu, sd)
5702 domain_num++; 5756 domain_num++;
5703 entry = table = sd_alloc_ctl_entry(domain_num + 1); 5757 entry = table = sd_alloc_ctl_entry(domain_num + 1);
5704 if (table == NULL) 5758 if (table == NULL)
5705 return NULL; 5759 return NULL;
5706 5760
5707 i = 0; 5761 i = 0;
5708 for_each_domain(cpu, sd) { 5762 for_each_domain(cpu, sd) {
5709 snprintf(buf, 32, "domain%d", i); 5763 snprintf(buf, 32, "domain%d", i);
5710 entry->procname = kstrdup(buf, GFP_KERNEL); 5764 entry->procname = kstrdup(buf, GFP_KERNEL);
5711 entry->mode = 0555; 5765 entry->mode = 0555;
5712 entry->child = sd_alloc_ctl_domain_table(sd); 5766 entry->child = sd_alloc_ctl_domain_table(sd);
5713 entry++; 5767 entry++;
5714 i++; 5768 i++;
5715 } 5769 }
5716 return table; 5770 return table;
5717 } 5771 }
5718 5772
5719 static struct ctl_table_header *sd_sysctl_header; 5773 static struct ctl_table_header *sd_sysctl_header;
5720 static void register_sched_domain_sysctl(void) 5774 static void register_sched_domain_sysctl(void)
5721 { 5775 {
5722 int i, cpu_num = num_possible_cpus(); 5776 int i, cpu_num = num_possible_cpus();
5723 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 5777 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5724 char buf[32]; 5778 char buf[32];
5725 5779
5726 WARN_ON(sd_ctl_dir[0].child); 5780 WARN_ON(sd_ctl_dir[0].child);
5727 sd_ctl_dir[0].child = entry; 5781 sd_ctl_dir[0].child = entry;
5728 5782
5729 if (entry == NULL) 5783 if (entry == NULL)
5730 return; 5784 return;
5731 5785
5732 for_each_possible_cpu(i) { 5786 for_each_possible_cpu(i) {
5733 snprintf(buf, 32, "cpu%d", i); 5787 snprintf(buf, 32, "cpu%d", i);
5734 entry->procname = kstrdup(buf, GFP_KERNEL); 5788 entry->procname = kstrdup(buf, GFP_KERNEL);
5735 entry->mode = 0555; 5789 entry->mode = 0555;
5736 entry->child = sd_alloc_ctl_cpu_table(i); 5790 entry->child = sd_alloc_ctl_cpu_table(i);
5737 entry++; 5791 entry++;
5738 } 5792 }
5739 5793
5740 WARN_ON(sd_sysctl_header); 5794 WARN_ON(sd_sysctl_header);
5741 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 5795 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5742 } 5796 }
5743 5797
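With CONFIG_SCHED_DEBUG and CONFIG_SYSCTL the tables registered above appear as /proc/sys/kernel/sched_domain/cpuN/domainM/{min_interval,...,flags,name}. A hedged userspace snippet that reads one entry (path assumed, present only on such configs):

#include <stdio.h>

int main(void)
{
	/* Assumed path; only exists once register_sched_domain_sysctl() ran. */
	const char *path = "/proc/sys/kernel/sched_domain/cpu0/domain0/name";
	char buf[64];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("cpu0 domain0 name: %s", buf);
	fclose(f);
	return 0;
}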
5744 /* may be called multiple times per register */ 5798 /* may be called multiple times per register */
5745 static void unregister_sched_domain_sysctl(void) 5799 static void unregister_sched_domain_sysctl(void)
5746 { 5800 {
5747 if (sd_sysctl_header) 5801 if (sd_sysctl_header)
5748 unregister_sysctl_table(sd_sysctl_header); 5802 unregister_sysctl_table(sd_sysctl_header);
5749 sd_sysctl_header = NULL; 5803 sd_sysctl_header = NULL;
5750 if (sd_ctl_dir[0].child) 5804 if (sd_ctl_dir[0].child)
5751 sd_free_ctl_entry(&sd_ctl_dir[0].child); 5805 sd_free_ctl_entry(&sd_ctl_dir[0].child);
5752 } 5806 }
5753 #else 5807 #else
5754 static void register_sched_domain_sysctl(void) 5808 static void register_sched_domain_sysctl(void)
5755 { 5809 {
5756 } 5810 }
5757 static void unregister_sched_domain_sysctl(void) 5811 static void unregister_sched_domain_sysctl(void)
5758 { 5812 {
5759 } 5813 }
5760 #endif 5814 #endif
5761 5815
5762 static void set_rq_online(struct rq *rq) 5816 static void set_rq_online(struct rq *rq)
5763 { 5817 {
5764 if (!rq->online) { 5818 if (!rq->online) {
5765 const struct sched_class *class; 5819 const struct sched_class *class;
5766 5820
5767 cpumask_set_cpu(rq->cpu, rq->rd->online); 5821 cpumask_set_cpu(rq->cpu, rq->rd->online);
5768 rq->online = 1; 5822 rq->online = 1;
5769 5823
5770 for_each_class(class) { 5824 for_each_class(class) {
5771 if (class->rq_online) 5825 if (class->rq_online)
5772 class->rq_online(rq); 5826 class->rq_online(rq);
5773 } 5827 }
5774 } 5828 }
5775 } 5829 }
5776 5830
5777 static void set_rq_offline(struct rq *rq) 5831 static void set_rq_offline(struct rq *rq)
5778 { 5832 {
5779 if (rq->online) { 5833 if (rq->online) {
5780 const struct sched_class *class; 5834 const struct sched_class *class;
5781 5835
5782 for_each_class(class) { 5836 for_each_class(class) {
5783 if (class->rq_offline) 5837 if (class->rq_offline)
5784 class->rq_offline(rq); 5838 class->rq_offline(rq);
5785 } 5839 }
5786 5840
5787 cpumask_clear_cpu(rq->cpu, rq->rd->online); 5841 cpumask_clear_cpu(rq->cpu, rq->rd->online);
5788 rq->online = 0; 5842 rq->online = 0;
5789 } 5843 }
5790 } 5844 }
5791 5845
5792 /* 5846 /*
5793 * migration_call - callback that gets triggered when a CPU is added. 5847 * migration_call - callback that gets triggered when a CPU is added.
5794 * Here we can start up the necessary migration thread for the new CPU. 5848 * Here we can start up the necessary migration thread for the new CPU.
5795 */ 5849 */
5796 static int __cpuinit 5850 static int __cpuinit
5797 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5851 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5798 { 5852 {
5799 struct task_struct *p; 5853 struct task_struct *p;
5800 int cpu = (long)hcpu; 5854 int cpu = (long)hcpu;
5801 unsigned long flags; 5855 unsigned long flags;
5802 struct rq *rq; 5856 struct rq *rq;
5803 5857
5804 switch (action) { 5858 switch (action) {
5805 5859
5806 case CPU_UP_PREPARE: 5860 case CPU_UP_PREPARE:
5807 case CPU_UP_PREPARE_FROZEN: 5861 case CPU_UP_PREPARE_FROZEN:
5808 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); 5862 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5809 if (IS_ERR(p)) 5863 if (IS_ERR(p))
5810 return NOTIFY_BAD; 5864 return NOTIFY_BAD;
5811 kthread_bind(p, cpu); 5865 kthread_bind(p, cpu);
5812 /* Must be high prio: stop_machine expects to yield to it. */ 5866 /* Must be high prio: stop_machine expects to yield to it. */
5813 rq = task_rq_lock(p, &flags); 5867 rq = task_rq_lock(p, &flags);
5814 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5868 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5815 task_rq_unlock(rq, &flags); 5869 task_rq_unlock(rq, &flags);
5816 get_task_struct(p); 5870 get_task_struct(p);
5817 cpu_rq(cpu)->migration_thread = p; 5871 cpu_rq(cpu)->migration_thread = p;
5818 rq->calc_load_update = calc_load_update; 5872 rq->calc_load_update = calc_load_update;
5819 break; 5873 break;
5820 5874
5821 case CPU_ONLINE: 5875 case CPU_ONLINE:
5822 case CPU_ONLINE_FROZEN: 5876 case CPU_ONLINE_FROZEN:
5823 /* Strictly unnecessary, as first user will wake it. */ 5877 /* Strictly unnecessary, as first user will wake it. */
5824 wake_up_process(cpu_rq(cpu)->migration_thread); 5878 wake_up_process(cpu_rq(cpu)->migration_thread);
5825 5879
5826 /* Update our root-domain */ 5880 /* Update our root-domain */
5827 rq = cpu_rq(cpu); 5881 rq = cpu_rq(cpu);
5828 raw_spin_lock_irqsave(&rq->lock, flags); 5882 raw_spin_lock_irqsave(&rq->lock, flags);
5829 if (rq->rd) { 5883 if (rq->rd) {
5830 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5884 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5831 5885
5832 set_rq_online(rq); 5886 set_rq_online(rq);
5833 } 5887 }
5834 raw_spin_unlock_irqrestore(&rq->lock, flags); 5888 raw_spin_unlock_irqrestore(&rq->lock, flags);
5835 break; 5889 break;
5836 5890
5837 #ifdef CONFIG_HOTPLUG_CPU 5891 #ifdef CONFIG_HOTPLUG_CPU
5838 case CPU_UP_CANCELED: 5892 case CPU_UP_CANCELED:
5839 case CPU_UP_CANCELED_FROZEN: 5893 case CPU_UP_CANCELED_FROZEN:
5840 if (!cpu_rq(cpu)->migration_thread) 5894 if (!cpu_rq(cpu)->migration_thread)
5841 break; 5895 break;
5842 /* Unbind it from offline cpu so it can run. Fall thru. */ 5896 /* Unbind it from offline cpu so it can run. Fall thru. */
5843 kthread_bind(cpu_rq(cpu)->migration_thread, 5897 kthread_bind(cpu_rq(cpu)->migration_thread,
5844 cpumask_any(cpu_online_mask)); 5898 cpumask_any(cpu_online_mask));
5845 kthread_stop(cpu_rq(cpu)->migration_thread); 5899 kthread_stop(cpu_rq(cpu)->migration_thread);
5846 put_task_struct(cpu_rq(cpu)->migration_thread); 5900 put_task_struct(cpu_rq(cpu)->migration_thread);
5847 cpu_rq(cpu)->migration_thread = NULL; 5901 cpu_rq(cpu)->migration_thread = NULL;
5848 break; 5902 break;
5849 5903
5850 case CPU_DEAD: 5904 case CPU_DEAD:
5851 case CPU_DEAD_FROZEN: 5905 case CPU_DEAD_FROZEN:
5852 migrate_live_tasks(cpu); 5906 migrate_live_tasks(cpu);
5853 rq = cpu_rq(cpu); 5907 rq = cpu_rq(cpu);
5854 kthread_stop(rq->migration_thread); 5908 kthread_stop(rq->migration_thread);
5855 put_task_struct(rq->migration_thread); 5909 put_task_struct(rq->migration_thread);
5856 rq->migration_thread = NULL; 5910 rq->migration_thread = NULL;
5857 /* Idle task back to normal (off runqueue, low prio) */ 5911 /* Idle task back to normal (off runqueue, low prio) */
5858 raw_spin_lock_irq(&rq->lock); 5912 raw_spin_lock_irq(&rq->lock);
5859 deactivate_task(rq, rq->idle, 0); 5913 deactivate_task(rq, rq->idle, 0);
5860 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 5914 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5861 rq->idle->sched_class = &idle_sched_class; 5915 rq->idle->sched_class = &idle_sched_class;
5862 migrate_dead_tasks(cpu); 5916 migrate_dead_tasks(cpu);
5863 raw_spin_unlock_irq(&rq->lock); 5917 raw_spin_unlock_irq(&rq->lock);
5864 migrate_nr_uninterruptible(rq); 5918 migrate_nr_uninterruptible(rq);
5865 BUG_ON(rq->nr_running != 0); 5919 BUG_ON(rq->nr_running != 0);
5866 calc_global_load_remove(rq); 5920 calc_global_load_remove(rq);
5867 /* 5921 /*
5868 * No need to migrate the tasks: it was best-effort if 5922 * No need to migrate the tasks: it was best-effort if
5869 * they didn't take sched_hotcpu_mutex. Just wake up 5923 * they didn't take sched_hotcpu_mutex. Just wake up
5870 * the requestors. 5924 * the requestors.
5871 */ 5925 */
5872 raw_spin_lock_irq(&rq->lock); 5926 raw_spin_lock_irq(&rq->lock);
5873 while (!list_empty(&rq->migration_queue)) { 5927 while (!list_empty(&rq->migration_queue)) {
5874 struct migration_req *req; 5928 struct migration_req *req;
5875 5929
5876 req = list_entry(rq->migration_queue.next, 5930 req = list_entry(rq->migration_queue.next,
5877 struct migration_req, list); 5931 struct migration_req, list);
5878 list_del_init(&req->list); 5932 list_del_init(&req->list);
5879 raw_spin_unlock_irq(&rq->lock); 5933 raw_spin_unlock_irq(&rq->lock);
5880 complete(&req->done); 5934 complete(&req->done);
5881 raw_spin_lock_irq(&rq->lock); 5935 raw_spin_lock_irq(&rq->lock);
5882 } 5936 }
5883 raw_spin_unlock_irq(&rq->lock); 5937 raw_spin_unlock_irq(&rq->lock);
5884 break; 5938 break;
5885 5939
5886 case CPU_DYING: 5940 case CPU_DYING:
5887 case CPU_DYING_FROZEN: 5941 case CPU_DYING_FROZEN:
5888 /* Update our root-domain */ 5942 /* Update our root-domain */
5889 rq = cpu_rq(cpu); 5943 rq = cpu_rq(cpu);
5890 raw_spin_lock_irqsave(&rq->lock, flags); 5944 raw_spin_lock_irqsave(&rq->lock, flags);
5891 if (rq->rd) { 5945 if (rq->rd) {
5892 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5946 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5893 set_rq_offline(rq); 5947 set_rq_offline(rq);
5894 } 5948 }
5895 raw_spin_unlock_irqrestore(&rq->lock, flags); 5949 raw_spin_unlock_irqrestore(&rq->lock, flags);
5896 break; 5950 break;
5897 #endif 5951 #endif
5898 } 5952 }
5899 return NOTIFY_OK; 5953 return NOTIFY_OK;
5900 } 5954 }
5901 5955
5902 /* 5956 /*
5903 * Register at high priority so that task migration (migrate_all_tasks) 5957 * Register at high priority so that task migration (migrate_all_tasks)
5904 * happens before everything else. This has to be lower priority than 5958 * happens before everything else. This has to be lower priority than
5905 * the notifier in the perf_event subsystem, though. 5959 * the notifier in the perf_event subsystem, though.
5906 */ 5960 */
5907 static struct notifier_block __cpuinitdata migration_notifier = { 5961 static struct notifier_block __cpuinitdata migration_notifier = {
5908 .notifier_call = migration_call, 5962 .notifier_call = migration_call,
5909 .priority = 10 5963 .priority = 10
5910 }; 5964 };
5911 5965
5912 static int __init migration_init(void) 5966 static int __init migration_init(void)
5913 { 5967 {
5914 void *cpu = (void *)(long)smp_processor_id(); 5968 void *cpu = (void *)(long)smp_processor_id();
5915 int err; 5969 int err;
5916 5970
5917 /* Start one for the boot CPU: */ 5971 /* Start one for the boot CPU: */
5918 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 5972 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5919 BUG_ON(err == NOTIFY_BAD); 5973 BUG_ON(err == NOTIFY_BAD);
5920 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5974 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5921 register_cpu_notifier(&migration_notifier); 5975 register_cpu_notifier(&migration_notifier);
5922 5976
5923 return 0; 5977 return 0;
5924 } 5978 }
5925 early_initcall(migration_init); 5979 early_initcall(migration_init);
5926 #endif 5980 #endif
5927 5981
5928 #ifdef CONFIG_SMP 5982 #ifdef CONFIG_SMP
5929 5983
5930 #ifdef CONFIG_SCHED_DEBUG 5984 #ifdef CONFIG_SCHED_DEBUG
5931 5985
5932 static __read_mostly int sched_domain_debug_enabled; 5986 static __read_mostly int sched_domain_debug_enabled;
5933 5987
5934 static int __init sched_domain_debug_setup(char *str) 5988 static int __init sched_domain_debug_setup(char *str)
5935 { 5989 {
5936 sched_domain_debug_enabled = 1; 5990 sched_domain_debug_enabled = 1;
5937 5991
5938 return 0; 5992 return 0;
5939 } 5993 }
5940 early_param("sched_debug", sched_domain_debug_setup); 5994 early_param("sched_debug", sched_domain_debug_setup);
5941 5995
5942 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 5996 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5943 struct cpumask *groupmask) 5997 struct cpumask *groupmask)
5944 { 5998 {
5945 struct sched_group *group = sd->groups; 5999 struct sched_group *group = sd->groups;
5946 char str[256]; 6000 char str[256];
5947 6001
5948 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); 6002 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
5949 cpumask_clear(groupmask); 6003 cpumask_clear(groupmask);
5950 6004
5951 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 6005 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
5952 6006
5953 if (!(sd->flags & SD_LOAD_BALANCE)) { 6007 if (!(sd->flags & SD_LOAD_BALANCE)) {
5954 printk("does not load-balance\n"); 6008 printk("does not load-balance\n");
5955 if (sd->parent) 6009 if (sd->parent)
5956 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 6010 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5957 " has parent"); 6011 " has parent");
5958 return -1; 6012 return -1;
5959 } 6013 }
5960 6014
5961 printk(KERN_CONT "span %s level %s\n", str, sd->name); 6015 printk(KERN_CONT "span %s level %s\n", str, sd->name);
5962 6016
5963 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 6017 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
5964 printk(KERN_ERR "ERROR: domain->span does not contain " 6018 printk(KERN_ERR "ERROR: domain->span does not contain "
5965 "CPU%d\n", cpu); 6019 "CPU%d\n", cpu);
5966 } 6020 }
5967 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 6021 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
5968 printk(KERN_ERR "ERROR: domain->groups does not contain" 6022 printk(KERN_ERR "ERROR: domain->groups does not contain"
5969 " CPU%d\n", cpu); 6023 " CPU%d\n", cpu);
5970 } 6024 }
5971 6025
5972 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 6026 printk(KERN_DEBUG "%*s groups:", level + 1, "");
5973 do { 6027 do {
5974 if (!group) { 6028 if (!group) {
5975 printk("\n"); 6029 printk("\n");
5976 printk(KERN_ERR "ERROR: group is NULL\n"); 6030 printk(KERN_ERR "ERROR: group is NULL\n");
5977 break; 6031 break;
5978 } 6032 }
5979 6033
5980 if (!group->cpu_power) { 6034 if (!group->cpu_power) {
5981 printk(KERN_CONT "\n"); 6035 printk(KERN_CONT "\n");
5982 printk(KERN_ERR "ERROR: domain->cpu_power not " 6036 printk(KERN_ERR "ERROR: domain->cpu_power not "
5983 "set\n"); 6037 "set\n");
5984 break; 6038 break;
5985 } 6039 }
5986 6040
5987 if (!cpumask_weight(sched_group_cpus(group))) { 6041 if (!cpumask_weight(sched_group_cpus(group))) {
5988 printk(KERN_CONT "\n"); 6042 printk(KERN_CONT "\n");
5989 printk(KERN_ERR "ERROR: empty group\n"); 6043 printk(KERN_ERR "ERROR: empty group\n");
5990 break; 6044 break;
5991 } 6045 }
5992 6046
5993 if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 6047 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
5994 printk(KERN_CONT "\n"); 6048 printk(KERN_CONT "\n");
5995 printk(KERN_ERR "ERROR: repeated CPUs\n"); 6049 printk(KERN_ERR "ERROR: repeated CPUs\n");
5996 break; 6050 break;
5997 } 6051 }
5998 6052
5999 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 6053 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
6000 6054
6001 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 6055 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6002 6056
6003 printk(KERN_CONT " %s", str); 6057 printk(KERN_CONT " %s", str);
6004 if (group->cpu_power != SCHED_LOAD_SCALE) { 6058 if (group->cpu_power != SCHED_LOAD_SCALE) {
6005 printk(KERN_CONT " (cpu_power = %d)", 6059 printk(KERN_CONT " (cpu_power = %d)",
6006 group->cpu_power); 6060 group->cpu_power);
6007 } 6061 }
6008 6062
6009 group = group->next; 6063 group = group->next;
6010 } while (group != sd->groups); 6064 } while (group != sd->groups);
6011 printk(KERN_CONT "\n"); 6065 printk(KERN_CONT "\n");
6012 6066
6013 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 6067 if (!cpumask_equal(sched_domain_span(sd), groupmask))
6014 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 6068 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
6015 6069
6016 if (sd->parent && 6070 if (sd->parent &&
6017 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 6071 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
6018 printk(KERN_ERR "ERROR: parent span is not a superset " 6072 printk(KERN_ERR "ERROR: parent span is not a superset "
6019 "of domain->span\n"); 6073 "of domain->span\n");
6020 return 0; 6074 return 0;
6021 } 6075 }
6022 6076
6023 static void sched_domain_debug(struct sched_domain *sd, int cpu) 6077 static void sched_domain_debug(struct sched_domain *sd, int cpu)
6024 { 6078 {
6025 cpumask_var_t groupmask; 6079 cpumask_var_t groupmask;
6026 int level = 0; 6080 int level = 0;
6027 6081
6028 if (!sched_domain_debug_enabled) 6082 if (!sched_domain_debug_enabled)
6029 return; 6083 return;
6030 6084
6031 if (!sd) { 6085 if (!sd) {
6032 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 6086 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
6033 return; 6087 return;
6034 } 6088 }
6035 6089
6036 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6090 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6037 6091
6038 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { 6092 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6039 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); 6093 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6040 return; 6094 return;
6041 } 6095 }
6042 6096
6043 for (;;) { 6097 for (;;) {
6044 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 6098 if (sched_domain_debug_one(sd, cpu, level, groupmask))
6045 break; 6099 break;
6046 level++; 6100 level++;
6047 sd = sd->parent; 6101 sd = sd->parent;
6048 if (!sd) 6102 if (!sd)
6049 break; 6103 break;
6050 } 6104 }
6051 free_cpumask_var(groupmask); 6105 free_cpumask_var(groupmask);
6052 } 6106 }
6053 #else /* !CONFIG_SCHED_DEBUG */ 6107 #else /* !CONFIG_SCHED_DEBUG */
6054 # define sched_domain_debug(sd, cpu) do { } while (0) 6108 # define sched_domain_debug(sd, cpu) do { } while (0)
6055 #endif /* CONFIG_SCHED_DEBUG */ 6109 #endif /* CONFIG_SCHED_DEBUG */
6056 6110
6057 static int sd_degenerate(struct sched_domain *sd) 6111 static int sd_degenerate(struct sched_domain *sd)
6058 { 6112 {
6059 if (cpumask_weight(sched_domain_span(sd)) == 1) 6113 if (cpumask_weight(sched_domain_span(sd)) == 1)
6060 return 1; 6114 return 1;
6061 6115
6062 /* Following flags need at least 2 groups */ 6116 /* Following flags need at least 2 groups */
6063 if (sd->flags & (SD_LOAD_BALANCE | 6117 if (sd->flags & (SD_LOAD_BALANCE |
6064 SD_BALANCE_NEWIDLE | 6118 SD_BALANCE_NEWIDLE |
6065 SD_BALANCE_FORK | 6119 SD_BALANCE_FORK |
6066 SD_BALANCE_EXEC | 6120 SD_BALANCE_EXEC |
6067 SD_SHARE_CPUPOWER | 6121 SD_SHARE_CPUPOWER |
6068 SD_SHARE_PKG_RESOURCES)) { 6122 SD_SHARE_PKG_RESOURCES)) {
6069 if (sd->groups != sd->groups->next) 6123 if (sd->groups != sd->groups->next)
6070 return 0; 6124 return 0;
6071 } 6125 }
6072 6126
6073 /* Following flags don't use groups */ 6127 /* Following flags don't use groups */
6074 if (sd->flags & (SD_WAKE_AFFINE)) 6128 if (sd->flags & (SD_WAKE_AFFINE))
6075 return 0; 6129 return 0;
6076 6130
6077 return 1; 6131 return 1;
6078 } 6132 }
6079 6133
6080 static int 6134 static int
6081 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 6135 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6082 { 6136 {
6083 unsigned long cflags = sd->flags, pflags = parent->flags; 6137 unsigned long cflags = sd->flags, pflags = parent->flags;
6084 6138
6085 if (sd_degenerate(parent)) 6139 if (sd_degenerate(parent))
6086 return 1; 6140 return 1;
6087 6141
6088 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 6142 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
6089 return 0; 6143 return 0;
6090 6144
6091 /* Flags needing groups don't count if only 1 group in parent */ 6145 /* Flags needing groups don't count if only 1 group in parent */
6092 if (parent->groups == parent->groups->next) { 6146 if (parent->groups == parent->groups->next) {
6093 pflags &= ~(SD_LOAD_BALANCE | 6147 pflags &= ~(SD_LOAD_BALANCE |
6094 SD_BALANCE_NEWIDLE | 6148 SD_BALANCE_NEWIDLE |
6095 SD_BALANCE_FORK | 6149 SD_BALANCE_FORK |
6096 SD_BALANCE_EXEC | 6150 SD_BALANCE_EXEC |
6097 SD_SHARE_CPUPOWER | 6151 SD_SHARE_CPUPOWER |
6098 SD_SHARE_PKG_RESOURCES); 6152 SD_SHARE_PKG_RESOURCES);
6099 if (nr_node_ids == 1) 6153 if (nr_node_ids == 1)
6100 pflags &= ~SD_SERIALIZE; 6154 pflags &= ~SD_SERIALIZE;
6101 } 6155 }
6102 if (~cflags & pflags) 6156 if (~cflags & pflags)
6103 return 0; 6157 return 0;
6104 6158
6105 return 1; 6159 return 1;
6106 } 6160 }
6107 6161
6108 static void free_rootdomain(struct root_domain *rd) 6162 static void free_rootdomain(struct root_domain *rd)
6109 { 6163 {
6110 synchronize_sched(); 6164 synchronize_sched();
6111 6165
6112 cpupri_cleanup(&rd->cpupri); 6166 cpupri_cleanup(&rd->cpupri);
6113 6167
6114 free_cpumask_var(rd->rto_mask); 6168 free_cpumask_var(rd->rto_mask);
6115 free_cpumask_var(rd->online); 6169 free_cpumask_var(rd->online);
6116 free_cpumask_var(rd->span); 6170 free_cpumask_var(rd->span);
6117 kfree(rd); 6171 kfree(rd);
6118 } 6172 }
6119 6173
6120 static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6174 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6121 { 6175 {
6122 struct root_domain *old_rd = NULL; 6176 struct root_domain *old_rd = NULL;
6123 unsigned long flags; 6177 unsigned long flags;
6124 6178
6125 raw_spin_lock_irqsave(&rq->lock, flags); 6179 raw_spin_lock_irqsave(&rq->lock, flags);
6126 6180
6127 if (rq->rd) { 6181 if (rq->rd) {
6128 old_rd = rq->rd; 6182 old_rd = rq->rd;
6129 6183
6130 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 6184 if (cpumask_test_cpu(rq->cpu, old_rd->online))
6131 set_rq_offline(rq); 6185 set_rq_offline(rq);
6132 6186
6133 cpumask_clear_cpu(rq->cpu, old_rd->span); 6187 cpumask_clear_cpu(rq->cpu, old_rd->span);
6134 6188
6135 /* 6189 /*
6136 * If we don't want to free the old_rd yet then 6190 * If we don't want to free the old_rd yet then
6137 * set old_rd to NULL to skip the freeing later 6191 * set old_rd to NULL to skip the freeing later
6138 * in this function: 6192 * in this function:
6139 */ 6193 */
6140 if (!atomic_dec_and_test(&old_rd->refcount)) 6194 if (!atomic_dec_and_test(&old_rd->refcount))
6141 old_rd = NULL; 6195 old_rd = NULL;
6142 } 6196 }
6143 6197
6144 atomic_inc(&rd->refcount); 6198 atomic_inc(&rd->refcount);
6145 rq->rd = rd; 6199 rq->rd = rd;
6146 6200
6147 cpumask_set_cpu(rq->cpu, rd->span); 6201 cpumask_set_cpu(rq->cpu, rd->span);
6148 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 6202 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
6149 set_rq_online(rq); 6203 set_rq_online(rq);
6150 6204
6151 raw_spin_unlock_irqrestore(&rq->lock, flags); 6205 raw_spin_unlock_irqrestore(&rq->lock, flags);
6152 6206
6153 if (old_rd) 6207 if (old_rd)
6154 free_rootdomain(old_rd); 6208 free_rootdomain(old_rd);
6155 } 6209 }
6156 6210
6157 static int init_rootdomain(struct root_domain *rd, bool bootmem) 6211 static int init_rootdomain(struct root_domain *rd, bool bootmem)
6158 { 6212 {
6159 gfp_t gfp = GFP_KERNEL; 6213 gfp_t gfp = GFP_KERNEL;
6160 6214
6161 memset(rd, 0, sizeof(*rd)); 6215 memset(rd, 0, sizeof(*rd));
6162 6216
6163 if (bootmem) 6217 if (bootmem)
6164 gfp = GFP_NOWAIT; 6218 gfp = GFP_NOWAIT;
6165 6219
6166 if (!alloc_cpumask_var(&rd->span, gfp)) 6220 if (!alloc_cpumask_var(&rd->span, gfp))
6167 goto out; 6221 goto out;
6168 if (!alloc_cpumask_var(&rd->online, gfp)) 6222 if (!alloc_cpumask_var(&rd->online, gfp))
6169 goto free_span; 6223 goto free_span;
6170 if (!alloc_cpumask_var(&rd->rto_mask, gfp)) 6224 if (!alloc_cpumask_var(&rd->rto_mask, gfp))
6171 goto free_online; 6225 goto free_online;
6172 6226
6173 if (cpupri_init(&rd->cpupri, bootmem) != 0) 6227 if (cpupri_init(&rd->cpupri, bootmem) != 0)
6174 goto free_rto_mask; 6228 goto free_rto_mask;
6175 return 0; 6229 return 0;
6176 6230
6177 free_rto_mask: 6231 free_rto_mask:
6178 free_cpumask_var(rd->rto_mask); 6232 free_cpumask_var(rd->rto_mask);
6179 free_online: 6233 free_online:
6180 free_cpumask_var(rd->online); 6234 free_cpumask_var(rd->online);
6181 free_span: 6235 free_span:
6182 free_cpumask_var(rd->span); 6236 free_cpumask_var(rd->span);
6183 out: 6237 out:
6184 return -ENOMEM; 6238 return -ENOMEM;
6185 } 6239 }
6186 6240
6187 static void init_defrootdomain(void) 6241 static void init_defrootdomain(void)
6188 { 6242 {
6189 init_rootdomain(&def_root_domain, true); 6243 init_rootdomain(&def_root_domain, true);
6190 6244
6191 atomic_set(&def_root_domain.refcount, 1); 6245 atomic_set(&def_root_domain.refcount, 1);
6192 } 6246 }
6193 6247
6194 static struct root_domain *alloc_rootdomain(void) 6248 static struct root_domain *alloc_rootdomain(void)
6195 { 6249 {
6196 struct root_domain *rd; 6250 struct root_domain *rd;
6197 6251
6198 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 6252 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6199 if (!rd) 6253 if (!rd)
6200 return NULL; 6254 return NULL;
6201 6255
6202 if (init_rootdomain(rd, false) != 0) { 6256 if (init_rootdomain(rd, false) != 0) {
6203 kfree(rd); 6257 kfree(rd);
6204 return NULL; 6258 return NULL;
6205 } 6259 }
6206 6260
6207 return rd; 6261 return rd;
6208 } 6262 }
6209 6263
6210 /* 6264 /*
6211 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6265 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6212 * hold the hotplug lock. 6266 * hold the hotplug lock.
6213 */ 6267 */
6214 static void 6268 static void
6215 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 6269 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6216 { 6270 {
6217 struct rq *rq = cpu_rq(cpu); 6271 struct rq *rq = cpu_rq(cpu);
6218 struct sched_domain *tmp; 6272 struct sched_domain *tmp;
6219 6273
6220 /* Remove the sched domains which do not contribute to scheduling. */ 6274 /* Remove the sched domains which do not contribute to scheduling. */
6221 for (tmp = sd; tmp; ) { 6275 for (tmp = sd; tmp; ) {
6222 struct sched_domain *parent = tmp->parent; 6276 struct sched_domain *parent = tmp->parent;
6223 if (!parent) 6277 if (!parent)
6224 break; 6278 break;
6225 6279
6226 if (sd_parent_degenerate(tmp, parent)) { 6280 if (sd_parent_degenerate(tmp, parent)) {
6227 tmp->parent = parent->parent; 6281 tmp->parent = parent->parent;
6228 if (parent->parent) 6282 if (parent->parent)
6229 parent->parent->child = tmp; 6283 parent->parent->child = tmp;
6230 } else 6284 } else
6231 tmp = tmp->parent; 6285 tmp = tmp->parent;
6232 } 6286 }
6233 6287
6234 if (sd && sd_degenerate(sd)) { 6288 if (sd && sd_degenerate(sd)) {
6235 sd = sd->parent; 6289 sd = sd->parent;
6236 if (sd) 6290 if (sd)
6237 sd->child = NULL; 6291 sd->child = NULL;
6238 } 6292 }
6239 6293
6240 sched_domain_debug(sd, cpu); 6294 sched_domain_debug(sd, cpu);
6241 6295
6242 rq_attach_root(rq, rd); 6296 rq_attach_root(rq, rd);
6243 rcu_assign_pointer(rq->sd, sd); 6297 rcu_assign_pointer(rq->sd, sd);
6244 } 6298 }
6245 6299
6246 /* cpus with isolated domains */ 6300 /* cpus with isolated domains */
6247 static cpumask_var_t cpu_isolated_map; 6301 static cpumask_var_t cpu_isolated_map;
6248 6302
6249 /* Set up the mask of cpus configured for isolated domains */ 6303 /* Set up the mask of cpus configured for isolated domains */
6250 static int __init isolated_cpu_setup(char *str) 6304 static int __init isolated_cpu_setup(char *str)
6251 { 6305 {
6252 alloc_bootmem_cpumask_var(&cpu_isolated_map); 6306 alloc_bootmem_cpumask_var(&cpu_isolated_map);
6253 cpulist_parse(str, cpu_isolated_map); 6307 cpulist_parse(str, cpu_isolated_map);
6254 return 1; 6308 return 1;
6255 } 6309 }
6256 6310
6257 __setup("isolcpus=", isolated_cpu_setup); 6311 __setup("isolcpus=", isolated_cpu_setup);
6258 6312
6259 /* 6313 /*
6260 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 6314 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6261 * to a function which identifies what group (along with sched group) a CPU 6315 * to a function which identifies what group (along with sched group) a CPU
6262 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids 6316 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
6263 * (due to the fact that we keep track of groups covered with a struct cpumask). 6317 * (due to the fact that we keep track of groups covered with a struct cpumask).
6264 * 6318 *
6265 * init_sched_build_groups will build a circular linked list of the groups 6319 * init_sched_build_groups will build a circular linked list of the groups
6266 * covered by the given span, and will set each group's ->cpumask correctly, 6320 * covered by the given span, and will set each group's ->cpumask correctly,
6267 * and ->cpu_power to 0. 6321 * and ->cpu_power to 0.
6268 */ 6322 */
6269 static void 6323 static void
6270 init_sched_build_groups(const struct cpumask *span, 6324 init_sched_build_groups(const struct cpumask *span,
6271 const struct cpumask *cpu_map, 6325 const struct cpumask *cpu_map,
6272 int (*group_fn)(int cpu, const struct cpumask *cpu_map, 6326 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6273 struct sched_group **sg, 6327 struct sched_group **sg,
6274 struct cpumask *tmpmask), 6328 struct cpumask *tmpmask),
6275 struct cpumask *covered, struct cpumask *tmpmask) 6329 struct cpumask *covered, struct cpumask *tmpmask)
6276 { 6330 {
6277 struct sched_group *first = NULL, *last = NULL; 6331 struct sched_group *first = NULL, *last = NULL;
6278 int i; 6332 int i;
6279 6333
6280 cpumask_clear(covered); 6334 cpumask_clear(covered);
6281 6335
6282 for_each_cpu(i, span) { 6336 for_each_cpu(i, span) {
6283 struct sched_group *sg; 6337 struct sched_group *sg;
6284 int group = group_fn(i, cpu_map, &sg, tmpmask); 6338 int group = group_fn(i, cpu_map, &sg, tmpmask);
6285 int j; 6339 int j;
6286 6340
6287 if (cpumask_test_cpu(i, covered)) 6341 if (cpumask_test_cpu(i, covered))
6288 continue; 6342 continue;
6289 6343
6290 cpumask_clear(sched_group_cpus(sg)); 6344 cpumask_clear(sched_group_cpus(sg));
6291 sg->cpu_power = 0; 6345 sg->cpu_power = 0;
6292 6346
6293 for_each_cpu(j, span) { 6347 for_each_cpu(j, span) {
6294 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 6348 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6295 continue; 6349 continue;
6296 6350
6297 cpumask_set_cpu(j, covered); 6351 cpumask_set_cpu(j, covered);
6298 cpumask_set_cpu(j, sched_group_cpus(sg)); 6352 cpumask_set_cpu(j, sched_group_cpus(sg));
6299 } 6353 }
6300 if (!first) 6354 if (!first)
6301 first = sg; 6355 first = sg;
6302 if (last) 6356 if (last)
6303 last->next = sg; 6357 last->next = sg;
6304 last = sg; 6358 last = sg;
6305 } 6359 }
6306 last->next = first; 6360 last->next = first;
6307 } 6361 }
6308 6362
6309 #define SD_NODES_PER_DOMAIN 16 6363 #define SD_NODES_PER_DOMAIN 16
6310 6364
6311 #ifdef CONFIG_NUMA 6365 #ifdef CONFIG_NUMA
6312 6366
6313 /** 6367 /**
6314 * find_next_best_node - find the next node to include in a sched_domain 6368 * find_next_best_node - find the next node to include in a sched_domain
6315 * @node: node whose sched_domain we're building 6369 * @node: node whose sched_domain we're building
6316 * @used_nodes: nodes already in the sched_domain 6370 * @used_nodes: nodes already in the sched_domain
6317 * 6371 *
6318 * Find the next node to include in a given scheduling domain. Simply 6372 * Find the next node to include in a given scheduling domain. Simply
6319 * finds the closest node not already in the @used_nodes map. 6373 * finds the closest node not already in the @used_nodes map.
6320 * 6374 *
6321 * Should use nodemask_t. 6375 * Should use nodemask_t.
6322 */ 6376 */
6323 static int find_next_best_node(int node, nodemask_t *used_nodes) 6377 static int find_next_best_node(int node, nodemask_t *used_nodes)
6324 { 6378 {
6325 int i, n, val, min_val, best_node = 0; 6379 int i, n, val, min_val, best_node = 0;
6326 6380
6327 min_val = INT_MAX; 6381 min_val = INT_MAX;
6328 6382
6329 for (i = 0; i < nr_node_ids; i++) { 6383 for (i = 0; i < nr_node_ids; i++) {
6330 /* Start at @node */ 6384 /* Start at @node */
6331 n = (node + i) % nr_node_ids; 6385 n = (node + i) % nr_node_ids;
6332 6386
6333 if (!nr_cpus_node(n)) 6387 if (!nr_cpus_node(n))
6334 continue; 6388 continue;
6335 6389
6336 /* Skip already used nodes */ 6390 /* Skip already used nodes */
6337 if (node_isset(n, *used_nodes)) 6391 if (node_isset(n, *used_nodes))
6338 continue; 6392 continue;
6339 6393
6340 /* Simple min distance search */ 6394 /* Simple min distance search */
6341 val = node_distance(node, n); 6395 val = node_distance(node, n);
6342 6396
6343 if (val < min_val) { 6397 if (val < min_val) {
6344 min_val = val; 6398 min_val = val;
6345 best_node = n; 6399 best_node = n;
6346 } 6400 }
6347 } 6401 }
6348 6402
6349 node_set(best_node, *used_nodes); 6403 node_set(best_node, *used_nodes);
6350 return best_node; 6404 return best_node;
6351 } 6405 }
6352 6406
6353 /** 6407 /**
6354 * sched_domain_node_span - get a cpumask for a node's sched_domain 6408 * sched_domain_node_span - get a cpumask for a node's sched_domain
6355 * @node: node whose cpumask we're constructing 6409 * @node: node whose cpumask we're constructing
6356 * @span: resulting cpumask 6410 * @span: resulting cpumask
6357 * 6411 *
6358 * Given a node, construct a good cpumask for its sched_domain to span. It 6412 * Given a node, construct a good cpumask for its sched_domain to span. It
6359 * should be one that prevents unnecessary balancing, but also spreads tasks 6413 * should be one that prevents unnecessary balancing, but also spreads tasks
6360 * out optimally. 6414 * out optimally.
6361 */ 6415 */
6362 static void sched_domain_node_span(int node, struct cpumask *span) 6416 static void sched_domain_node_span(int node, struct cpumask *span)
6363 { 6417 {
6364 nodemask_t used_nodes; 6418 nodemask_t used_nodes;
6365 int i; 6419 int i;
6366 6420
6367 cpumask_clear(span); 6421 cpumask_clear(span);
6368 nodes_clear(used_nodes); 6422 nodes_clear(used_nodes);
6369 6423
6370 cpumask_or(span, span, cpumask_of_node(node)); 6424 cpumask_or(span, span, cpumask_of_node(node));
6371 node_set(node, used_nodes); 6425 node_set(node, used_nodes);
6372 6426
6373 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6427 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6374 int next_node = find_next_best_node(node, &used_nodes); 6428 int next_node = find_next_best_node(node, &used_nodes);
6375 6429
6376 cpumask_or(span, span, cpumask_of_node(next_node)); 6430 cpumask_or(span, span, cpumask_of_node(next_node));
6377 } 6431 }
6378 } 6432 }
6379 #endif /* CONFIG_NUMA */ 6433 #endif /* CONFIG_NUMA */
6380 6434
6381 int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6435 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6382 6436
6383 /* 6437 /*
6384 * The cpus mask in sched_group and sched_domain hangs off the end. 6438 * The cpus mask in sched_group and sched_domain hangs off the end.
6385 * 6439 *
6386 * ( See the comments in include/linux/sched.h:struct sched_group 6440 * ( See the comments in include/linux/sched.h:struct sched_group
6387 * and struct sched_domain. ) 6441 * and struct sched_domain. )
6388 */ 6442 */
6389 struct static_sched_group { 6443 struct static_sched_group {
6390 struct sched_group sg; 6444 struct sched_group sg;
6391 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); 6445 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
6392 }; 6446 };
6393 6447
6394 struct static_sched_domain { 6448 struct static_sched_domain {
6395 struct sched_domain sd; 6449 struct sched_domain sd;
6396 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 6450 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
6397 }; 6451 };
6398 6452
6399 struct s_data { 6453 struct s_data {
6400 #ifdef CONFIG_NUMA 6454 #ifdef CONFIG_NUMA
6401 int sd_allnodes; 6455 int sd_allnodes;
6402 cpumask_var_t domainspan; 6456 cpumask_var_t domainspan;
6403 cpumask_var_t covered; 6457 cpumask_var_t covered;
6404 cpumask_var_t notcovered; 6458 cpumask_var_t notcovered;
6405 #endif 6459 #endif
6406 cpumask_var_t nodemask; 6460 cpumask_var_t nodemask;
6407 cpumask_var_t this_sibling_map; 6461 cpumask_var_t this_sibling_map;
6408 cpumask_var_t this_core_map; 6462 cpumask_var_t this_core_map;
6409 cpumask_var_t send_covered; 6463 cpumask_var_t send_covered;
6410 cpumask_var_t tmpmask; 6464 cpumask_var_t tmpmask;
6411 struct sched_group **sched_group_nodes; 6465 struct sched_group **sched_group_nodes;
6412 struct root_domain *rd; 6466 struct root_domain *rd;
6413 }; 6467 };
6414 6468
6415 enum s_alloc { 6469 enum s_alloc {
6416 sa_sched_groups = 0, 6470 sa_sched_groups = 0,
6417 sa_rootdomain, 6471 sa_rootdomain,
6418 sa_tmpmask, 6472 sa_tmpmask,
6419 sa_send_covered, 6473 sa_send_covered,
6420 sa_this_core_map, 6474 sa_this_core_map,
6421 sa_this_sibling_map, 6475 sa_this_sibling_map,
6422 sa_nodemask, 6476 sa_nodemask,
6423 sa_sched_group_nodes, 6477 sa_sched_group_nodes,
6424 #ifdef CONFIG_NUMA 6478 #ifdef CONFIG_NUMA
6425 sa_notcovered, 6479 sa_notcovered,
6426 sa_covered, 6480 sa_covered,
6427 sa_domainspan, 6481 sa_domainspan,
6428 #endif 6482 #endif
6429 sa_none, 6483 sa_none,
6430 }; 6484 };
6431 6485
6432 /* 6486 /*
6433 * SMT sched-domains: 6487 * SMT sched-domains:
6434 */ 6488 */
6435 #ifdef CONFIG_SCHED_SMT 6489 #ifdef CONFIG_SCHED_SMT
6436 static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); 6490 static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
6437 static DEFINE_PER_CPU(struct static_sched_group, sched_groups); 6491 static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
6438 6492
6439 static int 6493 static int
6440 cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 6494 cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
6441 struct sched_group **sg, struct cpumask *unused) 6495 struct sched_group **sg, struct cpumask *unused)
6442 { 6496 {
6443 if (sg) 6497 if (sg)
6444 *sg = &per_cpu(sched_groups, cpu).sg; 6498 *sg = &per_cpu(sched_groups, cpu).sg;
6445 return cpu; 6499 return cpu;
6446 } 6500 }
6447 #endif /* CONFIG_SCHED_SMT */ 6501 #endif /* CONFIG_SCHED_SMT */
6448 6502
6449 /* 6503 /*
6450 * multi-core sched-domains: 6504 * multi-core sched-domains:
6451 */ 6505 */
6452 #ifdef CONFIG_SCHED_MC 6506 #ifdef CONFIG_SCHED_MC
6453 static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6507 static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
6454 static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); 6508 static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6455 #endif /* CONFIG_SCHED_MC */ 6509 #endif /* CONFIG_SCHED_MC */
6456 6510
6457 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6511 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6458 static int 6512 static int
6459 cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6513 cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6460 struct sched_group **sg, struct cpumask *mask) 6514 struct sched_group **sg, struct cpumask *mask)
6461 { 6515 {
6462 int group; 6516 int group;
6463 6517
6464 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 6518 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6465 group = cpumask_first(mask); 6519 group = cpumask_first(mask);
6466 if (sg) 6520 if (sg)
6467 *sg = &per_cpu(sched_group_core, group).sg; 6521 *sg = &per_cpu(sched_group_core, group).sg;
6468 return group; 6522 return group;
6469 } 6523 }
6470 #elif defined(CONFIG_SCHED_MC) 6524 #elif defined(CONFIG_SCHED_MC)
6471 static int 6525 static int
6472 cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6526 cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6473 struct sched_group **sg, struct cpumask *unused) 6527 struct sched_group **sg, struct cpumask *unused)
6474 { 6528 {
6475 if (sg) 6529 if (sg)
6476 *sg = &per_cpu(sched_group_core, cpu).sg; 6530 *sg = &per_cpu(sched_group_core, cpu).sg;
6477 return cpu; 6531 return cpu;
6478 } 6532 }
6479 #endif 6533 #endif
6480 6534
6481 static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6535 static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
6482 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6536 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
6483 6537
6484 static int 6538 static int
6485 cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, 6539 cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
6486 struct sched_group **sg, struct cpumask *mask) 6540 struct sched_group **sg, struct cpumask *mask)
6487 { 6541 {
6488 int group; 6542 int group;
6489 #ifdef CONFIG_SCHED_MC 6543 #ifdef CONFIG_SCHED_MC
6490 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 6544 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6491 group = cpumask_first(mask); 6545 group = cpumask_first(mask);
6492 #elif defined(CONFIG_SCHED_SMT) 6546 #elif defined(CONFIG_SCHED_SMT)
6493 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 6547 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6494 group = cpumask_first(mask); 6548 group = cpumask_first(mask);
6495 #else 6549 #else
6496 group = cpu; 6550 group = cpu;
6497 #endif 6551 #endif
6498 if (sg) 6552 if (sg)
6499 *sg = &per_cpu(sched_group_phys, group).sg; 6553 *sg = &per_cpu(sched_group_phys, group).sg;
6500 return group; 6554 return group;
6501 } 6555 }
6502 6556
6503 #ifdef CONFIG_NUMA 6557 #ifdef CONFIG_NUMA
6504 /* 6558 /*
6505 * The init_sched_build_groups can't handle what we want to do with node 6559 * The init_sched_build_groups can't handle what we want to do with node
6506 * groups, so roll our own. Now each node has its own list of groups which 6560 * groups, so roll our own. Now each node has its own list of groups which
6507 * gets dynamically allocated. 6561 * gets dynamically allocated.
6508 */ 6562 */
6509 static DEFINE_PER_CPU(struct static_sched_domain, node_domains); 6563 static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
6510 static struct sched_group ***sched_group_nodes_bycpu; 6564 static struct sched_group ***sched_group_nodes_bycpu;
6511 6565
6512 static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); 6566 static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
6513 static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); 6567 static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
6514 6568
6515 static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, 6569 static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
6516 struct sched_group **sg, 6570 struct sched_group **sg,
6517 struct cpumask *nodemask) 6571 struct cpumask *nodemask)
6518 { 6572 {
6519 int group; 6573 int group;
6520 6574
6521 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); 6575 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
6522 group = cpumask_first(nodemask); 6576 group = cpumask_first(nodemask);
6523 6577
6524 if (sg) 6578 if (sg)
6525 *sg = &per_cpu(sched_group_allnodes, group).sg; 6579 *sg = &per_cpu(sched_group_allnodes, group).sg;
6526 return group; 6580 return group;
6527 } 6581 }
6528 6582
6529 static void init_numa_sched_groups_power(struct sched_group *group_head) 6583 static void init_numa_sched_groups_power(struct sched_group *group_head)
6530 { 6584 {
6531 struct sched_group *sg = group_head; 6585 struct sched_group *sg = group_head;
6532 int j; 6586 int j;
6533 6587
6534 if (!sg) 6588 if (!sg)
6535 return; 6589 return;
6536 do { 6590 do {
6537 for_each_cpu(j, sched_group_cpus(sg)) { 6591 for_each_cpu(j, sched_group_cpus(sg)) {
6538 struct sched_domain *sd; 6592 struct sched_domain *sd;
6539 6593
6540 sd = &per_cpu(phys_domains, j).sd; 6594 sd = &per_cpu(phys_domains, j).sd;
6541 if (j != group_first_cpu(sd->groups)) { 6595 if (j != group_first_cpu(sd->groups)) {
6542 /* 6596 /*
6543 * Only add "power" once for each 6597 * Only add "power" once for each
6544 * physical package. 6598 * physical package.
6545 */ 6599 */
6546 continue; 6600 continue;
6547 } 6601 }
6548 6602
6549 sg->cpu_power += sd->groups->cpu_power; 6603 sg->cpu_power += sd->groups->cpu_power;
6550 } 6604 }
6551 sg = sg->next; 6605 sg = sg->next;
6552 } while (sg != group_head); 6606 } while (sg != group_head);
6553 } 6607 }
6554 6608
6555 static int build_numa_sched_groups(struct s_data *d, 6609 static int build_numa_sched_groups(struct s_data *d,
6556 const struct cpumask *cpu_map, int num) 6610 const struct cpumask *cpu_map, int num)
6557 { 6611 {
6558 struct sched_domain *sd; 6612 struct sched_domain *sd;
6559 struct sched_group *sg, *prev; 6613 struct sched_group *sg, *prev;
6560 int n, j; 6614 int n, j;
6561 6615
6562 cpumask_clear(d->covered); 6616 cpumask_clear(d->covered);
6563 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); 6617 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
6564 if (cpumask_empty(d->nodemask)) { 6618 if (cpumask_empty(d->nodemask)) {
6565 d->sched_group_nodes[num] = NULL; 6619 d->sched_group_nodes[num] = NULL;
6566 goto out; 6620 goto out;
6567 } 6621 }
6568 6622
6569 sched_domain_node_span(num, d->domainspan); 6623 sched_domain_node_span(num, d->domainspan);
6570 cpumask_and(d->domainspan, d->domainspan, cpu_map); 6624 cpumask_and(d->domainspan, d->domainspan, cpu_map);
6571 6625
6572 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 6626 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
6573 GFP_KERNEL, num); 6627 GFP_KERNEL, num);
6574 if (!sg) { 6628 if (!sg) {
6575 printk(KERN_WARNING "Can not alloc domain group for node %d\n", 6629 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
6576 num); 6630 num);
6577 return -ENOMEM; 6631 return -ENOMEM;
6578 } 6632 }
6579 d->sched_group_nodes[num] = sg; 6633 d->sched_group_nodes[num] = sg;
6580 6634
6581 for_each_cpu(j, d->nodemask) { 6635 for_each_cpu(j, d->nodemask) {
6582 sd = &per_cpu(node_domains, j).sd; 6636 sd = &per_cpu(node_domains, j).sd;
6583 sd->groups = sg; 6637 sd->groups = sg;
6584 } 6638 }
6585 6639
6586 sg->cpu_power = 0; 6640 sg->cpu_power = 0;
6587 cpumask_copy(sched_group_cpus(sg), d->nodemask); 6641 cpumask_copy(sched_group_cpus(sg), d->nodemask);
6588 sg->next = sg; 6642 sg->next = sg;
6589 cpumask_or(d->covered, d->covered, d->nodemask); 6643 cpumask_or(d->covered, d->covered, d->nodemask);
6590 6644
6591 prev = sg; 6645 prev = sg;
6592 for (j = 0; j < nr_node_ids; j++) { 6646 for (j = 0; j < nr_node_ids; j++) {
6593 n = (num + j) % nr_node_ids; 6647 n = (num + j) % nr_node_ids;
6594 cpumask_complement(d->notcovered, d->covered); 6648 cpumask_complement(d->notcovered, d->covered);
6595 cpumask_and(d->tmpmask, d->notcovered, cpu_map); 6649 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
6596 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); 6650 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
6597 if (cpumask_empty(d->tmpmask)) 6651 if (cpumask_empty(d->tmpmask))
6598 break; 6652 break;
6599 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); 6653 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
6600 if (cpumask_empty(d->tmpmask)) 6654 if (cpumask_empty(d->tmpmask))
6601 continue; 6655 continue;
6602 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 6656 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
6603 GFP_KERNEL, num); 6657 GFP_KERNEL, num);
6604 if (!sg) { 6658 if (!sg) {
6605 printk(KERN_WARNING 6659 printk(KERN_WARNING
6606 "Can not alloc domain group for node %d\n", j); 6660 "Can not alloc domain group for node %d\n", j);
6607 return -ENOMEM; 6661 return -ENOMEM;
6608 } 6662 }
6609 sg->cpu_power = 0; 6663 sg->cpu_power = 0;
6610 cpumask_copy(sched_group_cpus(sg), d->tmpmask); 6664 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
6611 sg->next = prev->next; 6665 sg->next = prev->next;
6612 cpumask_or(d->covered, d->covered, d->tmpmask); 6666 cpumask_or(d->covered, d->covered, d->tmpmask);
6613 prev->next = sg; 6667 prev->next = sg;
6614 prev = sg; 6668 prev = sg;
6615 } 6669 }
6616 out: 6670 out:
6617 return 0; 6671 return 0;
6618 } 6672 }
6619 #endif /* CONFIG_NUMA */ 6673 #endif /* CONFIG_NUMA */
6620 6674
6621 #ifdef CONFIG_NUMA 6675 #ifdef CONFIG_NUMA
6622 /* Free memory allocated for various sched_group structures */ 6676 /* Free memory allocated for various sched_group structures */
6623 static void free_sched_groups(const struct cpumask *cpu_map, 6677 static void free_sched_groups(const struct cpumask *cpu_map,
6624 struct cpumask *nodemask) 6678 struct cpumask *nodemask)
6625 { 6679 {
6626 int cpu, i; 6680 int cpu, i;
6627 6681
6628 for_each_cpu(cpu, cpu_map) { 6682 for_each_cpu(cpu, cpu_map) {
6629 struct sched_group **sched_group_nodes 6683 struct sched_group **sched_group_nodes
6630 = sched_group_nodes_bycpu[cpu]; 6684 = sched_group_nodes_bycpu[cpu];
6631 6685
6632 if (!sched_group_nodes) 6686 if (!sched_group_nodes)
6633 continue; 6687 continue;
6634 6688
6635 for (i = 0; i < nr_node_ids; i++) { 6689 for (i = 0; i < nr_node_ids; i++) {
6636 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 6690 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6637 6691
6638 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 6692 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
6639 if (cpumask_empty(nodemask)) 6693 if (cpumask_empty(nodemask))
6640 continue; 6694 continue;
6641 6695
6642 if (sg == NULL) 6696 if (sg == NULL)
6643 continue; 6697 continue;
6644 sg = sg->next; 6698 sg = sg->next;
6645 next_sg: 6699 next_sg:
6646 oldsg = sg; 6700 oldsg = sg;
6647 sg = sg->next; 6701 sg = sg->next;
6648 kfree(oldsg); 6702 kfree(oldsg);
6649 if (oldsg != sched_group_nodes[i]) 6703 if (oldsg != sched_group_nodes[i])
6650 goto next_sg; 6704 goto next_sg;
6651 } 6705 }
6652 kfree(sched_group_nodes); 6706 kfree(sched_group_nodes);
6653 sched_group_nodes_bycpu[cpu] = NULL; 6707 sched_group_nodes_bycpu[cpu] = NULL;
6654 } 6708 }
6655 } 6709 }
6656 #else /* !CONFIG_NUMA */ 6710 #else /* !CONFIG_NUMA */
6657 static void free_sched_groups(const struct cpumask *cpu_map, 6711 static void free_sched_groups(const struct cpumask *cpu_map,
6658 struct cpumask *nodemask) 6712 struct cpumask *nodemask)
6659 { 6713 {
6660 } 6714 }
6661 #endif /* CONFIG_NUMA */ 6715 #endif /* CONFIG_NUMA */
6662 6716
6663 /* 6717 /*
6664 * Initialize sched groups cpu_power. 6718 * Initialize sched groups cpu_power.
6665 * 6719 *
6666 * cpu_power indicates the capacity of a sched group, which is used while 6720 * cpu_power indicates the capacity of a sched group, which is used while
6667 * distributing the load between different sched groups in a sched domain. 6721 * distributing the load between different sched groups in a sched domain.
6668 * Typically cpu_power for all the groups in a sched domain will be the same unless 6722 * Typically cpu_power for all the groups in a sched domain will be the same unless
6669 * there are asymmetries in the topology. If there are asymmetries, the group 6723 * there are asymmetries in the topology. If there are asymmetries, the group
6670 * having more cpu_power will pick up more load compared to the group having 6724 * having more cpu_power will pick up more load compared to the group having
6671 * less cpu_power. 6725 * less cpu_power.
6672 */ 6726 */
6673 static void init_sched_groups_power(int cpu, struct sched_domain *sd) 6727 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6674 { 6728 {
6675 struct sched_domain *child; 6729 struct sched_domain *child;
6676 struct sched_group *group; 6730 struct sched_group *group;
6677 long power; 6731 long power;
6678 int weight; 6732 int weight;
6679 6733
6680 WARN_ON(!sd || !sd->groups); 6734 WARN_ON(!sd || !sd->groups);
6681 6735
6682 if (cpu != group_first_cpu(sd->groups)) 6736 if (cpu != group_first_cpu(sd->groups))
6683 return; 6737 return;
6684 6738
6685 child = sd->child; 6739 child = sd->child;
6686 6740
6687 sd->groups->cpu_power = 0; 6741 sd->groups->cpu_power = 0;
6688 6742
6689 if (!child) { 6743 if (!child) {
6690 power = SCHED_LOAD_SCALE; 6744 power = SCHED_LOAD_SCALE;
6691 weight = cpumask_weight(sched_domain_span(sd)); 6745 weight = cpumask_weight(sched_domain_span(sd));
6692 /* 6746 /*
6693 * SMT siblings share the power of a single core. 6747 * SMT siblings share the power of a single core.
6694 * Usually multiple threads get a better yield out of 6748 * Usually multiple threads get a better yield out of
6695 * that one core than a single thread would have, 6749 * that one core than a single thread would have,
6696 * reflect that in sd->smt_gain. 6750 * reflect that in sd->smt_gain.
6697 */ 6751 */
6698 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 6752 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
6699 power *= sd->smt_gain; 6753 power *= sd->smt_gain;
6700 power /= weight; 6754 power /= weight;
6701 power >>= SCHED_LOAD_SHIFT; 6755 power >>= SCHED_LOAD_SHIFT;
6702 } 6756 }
6703 sd->groups->cpu_power += power; 6757 sd->groups->cpu_power += power;
6704 return; 6758 return;
6705 } 6759 }
6706 6760
6707 /* 6761 /*
6708 * Add cpu_power of each child group to this groups cpu_power. 6762 * Add cpu_power of each child group to this groups cpu_power.
6709 */ 6763 */
6710 group = child->groups; 6764 group = child->groups;
6711 do { 6765 do {
6712 sd->groups->cpu_power += group->cpu_power; 6766 sd->groups->cpu_power += group->cpu_power;
6713 group = group->next; 6767 group = group->next;
6714 } while (group != child->groups); 6768 } while (group != child->groups);
6715 } 6769 }
6716 6770
6717 /* 6771 /*
6718 * Initializers for schedule domains 6772 * Initializers for schedule domains
6719 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 6773 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
6720 */ 6774 */
6721 6775
6722 #ifdef CONFIG_SCHED_DEBUG 6776 #ifdef CONFIG_SCHED_DEBUG
6723 # define SD_INIT_NAME(sd, type) sd->name = #type 6777 # define SD_INIT_NAME(sd, type) sd->name = #type
6724 #else 6778 #else
6725 # define SD_INIT_NAME(sd, type) do { } while (0) 6779 # define SD_INIT_NAME(sd, type) do { } while (0)
6726 #endif 6780 #endif
6727 6781
6728 #define SD_INIT(sd, type) sd_init_##type(sd) 6782 #define SD_INIT(sd, type) sd_init_##type(sd)
6729 6783
6730 #define SD_INIT_FUNC(type) \ 6784 #define SD_INIT_FUNC(type) \
6731 static noinline void sd_init_##type(struct sched_domain *sd) \ 6785 static noinline void sd_init_##type(struct sched_domain *sd) \
6732 { \ 6786 { \
6733 memset(sd, 0, sizeof(*sd)); \ 6787 memset(sd, 0, sizeof(*sd)); \
6734 *sd = SD_##type##_INIT; \ 6788 *sd = SD_##type##_INIT; \
6735 sd->level = SD_LV_##type; \ 6789 sd->level = SD_LV_##type; \
6736 SD_INIT_NAME(sd, type); \ 6790 SD_INIT_NAME(sd, type); \
6737 } 6791 }
6738 6792
6739 SD_INIT_FUNC(CPU) 6793 SD_INIT_FUNC(CPU)
6740 #ifdef CONFIG_NUMA 6794 #ifdef CONFIG_NUMA
6741 SD_INIT_FUNC(ALLNODES) 6795 SD_INIT_FUNC(ALLNODES)
6742 SD_INIT_FUNC(NODE) 6796 SD_INIT_FUNC(NODE)
6743 #endif 6797 #endif
6744 #ifdef CONFIG_SCHED_SMT 6798 #ifdef CONFIG_SCHED_SMT
6745 SD_INIT_FUNC(SIBLING) 6799 SD_INIT_FUNC(SIBLING)
6746 #endif 6800 #endif
6747 #ifdef CONFIG_SCHED_MC 6801 #ifdef CONFIG_SCHED_MC
6748 SD_INIT_FUNC(MC) 6802 SD_INIT_FUNC(MC)
6749 #endif 6803 #endif
6750 6804
6751 static int default_relax_domain_level = -1; 6805 static int default_relax_domain_level = -1;
6752 6806
6753 static int __init setup_relax_domain_level(char *str) 6807 static int __init setup_relax_domain_level(char *str)
6754 { 6808 {
6755 unsigned long val; 6809 unsigned long val;
6756 6810
6757 val = simple_strtoul(str, NULL, 0); 6811 val = simple_strtoul(str, NULL, 0);
6758 if (val < SD_LV_MAX) 6812 if (val < SD_LV_MAX)
6759 default_relax_domain_level = val; 6813 default_relax_domain_level = val;
6760 6814
6761 return 1; 6815 return 1;
6762 } 6816 }
6763 __setup("relax_domain_level=", setup_relax_domain_level); 6817 __setup("relax_domain_level=", setup_relax_domain_level);
6764 6818
6765 static void set_domain_attribute(struct sched_domain *sd, 6819 static void set_domain_attribute(struct sched_domain *sd,
6766 struct sched_domain_attr *attr) 6820 struct sched_domain_attr *attr)
6767 { 6821 {
6768 int request; 6822 int request;
6769 6823
6770 if (!attr || attr->relax_domain_level < 0) { 6824 if (!attr || attr->relax_domain_level < 0) {
6771 if (default_relax_domain_level < 0) 6825 if (default_relax_domain_level < 0)
6772 return; 6826 return;
6773 else 6827 else
6774 request = default_relax_domain_level; 6828 request = default_relax_domain_level;
6775 } else 6829 } else
6776 request = attr->relax_domain_level; 6830 request = attr->relax_domain_level;
6777 if (request < sd->level) { 6831 if (request < sd->level) {
6778 /* turn off idle balance on this domain */ 6832 /* turn off idle balance on this domain */
6779 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 6833 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6780 } else { 6834 } else {
6781 /* turn on idle balance on this domain */ 6835 /* turn on idle balance on this domain */
6782 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 6836 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
6783 } 6837 }
6784 } 6838 }
6785 6839
6786 static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 6840 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6787 const struct cpumask *cpu_map) 6841 const struct cpumask *cpu_map)
6788 { 6842 {
6789 switch (what) { 6843 switch (what) {
6790 case sa_sched_groups: 6844 case sa_sched_groups:
6791 free_sched_groups(cpu_map, d->tmpmask); /* fall through */ 6845 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
6792 d->sched_group_nodes = NULL; 6846 d->sched_group_nodes = NULL;
6793 case sa_rootdomain: 6847 case sa_rootdomain:
6794 free_rootdomain(d->rd); /* fall through */ 6848 free_rootdomain(d->rd); /* fall through */
6795 case sa_tmpmask: 6849 case sa_tmpmask:
6796 free_cpumask_var(d->tmpmask); /* fall through */ 6850 free_cpumask_var(d->tmpmask); /* fall through */
6797 case sa_send_covered: 6851 case sa_send_covered:
6798 free_cpumask_var(d->send_covered); /* fall through */ 6852 free_cpumask_var(d->send_covered); /* fall through */
6799 case sa_this_core_map: 6853 case sa_this_core_map:
6800 free_cpumask_var(d->this_core_map); /* fall through */ 6854 free_cpumask_var(d->this_core_map); /* fall through */
6801 case sa_this_sibling_map: 6855 case sa_this_sibling_map:
6802 free_cpumask_var(d->this_sibling_map); /* fall through */ 6856 free_cpumask_var(d->this_sibling_map); /* fall through */
6803 case sa_nodemask: 6857 case sa_nodemask:
6804 free_cpumask_var(d->nodemask); /* fall through */ 6858 free_cpumask_var(d->nodemask); /* fall through */
6805 case sa_sched_group_nodes: 6859 case sa_sched_group_nodes:
6806 #ifdef CONFIG_NUMA 6860 #ifdef CONFIG_NUMA
6807 kfree(d->sched_group_nodes); /* fall through */ 6861 kfree(d->sched_group_nodes); /* fall through */
6808 case sa_notcovered: 6862 case sa_notcovered:
6809 free_cpumask_var(d->notcovered); /* fall through */ 6863 free_cpumask_var(d->notcovered); /* fall through */
6810 case sa_covered: 6864 case sa_covered:
6811 free_cpumask_var(d->covered); /* fall through */ 6865 free_cpumask_var(d->covered); /* fall through */
6812 case sa_domainspan: 6866 case sa_domainspan:
6813 free_cpumask_var(d->domainspan); /* fall through */ 6867 free_cpumask_var(d->domainspan); /* fall through */
6814 #endif 6868 #endif
6815 case sa_none: 6869 case sa_none:
6816 break; 6870 break;
6817 } 6871 }
6818 } 6872 }
6819 6873
6820 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 6874 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
6821 const struct cpumask *cpu_map) 6875 const struct cpumask *cpu_map)
6822 { 6876 {
6823 #ifdef CONFIG_NUMA 6877 #ifdef CONFIG_NUMA
6824 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 6878 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
6825 return sa_none; 6879 return sa_none;
6826 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 6880 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
6827 return sa_domainspan; 6881 return sa_domainspan;
6828 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 6882 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
6829 return sa_covered; 6883 return sa_covered;
6830 /* Allocate the per-node list of sched groups */ 6884 /* Allocate the per-node list of sched groups */
6831 d->sched_group_nodes = kcalloc(nr_node_ids, 6885 d->sched_group_nodes = kcalloc(nr_node_ids,
6832 sizeof(struct sched_group *), GFP_KERNEL); 6886 sizeof(struct sched_group *), GFP_KERNEL);
6833 if (!d->sched_group_nodes) { 6887 if (!d->sched_group_nodes) {
6834 printk(KERN_WARNING "Can not alloc sched group node list\n"); 6888 printk(KERN_WARNING "Can not alloc sched group node list\n");
6835 return sa_notcovered; 6889 return sa_notcovered;
6836 } 6890 }
6837 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; 6891 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
6838 #endif 6892 #endif
6839 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) 6893 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
6840 return sa_sched_group_nodes; 6894 return sa_sched_group_nodes;
6841 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) 6895 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
6842 return sa_nodemask; 6896 return sa_nodemask;
6843 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) 6897 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
6844 return sa_this_sibling_map; 6898 return sa_this_sibling_map;
6845 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) 6899 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
6846 return sa_this_core_map; 6900 return sa_this_core_map;
6847 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) 6901 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
6848 return sa_send_covered; 6902 return sa_send_covered;
6849 d->rd = alloc_rootdomain(); 6903 d->rd = alloc_rootdomain();
6850 if (!d->rd) { 6904 if (!d->rd) {
6851 printk(KERN_WARNING "Cannot alloc root domain\n"); 6905 printk(KERN_WARNING "Cannot alloc root domain\n");
6852 return sa_tmpmask; 6906 return sa_tmpmask;
6853 } 6907 }
6854 return sa_rootdomain; 6908 return sa_rootdomain;
6855 } 6909 }
6856 6910
6857 static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 6911 static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
6858 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 6912 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
6859 { 6913 {
6860 struct sched_domain *sd = NULL; 6914 struct sched_domain *sd = NULL;
6861 #ifdef CONFIG_NUMA 6915 #ifdef CONFIG_NUMA
6862 struct sched_domain *parent; 6916 struct sched_domain *parent;
6863 6917
6864 d->sd_allnodes = 0; 6918 d->sd_allnodes = 0;
6865 if (cpumask_weight(cpu_map) > 6919 if (cpumask_weight(cpu_map) >
6866 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { 6920 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
6867 sd = &per_cpu(allnodes_domains, i).sd; 6921 sd = &per_cpu(allnodes_domains, i).sd;
6868 SD_INIT(sd, ALLNODES); 6922 SD_INIT(sd, ALLNODES);
6869 set_domain_attribute(sd, attr); 6923 set_domain_attribute(sd, attr);
6870 cpumask_copy(sched_domain_span(sd), cpu_map); 6924 cpumask_copy(sched_domain_span(sd), cpu_map);
6871 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); 6925 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
6872 d->sd_allnodes = 1; 6926 d->sd_allnodes = 1;
6873 } 6927 }
6874 parent = sd; 6928 parent = sd;
6875 6929
6876 sd = &per_cpu(node_domains, i).sd; 6930 sd = &per_cpu(node_domains, i).sd;
6877 SD_INIT(sd, NODE); 6931 SD_INIT(sd, NODE);
6878 set_domain_attribute(sd, attr); 6932 set_domain_attribute(sd, attr);
6879 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); 6933 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
6880 sd->parent = parent; 6934 sd->parent = parent;
6881 if (parent) 6935 if (parent)
6882 parent->child = sd; 6936 parent->child = sd;
6883 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); 6937 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
6884 #endif 6938 #endif
6885 return sd; 6939 return sd;
6886 } 6940 }
6887 6941
6888 static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 6942 static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
6889 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6943 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6890 struct sched_domain *parent, int i) 6944 struct sched_domain *parent, int i)
6891 { 6945 {
6892 struct sched_domain *sd; 6946 struct sched_domain *sd;
6893 sd = &per_cpu(phys_domains, i).sd; 6947 sd = &per_cpu(phys_domains, i).sd;
6894 SD_INIT(sd, CPU); 6948 SD_INIT(sd, CPU);
6895 set_domain_attribute(sd, attr); 6949 set_domain_attribute(sd, attr);
6896 cpumask_copy(sched_domain_span(sd), d->nodemask); 6950 cpumask_copy(sched_domain_span(sd), d->nodemask);
6897 sd->parent = parent; 6951 sd->parent = parent;
6898 if (parent) 6952 if (parent)
6899 parent->child = sd; 6953 parent->child = sd;
6900 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); 6954 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
6901 return sd; 6955 return sd;
6902 } 6956 }
6903 6957
6904 static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 6958 static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
6905 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6959 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6906 struct sched_domain *parent, int i) 6960 struct sched_domain *parent, int i)
6907 { 6961 {
6908 struct sched_domain *sd = parent; 6962 struct sched_domain *sd = parent;
6909 #ifdef CONFIG_SCHED_MC 6963 #ifdef CONFIG_SCHED_MC
6910 sd = &per_cpu(core_domains, i).sd; 6964 sd = &per_cpu(core_domains, i).sd;
6911 SD_INIT(sd, MC); 6965 SD_INIT(sd, MC);
6912 set_domain_attribute(sd, attr); 6966 set_domain_attribute(sd, attr);
6913 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); 6967 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
6914 sd->parent = parent; 6968 sd->parent = parent;
6915 parent->child = sd; 6969 parent->child = sd;
6916 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); 6970 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
6917 #endif 6971 #endif
6918 return sd; 6972 return sd;
6919 } 6973 }
6920 6974
6921 static struct sched_domain *__build_smt_sched_domain(struct s_data *d, 6975 static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
6922 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6976 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6923 struct sched_domain *parent, int i) 6977 struct sched_domain *parent, int i)
6924 { 6978 {
6925 struct sched_domain *sd = parent; 6979 struct sched_domain *sd = parent;
6926 #ifdef CONFIG_SCHED_SMT 6980 #ifdef CONFIG_SCHED_SMT
6927 sd = &per_cpu(cpu_domains, i).sd; 6981 sd = &per_cpu(cpu_domains, i).sd;
6928 SD_INIT(sd, SIBLING); 6982 SD_INIT(sd, SIBLING);
6929 set_domain_attribute(sd, attr); 6983 set_domain_attribute(sd, attr);
6930 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); 6984 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
6931 sd->parent = parent; 6985 sd->parent = parent;
6932 parent->child = sd; 6986 parent->child = sd;
6933 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); 6987 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
6934 #endif 6988 #endif
6935 return sd; 6989 return sd;
6936 } 6990 }
6937 6991
6938 static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 6992 static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
6939 const struct cpumask *cpu_map, int cpu) 6993 const struct cpumask *cpu_map, int cpu)
6940 { 6994 {
6941 switch (l) { 6995 switch (l) {
6942 #ifdef CONFIG_SCHED_SMT 6996 #ifdef CONFIG_SCHED_SMT
6943 case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 6997 case SD_LV_SIBLING: /* set up CPU (sibling) groups */
6944 cpumask_and(d->this_sibling_map, cpu_map, 6998 cpumask_and(d->this_sibling_map, cpu_map,
6945 topology_thread_cpumask(cpu)); 6999 topology_thread_cpumask(cpu));
6946 if (cpu == cpumask_first(d->this_sibling_map)) 7000 if (cpu == cpumask_first(d->this_sibling_map))
6947 init_sched_build_groups(d->this_sibling_map, cpu_map, 7001 init_sched_build_groups(d->this_sibling_map, cpu_map,
6948 &cpu_to_cpu_group, 7002 &cpu_to_cpu_group,
6949 d->send_covered, d->tmpmask); 7003 d->send_covered, d->tmpmask);
6950 break; 7004 break;
6951 #endif 7005 #endif
6952 #ifdef CONFIG_SCHED_MC 7006 #ifdef CONFIG_SCHED_MC
6953 case SD_LV_MC: /* set up multi-core groups */ 7007 case SD_LV_MC: /* set up multi-core groups */
6954 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); 7008 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
6955 if (cpu == cpumask_first(d->this_core_map)) 7009 if (cpu == cpumask_first(d->this_core_map))
6956 init_sched_build_groups(d->this_core_map, cpu_map, 7010 init_sched_build_groups(d->this_core_map, cpu_map,
6957 &cpu_to_core_group, 7011 &cpu_to_core_group,
6958 d->send_covered, d->tmpmask); 7012 d->send_covered, d->tmpmask);
6959 break; 7013 break;
6960 #endif 7014 #endif
6961 case SD_LV_CPU: /* set up physical groups */ 7015 case SD_LV_CPU: /* set up physical groups */
6962 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); 7016 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
6963 if (!cpumask_empty(d->nodemask)) 7017 if (!cpumask_empty(d->nodemask))
6964 init_sched_build_groups(d->nodemask, cpu_map, 7018 init_sched_build_groups(d->nodemask, cpu_map,
6965 &cpu_to_phys_group, 7019 &cpu_to_phys_group,
6966 d->send_covered, d->tmpmask); 7020 d->send_covered, d->tmpmask);
6967 break; 7021 break;
6968 #ifdef CONFIG_NUMA 7022 #ifdef CONFIG_NUMA
6969 case SD_LV_ALLNODES: 7023 case SD_LV_ALLNODES:
6970 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 7024 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
6971 d->send_covered, d->tmpmask); 7025 d->send_covered, d->tmpmask);
6972 break; 7026 break;
6973 #endif 7027 #endif
6974 default: 7028 default:
6975 break; 7029 break;
6976 } 7030 }
6977 } 7031 }
6978 7032
6979 /* 7033 /*
6980 * Build sched domains for a given set of cpus and attach the sched domains 7034 * Build sched domains for a given set of cpus and attach the sched domains
6981 * to the individual cpus 7035 * to the individual cpus
6982 */ 7036 */
6983 static int __build_sched_domains(const struct cpumask *cpu_map, 7037 static int __build_sched_domains(const struct cpumask *cpu_map,
6984 struct sched_domain_attr *attr) 7038 struct sched_domain_attr *attr)
6985 { 7039 {
6986 enum s_alloc alloc_state = sa_none; 7040 enum s_alloc alloc_state = sa_none;
6987 struct s_data d; 7041 struct s_data d;
6988 struct sched_domain *sd; 7042 struct sched_domain *sd;
6989 int i; 7043 int i;
6990 #ifdef CONFIG_NUMA 7044 #ifdef CONFIG_NUMA
6991 d.sd_allnodes = 0; 7045 d.sd_allnodes = 0;
6992 #endif 7046 #endif
6993 7047
6994 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7048 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
6995 if (alloc_state != sa_rootdomain) 7049 if (alloc_state != sa_rootdomain)
6996 goto error; 7050 goto error;
6997 alloc_state = sa_sched_groups; 7051 alloc_state = sa_sched_groups;
6998 7052
6999 /* 7053 /*
7000 * Set up domains for cpus specified by the cpu_map. 7054 * Set up domains for cpus specified by the cpu_map.
7001 */ 7055 */
7002 for_each_cpu(i, cpu_map) { 7056 for_each_cpu(i, cpu_map) {
7003 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), 7057 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
7004 cpu_map); 7058 cpu_map);
7005 7059
7006 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7060 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
7007 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7061 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
7008 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); 7062 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7009 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); 7063 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7010 } 7064 }
7011 7065
7012 for_each_cpu(i, cpu_map) { 7066 for_each_cpu(i, cpu_map) {
7013 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 7067 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7014 build_sched_groups(&d, SD_LV_MC, cpu_map, i); 7068 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
7015 } 7069 }
7016 7070
7017 /* Set up physical groups */ 7071 /* Set up physical groups */
7018 for (i = 0; i < nr_node_ids; i++) 7072 for (i = 0; i < nr_node_ids; i++)
7019 build_sched_groups(&d, SD_LV_CPU, cpu_map, i); 7073 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
7020 7074
7021 #ifdef CONFIG_NUMA 7075 #ifdef CONFIG_NUMA
7022 /* Set up node groups */ 7076 /* Set up node groups */
7023 if (d.sd_allnodes) 7077 if (d.sd_allnodes)
7024 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); 7078 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
7025 7079
7026 for (i = 0; i < nr_node_ids; i++) 7080 for (i = 0; i < nr_node_ids; i++)
7027 if (build_numa_sched_groups(&d, cpu_map, i)) 7081 if (build_numa_sched_groups(&d, cpu_map, i))
7028 goto error; 7082 goto error;
7029 #endif 7083 #endif
7030 7084
7031 /* Calculate CPU power for physical packages and nodes */ 7085 /* Calculate CPU power for physical packages and nodes */
7032 #ifdef CONFIG_SCHED_SMT 7086 #ifdef CONFIG_SCHED_SMT
7033 for_each_cpu(i, cpu_map) { 7087 for_each_cpu(i, cpu_map) {
7034 sd = &per_cpu(cpu_domains, i).sd; 7088 sd = &per_cpu(cpu_domains, i).sd;
7035 init_sched_groups_power(i, sd); 7089 init_sched_groups_power(i, sd);
7036 } 7090 }
7037 #endif 7091 #endif
7038 #ifdef CONFIG_SCHED_MC 7092 #ifdef CONFIG_SCHED_MC
7039 for_each_cpu(i, cpu_map) { 7093 for_each_cpu(i, cpu_map) {
7040 sd = &per_cpu(core_domains, i).sd; 7094 sd = &per_cpu(core_domains, i).sd;
7041 init_sched_groups_power(i, sd); 7095 init_sched_groups_power(i, sd);
7042 } 7096 }
7043 #endif 7097 #endif
7044 7098
7045 for_each_cpu(i, cpu_map) { 7099 for_each_cpu(i, cpu_map) {
7046 sd = &per_cpu(phys_domains, i).sd; 7100 sd = &per_cpu(phys_domains, i).sd;
7047 init_sched_groups_power(i, sd); 7101 init_sched_groups_power(i, sd);
7048 } 7102 }
7049 7103
7050 #ifdef CONFIG_NUMA 7104 #ifdef CONFIG_NUMA
7051 for (i = 0; i < nr_node_ids; i++) 7105 for (i = 0; i < nr_node_ids; i++)
7052 init_numa_sched_groups_power(d.sched_group_nodes[i]); 7106 init_numa_sched_groups_power(d.sched_group_nodes[i]);
7053 7107
7054 if (d.sd_allnodes) { 7108 if (d.sd_allnodes) {
7055 struct sched_group *sg; 7109 struct sched_group *sg;
7056 7110
7057 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 7111 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
7058 d.tmpmask); 7112 d.tmpmask);
7059 init_numa_sched_groups_power(sg); 7113 init_numa_sched_groups_power(sg);
7060 } 7114 }
7061 #endif 7115 #endif
7062 7116
7063 /* Attach the domains */ 7117 /* Attach the domains */
7064 for_each_cpu(i, cpu_map) { 7118 for_each_cpu(i, cpu_map) {
7065 #ifdef CONFIG_SCHED_SMT 7119 #ifdef CONFIG_SCHED_SMT
7066 sd = &per_cpu(cpu_domains, i).sd; 7120 sd = &per_cpu(cpu_domains, i).sd;
7067 #elif defined(CONFIG_SCHED_MC) 7121 #elif defined(CONFIG_SCHED_MC)
7068 sd = &per_cpu(core_domains, i).sd; 7122 sd = &per_cpu(core_domains, i).sd;
7069 #else 7123 #else
7070 sd = &per_cpu(phys_domains, i).sd; 7124 sd = &per_cpu(phys_domains, i).sd;
7071 #endif 7125 #endif
7072 cpu_attach_domain(sd, d.rd, i); 7126 cpu_attach_domain(sd, d.rd, i);
7073 } 7127 }
7074 7128
7075 d.sched_group_nodes = NULL; /* don't free this we still need it */ 7129 d.sched_group_nodes = NULL; /* don't free this we still need it */
7076 __free_domain_allocs(&d, sa_tmpmask, cpu_map); 7130 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
7077 return 0; 7131 return 0;
7078 7132
7079 error: 7133 error:
7080 __free_domain_allocs(&d, alloc_state, cpu_map); 7134 __free_domain_allocs(&d, alloc_state, cpu_map);
7081 return -ENOMEM; 7135 return -ENOMEM;
7082 } 7136 }
7083 7137
7084 static int build_sched_domains(const struct cpumask *cpu_map) 7138 static int build_sched_domains(const struct cpumask *cpu_map)
7085 { 7139 {
7086 return __build_sched_domains(cpu_map, NULL); 7140 return __build_sched_domains(cpu_map, NULL);
7087 } 7141 }
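The per-CPU loop above builds one sched_domain per topology level and chains them child to parent, so a CPU's lowest-level domain (SMT, if configured) sits below MC, which sits below the physical/CPU level and, on NUMA machines, below the node and allnodes levels. As a minimal sketch of inspecting the finished hierarchy, assuming CONFIG_SCHED_DEBUG so that sd->name is populated and that the domains are already attached:

static void print_domain_hierarchy(int cpu)
{
	struct sched_domain *sd;

	/* walk from the lowest attached level up through the ->parent chain */
	for_each_domain(cpu, sd)
		printk(KERN_DEBUG "cpu%d: %s spans %u cpus\n",
		       cpu, sd->name, cpumask_weight(sched_domain_span(sd)));
}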
7088 7142
7089 static cpumask_var_t *doms_cur; /* current sched domains */ 7143 static cpumask_var_t *doms_cur; /* current sched domains */
7090 static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 7144 static int ndoms_cur; /* number of sched domains in 'doms_cur' */
7091 static struct sched_domain_attr *dattr_cur; 7145 static struct sched_domain_attr *dattr_cur;
7092 /* attributes of custom domains in 'doms_cur' */ 7146 /* attributes of custom domains in 'doms_cur' */
7093 7147
7094 /* 7148 /*
7095 * Special case: If a kmalloc of a doms_cur partition (array of 7149 * Special case: If a kmalloc of a doms_cur partition (array of
7096 * cpumask) fails, then fallback to a single sched domain, 7150 * cpumask) fails, then fallback to a single sched domain,
7097 * as determined by the single cpumask fallback_doms. 7151 * as determined by the single cpumask fallback_doms.
7098 */ 7152 */
7099 static cpumask_var_t fallback_doms; 7153 static cpumask_var_t fallback_doms;
7100 7154
7101 /* 7155 /*
7102 * arch_update_cpu_topology lets virtualized architectures update the 7156 * arch_update_cpu_topology lets virtualized architectures update the
7103 * cpu core maps. It is supposed to return 1 if the topology changed 7157 * cpu core maps. It is supposed to return 1 if the topology changed
7104 * or 0 if it stayed the same. 7158 * or 0 if it stayed the same.
7105 */ 7159 */
7106 int __attribute__((weak)) arch_update_cpu_topology(void) 7160 int __attribute__((weak)) arch_update_cpu_topology(void)
7107 { 7161 {
7108 return 0; 7162 return 0;
7109 } 7163 }
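The weak definition above is only a default; an architecture that can change its topology at run time (s390 does this for virtualized CPU maps) supplies a strong definition that refreshes the core maps and returns 1 so the caller knows to rebuild the domains. A hedged sketch of such an override, where arch_topology_changed() and arch_refresh_core_maps() are made-up stand-ins for the platform-specific work:

int arch_update_cpu_topology(void)
{
	/* hypothetical platform hook: did the hypervisor move any cpus? */
	if (!arch_topology_changed())
		return 0;

	/* hypothetical: rebuild cpu_core_map and friends */
	arch_refresh_core_maps();

	return 1;	/* topology changed: sched domains must be rebuilt */
}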
7110 7164
7111 cpumask_var_t *alloc_sched_domains(unsigned int ndoms) 7165 cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
7112 { 7166 {
7113 int i; 7167 int i;
7114 cpumask_var_t *doms; 7168 cpumask_var_t *doms;
7115 7169
7116 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); 7170 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
7117 if (!doms) 7171 if (!doms)
7118 return NULL; 7172 return NULL;
7119 for (i = 0; i < ndoms; i++) { 7173 for (i = 0; i < ndoms; i++) {
7120 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { 7174 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
7121 free_sched_domains(doms, i); 7175 free_sched_domains(doms, i);
7122 return NULL; 7176 return NULL;
7123 } 7177 }
7124 } 7178 }
7125 return doms; 7179 return doms;
7126 } 7180 }
7127 7181
7128 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) 7182 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7129 { 7183 {
7130 unsigned int i; 7184 unsigned int i;
7131 for (i = 0; i < ndoms; i++) 7185 for (i = 0; i < ndoms; i++)
7132 free_cpumask_var(doms[i]); 7186 free_cpumask_var(doms[i]);
7133 kfree(doms); 7187 kfree(doms);
7134 } 7188 }
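alloc_sched_domains() either returns a fully usable array or NULL: if any per-entry cpumask allocation fails it frees what it had already allocated before giving up, so callers never see a half-initialized array. A minimal usage sketch of the alloc/free pairing:

static int example_two_partitions(void)
{
	cpumask_var_t *doms;
	int i;

	doms = alloc_sched_domains(2);	/* NULL if any allocation failed */
	if (!doms)
		return -ENOMEM;

	for (i = 0; i < 2; i++)
		cpumask_clear(doms[i]);	/* the masks start uninitialized */

	/* ... fill in two non-overlapping masks and use them ... */

	free_sched_domains(doms, 2);	/* frees each mask, then the array */
	return 0;
}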
7135 7189
7136 /* 7190 /*
7137 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 7191 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
7138 * For now this just excludes isolated cpus, but could be used to 7192 * For now this just excludes isolated cpus, but could be used to
7139 * exclude other special cases in the future. 7193 * exclude other special cases in the future.
7140 */ 7194 */
7141 static int arch_init_sched_domains(const struct cpumask *cpu_map) 7195 static int arch_init_sched_domains(const struct cpumask *cpu_map)
7142 { 7196 {
7143 int err; 7197 int err;
7144 7198
7145 arch_update_cpu_topology(); 7199 arch_update_cpu_topology();
7146 ndoms_cur = 1; 7200 ndoms_cur = 1;
7147 doms_cur = alloc_sched_domains(ndoms_cur); 7201 doms_cur = alloc_sched_domains(ndoms_cur);
7148 if (!doms_cur) 7202 if (!doms_cur)
7149 doms_cur = &fallback_doms; 7203 doms_cur = &fallback_doms;
7150 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7204 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7151 dattr_cur = NULL; 7205 dattr_cur = NULL;
7152 err = build_sched_domains(doms_cur[0]); 7206 err = build_sched_domains(doms_cur[0]);
7153 register_sched_domain_sysctl(); 7207 register_sched_domain_sysctl();
7154 7208
7155 return err; 7209 return err;
7156 } 7210 }
7157 7211
7158 static void arch_destroy_sched_domains(const struct cpumask *cpu_map, 7212 static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7159 struct cpumask *tmpmask) 7213 struct cpumask *tmpmask)
7160 { 7214 {
7161 free_sched_groups(cpu_map, tmpmask); 7215 free_sched_groups(cpu_map, tmpmask);
7162 } 7216 }
7163 7217
7164 /* 7218 /*
7165 * Detach sched domains from a group of cpus specified in cpu_map 7219 * Detach sched domains from a group of cpus specified in cpu_map
7166 * These cpus will now be attached to the NULL domain 7220 * These cpus will now be attached to the NULL domain
7167 */ 7221 */
7168 static void detach_destroy_domains(const struct cpumask *cpu_map) 7222 static void detach_destroy_domains(const struct cpumask *cpu_map)
7169 { 7223 {
7170 /* Save because hotplug lock held. */ 7224 /* Save because hotplug lock held. */
7171 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); 7225 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7172 int i; 7226 int i;
7173 7227
7174 for_each_cpu(i, cpu_map) 7228 for_each_cpu(i, cpu_map)
7175 cpu_attach_domain(NULL, &def_root_domain, i); 7229 cpu_attach_domain(NULL, &def_root_domain, i);
7176 synchronize_sched(); 7230 synchronize_sched();
7177 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); 7231 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7178 } 7232 }
7179 7233
7180 /* handle null as "default" */ 7234 /* handle null as "default" */
7181 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 7235 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7182 struct sched_domain_attr *new, int idx_new) 7236 struct sched_domain_attr *new, int idx_new)
7183 { 7237 {
7184 struct sched_domain_attr tmp; 7238 struct sched_domain_attr tmp;
7185 7239
7186 /* fast path */ 7240 /* fast path */
7187 if (!new && !cur) 7241 if (!new && !cur)
7188 return 1; 7242 return 1;
7189 7243
7190 tmp = SD_ATTR_INIT; 7244 tmp = SD_ATTR_INIT;
7191 return !memcmp(cur ? (cur + idx_cur) : &tmp, 7245 return !memcmp(cur ? (cur + idx_cur) : &tmp,
7192 new ? (new + idx_new) : &tmp, 7246 new ? (new + idx_new) : &tmp,
7193 sizeof(struct sched_domain_attr)); 7247 sizeof(struct sched_domain_attr));
7194 } 7248 }
7195 7249
7196 /* 7250 /*
7197 * Partition sched domains as specified by the 'ndoms_new' 7251 * Partition sched domains as specified by the 'ndoms_new'
7198 * cpumasks in the array doms_new[] of cpumasks. This compares 7252 * cpumasks in the array doms_new[] of cpumasks. This compares
7199 * doms_new[] to the current sched domain partitioning, doms_cur[]. 7253 * doms_new[] to the current sched domain partitioning, doms_cur[].
7200 * It destroys each deleted domain and builds each new domain. 7254 * It destroys each deleted domain and builds each new domain.
7201 * 7255 *
7202 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. 7256 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
7203 * The masks don't intersect (don't overlap). We should set up one 7257 * The masks don't intersect (don't overlap). We should set up one
7204 * sched domain for each mask. CPUs not in any of the cpumasks will 7258 * sched domain for each mask. CPUs not in any of the cpumasks will
7205 * not be load balanced. If the same cpumask appears both in the 7259 * not be load balanced. If the same cpumask appears both in the
7206 * current 'doms_cur' domains and in the new 'doms_new', we can leave 7260 * current 'doms_cur' domains and in the new 'doms_new', we can leave
7207 * it as it is. 7261 * it as it is.
7208 * 7262 *
7209 * The passed in 'doms_new' should be allocated using 7263 * The passed in 'doms_new' should be allocated using
7210 * alloc_sched_domains. This routine takes ownership of it and will 7264 * alloc_sched_domains. This routine takes ownership of it and will
7211 * free_sched_domains it when done with it. If the caller failed the 7265 * free_sched_domains it when done with it. If the caller failed the
7212 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, 7266 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
7213 * and partition_sched_domains() will fallback to the single partition 7267 * and partition_sched_domains() will fallback to the single partition
7214 * 'fallback_doms'; it also forces the domains to be rebuilt. 7268 * 'fallback_doms'; it also forces the domains to be rebuilt.
7215 * 7269 *
7216 * If doms_new == NULL it will be replaced with cpu_online_mask. 7270 * If doms_new == NULL it will be replaced with cpu_online_mask.
7217 * ndoms_new == 0 is a special case for destroying existing domains, 7271 * ndoms_new == 0 is a special case for destroying existing domains,
7218 * and it will not create the default domain. 7272 * and it will not create the default domain.
7219 * 7273 *
7220 * Call with hotplug lock held 7274 * Call with hotplug lock held
7221 */ 7275 */
7222 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 7276 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
7223 struct sched_domain_attr *dattr_new) 7277 struct sched_domain_attr *dattr_new)
7224 { 7278 {
7225 int i, j, n; 7279 int i, j, n;
7226 int new_topology; 7280 int new_topology;
7227 7281
7228 mutex_lock(&sched_domains_mutex); 7282 mutex_lock(&sched_domains_mutex);
7229 7283
7230 /* always unregister in case we don't destroy any domains */ 7284 /* always unregister in case we don't destroy any domains */
7231 unregister_sched_domain_sysctl(); 7285 unregister_sched_domain_sysctl();
7232 7286
7233 /* Let architecture update cpu core mappings. */ 7287 /* Let architecture update cpu core mappings. */
7234 new_topology = arch_update_cpu_topology(); 7288 new_topology = arch_update_cpu_topology();
7235 7289
7236 n = doms_new ? ndoms_new : 0; 7290 n = doms_new ? ndoms_new : 0;
7237 7291
7238 /* Destroy deleted domains */ 7292 /* Destroy deleted domains */
7239 for (i = 0; i < ndoms_cur; i++) { 7293 for (i = 0; i < ndoms_cur; i++) {
7240 for (j = 0; j < n && !new_topology; j++) { 7294 for (j = 0; j < n && !new_topology; j++) {
7241 if (cpumask_equal(doms_cur[i], doms_new[j]) 7295 if (cpumask_equal(doms_cur[i], doms_new[j])
7242 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7296 && dattrs_equal(dattr_cur, i, dattr_new, j))
7243 goto match1; 7297 goto match1;
7244 } 7298 }
7245 /* no match - a current sched domain not in new doms_new[] */ 7299 /* no match - a current sched domain not in new doms_new[] */
7246 detach_destroy_domains(doms_cur[i]); 7300 detach_destroy_domains(doms_cur[i]);
7247 match1: 7301 match1:
7248 ; 7302 ;
7249 } 7303 }
7250 7304
7251 if (doms_new == NULL) { 7305 if (doms_new == NULL) {
7252 ndoms_cur = 0; 7306 ndoms_cur = 0;
7253 doms_new = &fallback_doms; 7307 doms_new = &fallback_doms;
7254 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); 7308 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
7255 WARN_ON_ONCE(dattr_new); 7309 WARN_ON_ONCE(dattr_new);
7256 } 7310 }
7257 7311
7258 /* Build new domains */ 7312 /* Build new domains */
7259 for (i = 0; i < ndoms_new; i++) { 7313 for (i = 0; i < ndoms_new; i++) {
7260 for (j = 0; j < ndoms_cur && !new_topology; j++) { 7314 for (j = 0; j < ndoms_cur && !new_topology; j++) {
7261 if (cpumask_equal(doms_new[i], doms_cur[j]) 7315 if (cpumask_equal(doms_new[i], doms_cur[j])
7262 && dattrs_equal(dattr_new, i, dattr_cur, j)) 7316 && dattrs_equal(dattr_new, i, dattr_cur, j))
7263 goto match2; 7317 goto match2;
7264 } 7318 }
7265 /* no match - add a new doms_new */ 7319 /* no match - add a new doms_new */
7266 __build_sched_domains(doms_new[i], 7320 __build_sched_domains(doms_new[i],
7267 dattr_new ? dattr_new + i : NULL); 7321 dattr_new ? dattr_new + i : NULL);
7268 match2: 7322 match2:
7269 ; 7323 ;
7270 } 7324 }
7271 7325
7272 /* Remember the new sched domains */ 7326 /* Remember the new sched domains */
7273 if (doms_cur != &fallback_doms) 7327 if (doms_cur != &fallback_doms)
7274 free_sched_domains(doms_cur, ndoms_cur); 7328 free_sched_domains(doms_cur, ndoms_cur);
7275 kfree(dattr_cur); /* kfree(NULL) is safe */ 7329 kfree(dattr_cur); /* kfree(NULL) is safe */
7276 doms_cur = doms_new; 7330 doms_cur = doms_new;
7277 dattr_cur = dattr_new; 7331 dattr_cur = dattr_new;
7278 ndoms_cur = ndoms_new; 7332 ndoms_cur = ndoms_new;
7279 7333
7280 register_sched_domain_sysctl(); 7334 register_sched_domain_sysctl();
7281 7335
7282 mutex_unlock(&sched_domains_mutex); 7336 mutex_unlock(&sched_domains_mutex);
7283 } 7337 }
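Tying the pieces together: the usual caller (the cpusets code in mainline) allocates the partition array, fills in one non-overlapping cpumask per sched domain and hands the array over, after which partition_sched_domains() owns it and will free it on the next repartition. A hedged sketch of such a caller, with the two input masks purely illustrative:

static void example_repartition(const struct cpumask *half_a,
				const struct cpumask *half_b)
{
	cpumask_var_t *doms = alloc_sched_domains(2);

	get_online_cpus();
	if (!doms) {
		/* NULL + ndoms == 1: fall back to one big domain */
		partition_sched_domains(1, NULL, NULL);
	} else {
		cpumask_copy(doms[0], half_a);
		cpumask_copy(doms[1], half_b);
		/* takes ownership of doms; do not free it here */
		partition_sched_domains(2, doms, NULL);
	}
	put_online_cpus();
}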
7284 7338
7285 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7339 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7286 static void arch_reinit_sched_domains(void) 7340 static void arch_reinit_sched_domains(void)
7287 { 7341 {
7288 get_online_cpus(); 7342 get_online_cpus();
7289 7343
7290 /* Destroy domains first to force the rebuild */ 7344 /* Destroy domains first to force the rebuild */
7291 partition_sched_domains(0, NULL, NULL); 7345 partition_sched_domains(0, NULL, NULL);
7292 7346
7293 rebuild_sched_domains(); 7347 rebuild_sched_domains();
7294 put_online_cpus(); 7348 put_online_cpus();
7295 } 7349 }
7296 7350
7297 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 7351 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7298 { 7352 {
7299 unsigned int level = 0; 7353 unsigned int level = 0;
7300 7354
7301 if (sscanf(buf, "%u", &level) != 1) 7355 if (sscanf(buf, "%u", &level) != 1)
7302 return -EINVAL; 7356 return -EINVAL;
7303 7357
7304 /* 7358 /*
7305 * level is always positive, so don't check for 7359 * level is always positive, so don't check for
7306 * level < POWERSAVINGS_BALANCE_NONE which is 0 7360 * level < POWERSAVINGS_BALANCE_NONE which is 0
7307 * What happens on a 0 or 1 byte write? 7361 * What happens on a 0 or 1 byte write?
7308 * Do we need to check count as well? 7362 * Do we need to check count as well?
7309 */ 7363 */
7310 7364
7311 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) 7365 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
7312 return -EINVAL; 7366 return -EINVAL;
7313 7367
7314 if (smt) 7368 if (smt)
7315 sched_smt_power_savings = level; 7369 sched_smt_power_savings = level;
7316 else 7370 else
7317 sched_mc_power_savings = level; 7371 sched_mc_power_savings = level;
7318 7372
7319 arch_reinit_sched_domains(); 7373 arch_reinit_sched_domains();
7320 7374
7321 return count; 7375 return count;
7322 } 7376 }
7323 7377
7324 #ifdef CONFIG_SCHED_MC 7378 #ifdef CONFIG_SCHED_MC
7325 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 7379 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7326 struct sysdev_class_attribute *attr, 7380 struct sysdev_class_attribute *attr,
7327 char *page) 7381 char *page)
7328 { 7382 {
7329 return sprintf(page, "%u\n", sched_mc_power_savings); 7383 return sprintf(page, "%u\n", sched_mc_power_savings);
7330 } 7384 }
7331 static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 7385 static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7332 struct sysdev_class_attribute *attr, 7386 struct sysdev_class_attribute *attr,
7333 const char *buf, size_t count) 7387 const char *buf, size_t count)
7334 { 7388 {
7335 return sched_power_savings_store(buf, count, 0); 7389 return sched_power_savings_store(buf, count, 0);
7336 } 7390 }
7337 static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, 7391 static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
7338 sched_mc_power_savings_show, 7392 sched_mc_power_savings_show,
7339 sched_mc_power_savings_store); 7393 sched_mc_power_savings_store);
7340 #endif 7394 #endif
7341 7395
7342 #ifdef CONFIG_SCHED_SMT 7396 #ifdef CONFIG_SCHED_SMT
7343 static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 7397 static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7344 struct sysdev_class_attribute *attr, 7398 struct sysdev_class_attribute *attr,
7345 char *page) 7399 char *page)
7346 { 7400 {
7347 return sprintf(page, "%u\n", sched_smt_power_savings); 7401 return sprintf(page, "%u\n", sched_smt_power_savings);
7348 } 7402 }
7349 static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 7403 static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7350 struct sysdev_class_attribute *attr, 7404 struct sysdev_class_attribute *attr,
7351 const char *buf, size_t count) 7405 const char *buf, size_t count)
7352 { 7406 {
7353 return sched_power_savings_store(buf, count, 1); 7407 return sched_power_savings_store(buf, count, 1);
7354 } 7408 }
7355 static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, 7409 static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
7356 sched_smt_power_savings_show, 7410 sched_smt_power_savings_show,
7357 sched_smt_power_savings_store); 7411 sched_smt_power_savings_store);
7358 #endif 7412 #endif
7359 7413
7360 int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 7414 int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7361 { 7415 {
7362 int err = 0; 7416 int err = 0;
7363 7417
7364 #ifdef CONFIG_SCHED_SMT 7418 #ifdef CONFIG_SCHED_SMT
7365 if (smt_capable()) 7419 if (smt_capable())
7366 err = sysfs_create_file(&cls->kset.kobj, 7420 err = sysfs_create_file(&cls->kset.kobj,
7367 &attr_sched_smt_power_savings.attr); 7421 &attr_sched_smt_power_savings.attr);
7368 #endif 7422 #endif
7369 #ifdef CONFIG_SCHED_MC 7423 #ifdef CONFIG_SCHED_MC
7370 if (!err && mc_capable()) 7424 if (!err && mc_capable())
7371 err = sysfs_create_file(&cls->kset.kobj, 7425 err = sysfs_create_file(&cls->kset.kobj,
7372 &attr_sched_mc_power_savings.attr); 7426 &attr_sched_mc_power_savings.attr);
7373 #endif 7427 #endif
7374 return err; 7428 return err;
7375 } 7429 }
7376 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 7430 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7377 7431
7378 #ifndef CONFIG_CPUSETS 7432 #ifndef CONFIG_CPUSETS
7379 /* 7433 /*
7380 * Add online and remove offline CPUs from the scheduler domains. 7434 * Add online and remove offline CPUs from the scheduler domains.
7381 * When cpusets are enabled they take over this function. 7435 * When cpusets are enabled they take over this function.
7382 */ 7436 */
7383 static int update_sched_domains(struct notifier_block *nfb, 7437 static int update_sched_domains(struct notifier_block *nfb,
7384 unsigned long action, void *hcpu) 7438 unsigned long action, void *hcpu)
7385 { 7439 {
7386 switch (action) { 7440 switch (action) {
7387 case CPU_ONLINE: 7441 case CPU_ONLINE:
7388 case CPU_ONLINE_FROZEN: 7442 case CPU_ONLINE_FROZEN:
7389 case CPU_DOWN_PREPARE: 7443 case CPU_DOWN_PREPARE:
7390 case CPU_DOWN_PREPARE_FROZEN: 7444 case CPU_DOWN_PREPARE_FROZEN:
7391 case CPU_DOWN_FAILED: 7445 case CPU_DOWN_FAILED:
7392 case CPU_DOWN_FAILED_FROZEN: 7446 case CPU_DOWN_FAILED_FROZEN:
7393 partition_sched_domains(1, NULL, NULL); 7447 partition_sched_domains(1, NULL, NULL);
7394 return NOTIFY_OK; 7448 return NOTIFY_OK;
7395 7449
7396 default: 7450 default:
7397 return NOTIFY_DONE; 7451 return NOTIFY_DONE;
7398 } 7452 }
7399 } 7453 }
7400 #endif 7454 #endif
7401 7455
7402 static int update_runtime(struct notifier_block *nfb, 7456 static int update_runtime(struct notifier_block *nfb,
7403 unsigned long action, void *hcpu) 7457 unsigned long action, void *hcpu)
7404 { 7458 {
7405 int cpu = (int)(long)hcpu; 7459 int cpu = (int)(long)hcpu;
7406 7460
7407 switch (action) { 7461 switch (action) {
7408 case CPU_DOWN_PREPARE: 7462 case CPU_DOWN_PREPARE:
7409 case CPU_DOWN_PREPARE_FROZEN: 7463 case CPU_DOWN_PREPARE_FROZEN:
7410 disable_runtime(cpu_rq(cpu)); 7464 disable_runtime(cpu_rq(cpu));
7411 return NOTIFY_OK; 7465 return NOTIFY_OK;
7412 7466
7413 case CPU_DOWN_FAILED: 7467 case CPU_DOWN_FAILED:
7414 case CPU_DOWN_FAILED_FROZEN: 7468 case CPU_DOWN_FAILED_FROZEN:
7415 case CPU_ONLINE: 7469 case CPU_ONLINE:
7416 case CPU_ONLINE_FROZEN: 7470 case CPU_ONLINE_FROZEN:
7417 enable_runtime(cpu_rq(cpu)); 7471 enable_runtime(cpu_rq(cpu));
7418 return NOTIFY_OK; 7472 return NOTIFY_OK;
7419 7473
7420 default: 7474 default:
7421 return NOTIFY_DONE; 7475 return NOTIFY_DONE;
7422 } 7476 }
7423 } 7477 }
7424 7478
7425 void __init sched_init_smp(void) 7479 void __init sched_init_smp(void)
7426 { 7480 {
7427 cpumask_var_t non_isolated_cpus; 7481 cpumask_var_t non_isolated_cpus;
7428 7482
7429 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7483 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7430 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7484 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7431 7485
7432 #if defined(CONFIG_NUMA) 7486 #if defined(CONFIG_NUMA)
7433 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 7487 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7434 GFP_KERNEL); 7488 GFP_KERNEL);
7435 BUG_ON(sched_group_nodes_bycpu == NULL); 7489 BUG_ON(sched_group_nodes_bycpu == NULL);
7436 #endif 7490 #endif
7437 get_online_cpus(); 7491 get_online_cpus();
7438 mutex_lock(&sched_domains_mutex); 7492 mutex_lock(&sched_domains_mutex);
7439 arch_init_sched_domains(cpu_active_mask); 7493 arch_init_sched_domains(cpu_active_mask);
7440 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7494 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7441 if (cpumask_empty(non_isolated_cpus)) 7495 if (cpumask_empty(non_isolated_cpus))
7442 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7496 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
7443 mutex_unlock(&sched_domains_mutex); 7497 mutex_unlock(&sched_domains_mutex);
7444 put_online_cpus(); 7498 put_online_cpus();
7445 7499
7446 #ifndef CONFIG_CPUSETS 7500 #ifndef CONFIG_CPUSETS
7447 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7501 /* XXX: Theoretical race here - CPU may be hotplugged now */
7448 hotcpu_notifier(update_sched_domains, 0); 7502 hotcpu_notifier(update_sched_domains, 0);
7449 #endif 7503 #endif
7450 7504
7451 /* RT runtime code needs to handle some hotplug events */ 7505 /* RT runtime code needs to handle some hotplug events */
7452 hotcpu_notifier(update_runtime, 0); 7506 hotcpu_notifier(update_runtime, 0);
7453 7507
7454 init_hrtick(); 7508 init_hrtick();
7455 7509
7456 /* Move init over to a non-isolated CPU */ 7510 /* Move init over to a non-isolated CPU */
7457 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 7511 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
7458 BUG(); 7512 BUG();
7459 sched_init_granularity(); 7513 sched_init_granularity();
7460 free_cpumask_var(non_isolated_cpus); 7514 free_cpumask_var(non_isolated_cpus);
7461 7515
7462 init_sched_rt_class(); 7516 init_sched_rt_class();
7463 } 7517 }
7464 #else 7518 #else
7465 void __init sched_init_smp(void) 7519 void __init sched_init_smp(void)
7466 { 7520 {
7467 sched_init_granularity(); 7521 sched_init_granularity();
7468 } 7522 }
7469 #endif /* CONFIG_SMP */ 7523 #endif /* CONFIG_SMP */
7470 7524
7471 const_debug unsigned int sysctl_timer_migration = 1; 7525 const_debug unsigned int sysctl_timer_migration = 1;
7472 7526
7473 int in_sched_functions(unsigned long addr) 7527 int in_sched_functions(unsigned long addr)
7474 { 7528 {
7475 return in_lock_functions(addr) || 7529 return in_lock_functions(addr) ||
7476 (addr >= (unsigned long)__sched_text_start 7530 (addr >= (unsigned long)__sched_text_start
7477 && addr < (unsigned long)__sched_text_end); 7531 && addr < (unsigned long)__sched_text_end);
7478 } 7532 }
7479 7533
7480 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 7534 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7481 { 7535 {
7482 cfs_rq->tasks_timeline = RB_ROOT; 7536 cfs_rq->tasks_timeline = RB_ROOT;
7483 INIT_LIST_HEAD(&cfs_rq->tasks); 7537 INIT_LIST_HEAD(&cfs_rq->tasks);
7484 #ifdef CONFIG_FAIR_GROUP_SCHED 7538 #ifdef CONFIG_FAIR_GROUP_SCHED
7485 cfs_rq->rq = rq; 7539 cfs_rq->rq = rq;
7486 #endif 7540 #endif
7487 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 7541 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7488 } 7542 }
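Starting min_vruntime roughly a millisecond's worth of nanoseconds below the u64 wrap point is deliberate: any vruntime comparison that is not wrap-safe goes wrong almost immediately after boot instead of months later. That is why the fair-class code compares vruntimes through a signed difference, along the lines of entity_before() in sched_fair.c; a sketch of the idiom:

/* Wrap-safe ordering of two monotonically increasing u64 vruntimes. */
static inline int vruntime_before(u64 a, u64 b)
{
	return (s64)(a - b) < 0;	/* valid even when a or b has wrapped */
}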
7489 7543
7490 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 7544 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7491 { 7545 {
7492 struct rt_prio_array *array; 7546 struct rt_prio_array *array;
7493 int i; 7547 int i;
7494 7548
7495 array = &rt_rq->active; 7549 array = &rt_rq->active;
7496 for (i = 0; i < MAX_RT_PRIO; i++) { 7550 for (i = 0; i < MAX_RT_PRIO; i++) {
7497 INIT_LIST_HEAD(array->queue + i); 7551 INIT_LIST_HEAD(array->queue + i);
7498 __clear_bit(i, array->bitmap); 7552 __clear_bit(i, array->bitmap);
7499 } 7553 }
7500 /* delimiter for bitsearch: */ 7554 /* delimiter for bitsearch: */
7501 __set_bit(MAX_RT_PRIO, array->bitmap); 7555 __set_bit(MAX_RT_PRIO, array->bitmap);
7502 7556
7503 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 7557 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
7504 rt_rq->highest_prio.curr = MAX_RT_PRIO; 7558 rt_rq->highest_prio.curr = MAX_RT_PRIO;
7505 #ifdef CONFIG_SMP 7559 #ifdef CONFIG_SMP
7506 rt_rq->highest_prio.next = MAX_RT_PRIO; 7560 rt_rq->highest_prio.next = MAX_RT_PRIO;
7507 #endif 7561 #endif
7508 #endif 7562 #endif
7509 #ifdef CONFIG_SMP 7563 #ifdef CONFIG_SMP
7510 rt_rq->rt_nr_migratory = 0; 7564 rt_rq->rt_nr_migratory = 0;
7511 rt_rq->overloaded = 0; 7565 rt_rq->overloaded = 0;
7512 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); 7566 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
7513 #endif 7567 #endif
7514 7568
7515 rt_rq->rt_time = 0; 7569 rt_rq->rt_time = 0;
7516 rt_rq->rt_throttled = 0; 7570 rt_rq->rt_throttled = 0;
7517 rt_rq->rt_runtime = 0; 7571 rt_rq->rt_runtime = 0;
7518 raw_spin_lock_init(&rt_rq->rt_runtime_lock); 7572 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
7519 7573
7520 #ifdef CONFIG_RT_GROUP_SCHED 7574 #ifdef CONFIG_RT_GROUP_SCHED
7521 rt_rq->rt_nr_boosted = 0; 7575 rt_rq->rt_nr_boosted = 0;
7522 rt_rq->rq = rq; 7576 rt_rq->rq = rq;
7523 #endif 7577 #endif
7524 } 7578 }
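The extra bit set at MAX_RT_PRIO is a sentinel for the priority bitmap: a first-set-bit search that lands on it means no RT task is queued, so the pick path needs no separate emptiness check. A hedged sketch of that lookup (the real version lives in sched_rt.c and uses sched_find_first_bit() the same way):

static struct sched_rt_entity *peek_next_rt_entity(struct rt_rq *rt_rq)
{
	struct rt_prio_array *array = &rt_rq->active;
	int idx = sched_find_first_bit(array->bitmap);

	if (idx >= MAX_RT_PRIO)		/* only the delimiter bit is set */
		return NULL;

	return list_first_entry(array->queue + idx,
				struct sched_rt_entity, run_list);
}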
7525 7579
7526 #ifdef CONFIG_FAIR_GROUP_SCHED 7580 #ifdef CONFIG_FAIR_GROUP_SCHED
7527 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 7581 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7528 struct sched_entity *se, int cpu, int add, 7582 struct sched_entity *se, int cpu, int add,
7529 struct sched_entity *parent) 7583 struct sched_entity *parent)
7530 { 7584 {
7531 struct rq *rq = cpu_rq(cpu); 7585 struct rq *rq = cpu_rq(cpu);
7532 tg->cfs_rq[cpu] = cfs_rq; 7586 tg->cfs_rq[cpu] = cfs_rq;
7533 init_cfs_rq(cfs_rq, rq); 7587 init_cfs_rq(cfs_rq, rq);
7534 cfs_rq->tg = tg; 7588 cfs_rq->tg = tg;
7535 if (add) 7589 if (add)
7536 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 7590 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7537 7591
7538 tg->se[cpu] = se; 7592 tg->se[cpu] = se;
7539 /* se could be NULL for init_task_group */ 7593 /* se could be NULL for init_task_group */
7540 if (!se) 7594 if (!se)
7541 return; 7595 return;
7542 7596
7543 if (!parent) 7597 if (!parent)
7544 se->cfs_rq = &rq->cfs; 7598 se->cfs_rq = &rq->cfs;
7545 else 7599 else
7546 se->cfs_rq = parent->my_q; 7600 se->cfs_rq = parent->my_q;
7547 7601
7548 se->my_q = cfs_rq; 7602 se->my_q = cfs_rq;
7549 se->load.weight = tg->shares; 7603 se->load.weight = tg->shares;
7550 se->load.inv_weight = 0; 7604 se->load.inv_weight = 0;
7551 se->parent = parent; 7605 se->parent = parent;
7552 } 7606 }
7553 #endif 7607 #endif
7554 7608
7555 #ifdef CONFIG_RT_GROUP_SCHED 7609 #ifdef CONFIG_RT_GROUP_SCHED
7556 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 7610 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7557 struct sched_rt_entity *rt_se, int cpu, int add, 7611 struct sched_rt_entity *rt_se, int cpu, int add,
7558 struct sched_rt_entity *parent) 7612 struct sched_rt_entity *parent)
7559 { 7613 {
7560 struct rq *rq = cpu_rq(cpu); 7614 struct rq *rq = cpu_rq(cpu);
7561 7615
7562 tg->rt_rq[cpu] = rt_rq; 7616 tg->rt_rq[cpu] = rt_rq;
7563 init_rt_rq(rt_rq, rq); 7617 init_rt_rq(rt_rq, rq);
7564 rt_rq->tg = tg; 7618 rt_rq->tg = tg;
7565 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7619 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
7566 if (add) 7620 if (add)
7567 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7621 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7568 7622
7569 tg->rt_se[cpu] = rt_se; 7623 tg->rt_se[cpu] = rt_se;
7570 if (!rt_se) 7624 if (!rt_se)
7571 return; 7625 return;
7572 7626
7573 if (!parent) 7627 if (!parent)
7574 rt_se->rt_rq = &rq->rt; 7628 rt_se->rt_rq = &rq->rt;
7575 else 7629 else
7576 rt_se->rt_rq = parent->my_q; 7630 rt_se->rt_rq = parent->my_q;
7577 7631
7578 rt_se->my_q = rt_rq; 7632 rt_se->my_q = rt_rq;
7579 rt_se->parent = parent; 7633 rt_se->parent = parent;
7580 INIT_LIST_HEAD(&rt_se->run_list); 7634 INIT_LIST_HEAD(&rt_se->run_list);
7581 } 7635 }
7582 #endif 7636 #endif
7583 7637
7584 void __init sched_init(void) 7638 void __init sched_init(void)
7585 { 7639 {
7586 int i, j; 7640 int i, j;
7587 unsigned long alloc_size = 0, ptr; 7641 unsigned long alloc_size = 0, ptr;
7588 7642
7589 #ifdef CONFIG_FAIR_GROUP_SCHED 7643 #ifdef CONFIG_FAIR_GROUP_SCHED
7590 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7644 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7591 #endif 7645 #endif
7592 #ifdef CONFIG_RT_GROUP_SCHED 7646 #ifdef CONFIG_RT_GROUP_SCHED
7593 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7647 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7594 #endif 7648 #endif
7595 #ifdef CONFIG_CPUMASK_OFFSTACK 7649 #ifdef CONFIG_CPUMASK_OFFSTACK
7596 alloc_size += num_possible_cpus() * cpumask_size(); 7650 alloc_size += num_possible_cpus() * cpumask_size();
7597 #endif 7651 #endif
7598 if (alloc_size) { 7652 if (alloc_size) {
7599 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 7653 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7600 7654
7601 #ifdef CONFIG_FAIR_GROUP_SCHED 7655 #ifdef CONFIG_FAIR_GROUP_SCHED
7602 init_task_group.se = (struct sched_entity **)ptr; 7656 init_task_group.se = (struct sched_entity **)ptr;
7603 ptr += nr_cpu_ids * sizeof(void **); 7657 ptr += nr_cpu_ids * sizeof(void **);
7604 7658
7605 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7659 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
7606 ptr += nr_cpu_ids * sizeof(void **); 7660 ptr += nr_cpu_ids * sizeof(void **);
7607 7661
7608 #endif /* CONFIG_FAIR_GROUP_SCHED */ 7662 #endif /* CONFIG_FAIR_GROUP_SCHED */
7609 #ifdef CONFIG_RT_GROUP_SCHED 7663 #ifdef CONFIG_RT_GROUP_SCHED
7610 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7664 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
7611 ptr += nr_cpu_ids * sizeof(void **); 7665 ptr += nr_cpu_ids * sizeof(void **);
7612 7666
7613 init_task_group.rt_rq = (struct rt_rq **)ptr; 7667 init_task_group.rt_rq = (struct rt_rq **)ptr;
7614 ptr += nr_cpu_ids * sizeof(void **); 7668 ptr += nr_cpu_ids * sizeof(void **);
7615 7669
7616 #endif /* CONFIG_RT_GROUP_SCHED */ 7670 #endif /* CONFIG_RT_GROUP_SCHED */
7617 #ifdef CONFIG_CPUMASK_OFFSTACK 7671 #ifdef CONFIG_CPUMASK_OFFSTACK
7618 for_each_possible_cpu(i) { 7672 for_each_possible_cpu(i) {
7619 per_cpu(load_balance_tmpmask, i) = (void *)ptr; 7673 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
7620 ptr += cpumask_size(); 7674 ptr += cpumask_size();
7621 } 7675 }
7622 #endif /* CONFIG_CPUMASK_OFFSTACK */ 7676 #endif /* CONFIG_CPUMASK_OFFSTACK */
7623 } 7677 }
7624 7678
7625 #ifdef CONFIG_SMP 7679 #ifdef CONFIG_SMP
7626 init_defrootdomain(); 7680 init_defrootdomain();
7627 #endif 7681 #endif
7628 7682
7629 init_rt_bandwidth(&def_rt_bandwidth, 7683 init_rt_bandwidth(&def_rt_bandwidth,
7630 global_rt_period(), global_rt_runtime()); 7684 global_rt_period(), global_rt_runtime());
7631 7685
7632 #ifdef CONFIG_RT_GROUP_SCHED 7686 #ifdef CONFIG_RT_GROUP_SCHED
7633 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7687 init_rt_bandwidth(&init_task_group.rt_bandwidth,
7634 global_rt_period(), global_rt_runtime()); 7688 global_rt_period(), global_rt_runtime());
7635 #endif /* CONFIG_RT_GROUP_SCHED */ 7689 #endif /* CONFIG_RT_GROUP_SCHED */
7636 7690
7637 #ifdef CONFIG_CGROUP_SCHED 7691 #ifdef CONFIG_CGROUP_SCHED
7638 list_add(&init_task_group.list, &task_groups); 7692 list_add(&init_task_group.list, &task_groups);
7639 INIT_LIST_HEAD(&init_task_group.children); 7693 INIT_LIST_HEAD(&init_task_group.children);
7640 7694
7641 #endif /* CONFIG_CGROUP_SCHED */ 7695 #endif /* CONFIG_CGROUP_SCHED */
7642 7696
7643 #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 7697 #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
7644 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 7698 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
7645 __alignof__(unsigned long)); 7699 __alignof__(unsigned long));
7646 #endif 7700 #endif
7647 for_each_possible_cpu(i) { 7701 for_each_possible_cpu(i) {
7648 struct rq *rq; 7702 struct rq *rq;
7649 7703
7650 rq = cpu_rq(i); 7704 rq = cpu_rq(i);
7651 raw_spin_lock_init(&rq->lock); 7705 raw_spin_lock_init(&rq->lock);
7652 rq->nr_running = 0; 7706 rq->nr_running = 0;
7653 rq->calc_load_active = 0; 7707 rq->calc_load_active = 0;
7654 rq->calc_load_update = jiffies + LOAD_FREQ; 7708 rq->calc_load_update = jiffies + LOAD_FREQ;
7655 init_cfs_rq(&rq->cfs, rq); 7709 init_cfs_rq(&rq->cfs, rq);
7656 init_rt_rq(&rq->rt, rq); 7710 init_rt_rq(&rq->rt, rq);
7657 #ifdef CONFIG_FAIR_GROUP_SCHED 7711 #ifdef CONFIG_FAIR_GROUP_SCHED
7658 init_task_group.shares = init_task_group_load; 7712 init_task_group.shares = init_task_group_load;
7659 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7713 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7660 #ifdef CONFIG_CGROUP_SCHED 7714 #ifdef CONFIG_CGROUP_SCHED
7661 /* 7715 /*
7662 * How much cpu bandwidth does init_task_group get? 7716 * How much cpu bandwidth does init_task_group get?
7663 * 7717 *
7664 * In case of task-groups formed through the cgroup filesystem, it 7718 * In case of task-groups formed through the cgroup filesystem, it
7665 * gets 100% of the cpu resources in the system. This overall 7719 * gets 100% of the cpu resources in the system. This overall
7666 * system cpu resource is divided among the tasks of 7720 * system cpu resource is divided among the tasks of
7667 * init_task_group and its child task-groups in a fair manner, 7721 * init_task_group and its child task-groups in a fair manner,
7668 * based on each entity's (task or task-group's) weight 7722 * based on each entity's (task or task-group's) weight
7669 * (se->load.weight). 7723 * (se->load.weight).
7670 * 7724 *
7671 * In other words, if init_task_group has 10 tasks (each of weight 7725 * In other words, if init_task_group has 10 tasks (each of weight
7672 * 1024) and two child groups A0 and A1 (of weight 1024 each), 7726 * 1024) and two child groups A0 and A1 (of weight 1024 each),
7673 * then A0's share of the cpu resource is: 7727 * then A0's share of the cpu resource is:
7674 * 7728 *
7675 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 7729 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
7676 * 7730 *
7677 * We achieve this by letting init_task_group's tasks sit 7731 * We achieve this by letting init_task_group's tasks sit
7678 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7732 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
7679 */ 7733 */
7680 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7734 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
7681 #endif 7735 #endif
7682 #endif /* CONFIG_FAIR_GROUP_SCHED */ 7736 #endif /* CONFIG_FAIR_GROUP_SCHED */
7683 7737
7684 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 7738 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
7685 #ifdef CONFIG_RT_GROUP_SCHED 7739 #ifdef CONFIG_RT_GROUP_SCHED
7686 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7740 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7687 #ifdef CONFIG_CGROUP_SCHED 7741 #ifdef CONFIG_CGROUP_SCHED
7688 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7742 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
7689 #endif 7743 #endif
7690 #endif 7744 #endif
7691 7745
7692 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7746 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7693 rq->cpu_load[j] = 0; 7747 rq->cpu_load[j] = 0;
7694 #ifdef CONFIG_SMP 7748 #ifdef CONFIG_SMP
7695 rq->sd = NULL; 7749 rq->sd = NULL;
7696 rq->rd = NULL; 7750 rq->rd = NULL;
7697 rq->post_schedule = 0; 7751 rq->post_schedule = 0;
7698 rq->active_balance = 0; 7752 rq->active_balance = 0;
7699 rq->next_balance = jiffies; 7753 rq->next_balance = jiffies;
7700 rq->push_cpu = 0; 7754 rq->push_cpu = 0;
7701 rq->cpu = i; 7755 rq->cpu = i;
7702 rq->online = 0; 7756 rq->online = 0;
7703 rq->migration_thread = NULL; 7757 rq->migration_thread = NULL;
7704 rq->idle_stamp = 0; 7758 rq->idle_stamp = 0;
7705 rq->avg_idle = 2*sysctl_sched_migration_cost; 7759 rq->avg_idle = 2*sysctl_sched_migration_cost;
7706 INIT_LIST_HEAD(&rq->migration_queue); 7760 INIT_LIST_HEAD(&rq->migration_queue);
7707 rq_attach_root(rq, &def_root_domain); 7761 rq_attach_root(rq, &def_root_domain);
7708 #endif 7762 #endif
7709 init_rq_hrtick(rq); 7763 init_rq_hrtick(rq);
7710 atomic_set(&rq->nr_iowait, 0); 7764 atomic_set(&rq->nr_iowait, 0);
7711 } 7765 }
7712 7766
7713 set_load_weight(&init_task); 7767 set_load_weight(&init_task);
7714 7768
7715 #ifdef CONFIG_PREEMPT_NOTIFIERS 7769 #ifdef CONFIG_PREEMPT_NOTIFIERS
7716 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 7770 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
7717 #endif 7771 #endif
7718 7772
7719 #ifdef CONFIG_SMP 7773 #ifdef CONFIG_SMP
7720 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 7774 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
7721 #endif 7775 #endif
7722 7776
7723 #ifdef CONFIG_RT_MUTEXES 7777 #ifdef CONFIG_RT_MUTEXES
7724 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock); 7778 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
7725 #endif 7779 #endif
7726 7780
7727 /* 7781 /*
7728 * The boot idle thread does lazy MMU switching as well: 7782 * The boot idle thread does lazy MMU switching as well:
7729 */ 7783 */
7730 atomic_inc(&init_mm.mm_count); 7784 atomic_inc(&init_mm.mm_count);
7731 enter_lazy_tlb(&init_mm, current); 7785 enter_lazy_tlb(&init_mm, current);
7732 7786
7733 /* 7787 /*
7734 * Make us the idle thread. Technically, schedule() should not be 7788 * Make us the idle thread. Technically, schedule() should not be
7735 * called from this thread, however somewhere below it might be, 7789 * called from this thread, however somewhere below it might be,
7736 * but because we are the idle thread, we just pick up running again 7790 * but because we are the idle thread, we just pick up running again
7737 * when this runqueue becomes "idle". 7791 * when this runqueue becomes "idle".
7738 */ 7792 */
7739 init_idle(current, smp_processor_id()); 7793 init_idle(current, smp_processor_id());
7740 7794
7741 calc_load_update = jiffies + LOAD_FREQ; 7795 calc_load_update = jiffies + LOAD_FREQ;
7742 7796
7743 /* 7797 /*
7744 * During early bootup we pretend to be a normal task: 7798 * During early bootup we pretend to be a normal task:
7745 */ 7799 */
7746 current->sched_class = &fair_sched_class; 7800 current->sched_class = &fair_sched_class;
7747 7801
7748 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 7802 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
7749 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 7803 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
7750 #ifdef CONFIG_SMP 7804 #ifdef CONFIG_SMP
7751 #ifdef CONFIG_NO_HZ 7805 #ifdef CONFIG_NO_HZ
7752 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 7806 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
7753 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 7807 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
7754 #endif 7808 #endif
7755 /* May be allocated at isolcpus cmdline parse time */ 7809 /* May be allocated at isolcpus cmdline parse time */
7756 if (cpu_isolated_map == NULL) 7810 if (cpu_isolated_map == NULL)
7757 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7811 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
7758 #endif /* SMP */ 7812 #endif /* SMP */
7759 7813
7760 perf_event_init(); 7814 perf_event_init();
7761 7815
7762 scheduler_running = 1; 7816 scheduler_running = 1;
7763 } 7817 }
7764 7818
7765 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 7819 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
7766 static inline int preempt_count_equals(int preempt_offset) 7820 static inline int preempt_count_equals(int preempt_offset)
7767 { 7821 {
7768 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 7822 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
7769 7823
7770 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7824 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
7771 } 7825 }
7772 7826
7773 void __might_sleep(const char *file, int line, int preempt_offset) 7827 void __might_sleep(const char *file, int line, int preempt_offset)
7774 { 7828 {
7775 #ifdef in_atomic 7829 #ifdef in_atomic
7776 static unsigned long prev_jiffy; /* ratelimiting */ 7830 static unsigned long prev_jiffy; /* ratelimiting */
7777 7831
7778 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 7832 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
7779 system_state != SYSTEM_RUNNING || oops_in_progress) 7833 system_state != SYSTEM_RUNNING || oops_in_progress)
7780 return; 7834 return;
7781 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 7835 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7782 return; 7836 return;
7783 prev_jiffy = jiffies; 7837 prev_jiffy = jiffies;
7784 7838
7785 printk(KERN_ERR 7839 printk(KERN_ERR
7786 "BUG: sleeping function called from invalid context at %s:%d\n", 7840 "BUG: sleeping function called from invalid context at %s:%d\n",
7787 file, line); 7841 file, line);
7788 printk(KERN_ERR 7842 printk(KERN_ERR
7789 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 7843 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7790 in_atomic(), irqs_disabled(), 7844 in_atomic(), irqs_disabled(),
7791 current->pid, current->comm); 7845 current->pid, current->comm);
7792 7846
7793 debug_show_held_locks(current); 7847 debug_show_held_locks(current);
7794 if (irqs_disabled()) 7848 if (irqs_disabled())
7795 print_irqtrace_events(current); 7849 print_irqtrace_events(current);
7796 dump_stack(); 7850 dump_stack();
7797 #endif 7851 #endif
7798 } 7852 }
7799 EXPORT_SYMBOL(__might_sleep); 7853 EXPORT_SYMBOL(__might_sleep);
7800 #endif 7854 #endif
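Callers normally reach __might_sleep() through the might_sleep() macro, which (with CONFIG_DEBUG_SPINLOCK_SLEEP enabled) expands to __might_sleep(__FILE__, __LINE__, 0) plus a voluntary preemption point. Placing it at the top of any function that may block turns "sleeping while atomic" mistakes into the rate-limited report above. A minimal, hypothetical example:

/* Hypothetical helper; needs <linux/kernel.h> and <linux/slab.h>. */
static void *example_alloc_buffer(size_t size)
{
	might_sleep();		/* complains if called from atomic context */

	/* GFP_KERNEL allocations may themselves sleep */
	return kmalloc(size, GFP_KERNEL);
}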
7801 7855
7802 #ifdef CONFIG_MAGIC_SYSRQ 7856 #ifdef CONFIG_MAGIC_SYSRQ
7803 static void normalize_task(struct rq *rq, struct task_struct *p) 7857 static void normalize_task(struct rq *rq, struct task_struct *p)
7804 { 7858 {
7805 int on_rq; 7859 int on_rq;
7806 7860
7807 on_rq = p->se.on_rq; 7861 on_rq = p->se.on_rq;
7808 if (on_rq) 7862 if (on_rq)
7809 deactivate_task(rq, p, 0); 7863 deactivate_task(rq, p, 0);
7810 __setscheduler(rq, p, SCHED_NORMAL, 0); 7864 __setscheduler(rq, p, SCHED_NORMAL, 0);
7811 if (on_rq) { 7865 if (on_rq) {
7812 activate_task(rq, p, 0); 7866 activate_task(rq, p, 0);
7813 resched_task(rq->curr); 7867 resched_task(rq->curr);
7814 } 7868 }
7815 } 7869 }
7816 7870
7817 void normalize_rt_tasks(void) 7871 void normalize_rt_tasks(void)
7818 { 7872 {
7819 struct task_struct *g, *p; 7873 struct task_struct *g, *p;
7820 unsigned long flags; 7874 unsigned long flags;
7821 struct rq *rq; 7875 struct rq *rq;
7822 7876
7823 read_lock_irqsave(&tasklist_lock, flags); 7877 read_lock_irqsave(&tasklist_lock, flags);
7824 do_each_thread(g, p) { 7878 do_each_thread(g, p) {
7825 /* 7879 /*
7826 * Only normalize user tasks: 7880 * Only normalize user tasks:
7827 */ 7881 */
7828 if (!p->mm) 7882 if (!p->mm)
7829 continue; 7883 continue;
7830 7884
7831 p->se.exec_start = 0; 7885 p->se.exec_start = 0;
7832 #ifdef CONFIG_SCHEDSTATS 7886 #ifdef CONFIG_SCHEDSTATS
7833 p->se.statistics.wait_start = 0; 7887 p->se.statistics.wait_start = 0;
7834 p->se.statistics.sleep_start = 0; 7888 p->se.statistics.sleep_start = 0;
7835 p->se.statistics.block_start = 0; 7889 p->se.statistics.block_start = 0;
7836 #endif 7890 #endif
7837 7891
7838 if (!rt_task(p)) { 7892 if (!rt_task(p)) {
7839 /* 7893 /*
7840 * Renice negative nice level userspace 7894 * Renice negative nice level userspace
7841 * tasks back to 0: 7895 * tasks back to 0:
7842 */ 7896 */
7843 if (TASK_NICE(p) < 0 && p->mm) 7897 if (TASK_NICE(p) < 0 && p->mm)
7844 set_user_nice(p, 0); 7898 set_user_nice(p, 0);
7845 continue; 7899 continue;
7846 } 7900 }
7847 7901
7848 raw_spin_lock(&p->pi_lock); 7902 raw_spin_lock(&p->pi_lock);
7849 rq = __task_rq_lock(p); 7903 rq = __task_rq_lock(p);
7850 7904
7851 normalize_task(rq, p); 7905 normalize_task(rq, p);
7852 7906
7853 __task_rq_unlock(rq); 7907 __task_rq_unlock(rq);
7854 raw_spin_unlock(&p->pi_lock); 7908 raw_spin_unlock(&p->pi_lock);
7855 } while_each_thread(g, p); 7909 } while_each_thread(g, p);
7856 7910
7857 read_unlock_irqrestore(&tasklist_lock, flags); 7911 read_unlock_irqrestore(&tasklist_lock, flags);
7858 } 7912 }
7859 7913
7860 #endif /* CONFIG_MAGIC_SYSRQ */ 7914 #endif /* CONFIG_MAGIC_SYSRQ */
7861 7915
7862 #ifdef CONFIG_IA64 7916 #ifdef CONFIG_IA64
7863 /* 7917 /*
7864 * These functions are only useful for the IA64 MCA handling. 7918 * These functions are only useful for the IA64 MCA handling.
7865 * 7919 *
7866 * They can only be called when the whole system has been 7920 * They can only be called when the whole system has been
7867 * stopped - every CPU needs to be quiescent, and no scheduling 7921 * stopped - every CPU needs to be quiescent, and no scheduling
7868 * activity can take place. Using them for anything else would 7922 * activity can take place. Using them for anything else would
7869 * be a serious bug, and as a result, they aren't even visible 7923 * be a serious bug, and as a result, they aren't even visible
7870 * under any other configuration. 7924 * under any other configuration.
7871 */ 7925 */
7872 7926
7873 /** 7927 /**
7874 * curr_task - return the current task for a given cpu. 7928 * curr_task - return the current task for a given cpu.
7875 * @cpu: the processor in question. 7929 * @cpu: the processor in question.
7876 * 7930 *
7877 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7931 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7878 */ 7932 */
7879 struct task_struct *curr_task(int cpu) 7933 struct task_struct *curr_task(int cpu)
7880 { 7934 {
7881 return cpu_curr(cpu); 7935 return cpu_curr(cpu);
7882 } 7936 }
7883 7937
7884 /** 7938 /**
7885 * set_curr_task - set the current task for a given cpu. 7939 * set_curr_task - set the current task for a given cpu.
7886 * @cpu: the processor in question. 7940 * @cpu: the processor in question.
7887 * @p: the task pointer to set. 7941 * @p: the task pointer to set.
7888 * 7942 *
7889 * Description: This function must only be used when non-maskable interrupts 7943 * Description: This function must only be used when non-maskable interrupts
7890 * are serviced on a separate stack. It allows the architecture to switch the 7944 * are serviced on a separate stack. It allows the architecture to switch the
7891 * notion of the current task on a cpu in a non-blocking manner. This function 7945 * notion of the current task on a cpu in a non-blocking manner. This function
7892 * must be called with all CPUs synchronized and interrupts disabled, and the 7946 * must be called with all CPUs synchronized and interrupts disabled, and the
7893 * caller must save the original value of the current task (see 7947 * caller must save the original value of the current task (see
7894 * curr_task() above) and restore that value before reenabling interrupts and 7948 * curr_task() above) and restore that value before reenabling interrupts and
7895 * re-starting the system. 7949 * re-starting the system.
7896 * 7950 *
7897 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7951 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7898 */ 7952 */
7899 void set_curr_task(int cpu, struct task_struct *p) 7953 void set_curr_task(int cpu, struct task_struct *p)
7900 { 7954 {
7901 cpu_curr(cpu) = p; 7955 cpu_curr(cpu) = p;
7902 } 7956 }
7903 7957
7904 #endif 7958 #endif
7905 7959
7906 #ifdef CONFIG_FAIR_GROUP_SCHED 7960 #ifdef CONFIG_FAIR_GROUP_SCHED
7907 static void free_fair_sched_group(struct task_group *tg) 7961 static void free_fair_sched_group(struct task_group *tg)
7908 { 7962 {
7909 int i; 7963 int i;
7910 7964
7911 for_each_possible_cpu(i) { 7965 for_each_possible_cpu(i) {
7912 if (tg->cfs_rq) 7966 if (tg->cfs_rq)
7913 kfree(tg->cfs_rq[i]); 7967 kfree(tg->cfs_rq[i]);
7914 if (tg->se) 7968 if (tg->se)
7915 kfree(tg->se[i]); 7969 kfree(tg->se[i]);
7916 } 7970 }
7917 7971
7918 kfree(tg->cfs_rq); 7972 kfree(tg->cfs_rq);
7919 kfree(tg->se); 7973 kfree(tg->se);
7920 } 7974 }
7921 7975
7922 static 7976 static
7923 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 7977 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7924 { 7978 {
7925 struct cfs_rq *cfs_rq; 7979 struct cfs_rq *cfs_rq;
7926 struct sched_entity *se; 7980 struct sched_entity *se;
7927 struct rq *rq; 7981 struct rq *rq;
7928 int i; 7982 int i;
7929 7983
7930 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 7984 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
7931 if (!tg->cfs_rq) 7985 if (!tg->cfs_rq)
7932 goto err; 7986 goto err;
7933 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); 7987 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
7934 if (!tg->se) 7988 if (!tg->se)
7935 goto err; 7989 goto err;
7936 7990
7937 tg->shares = NICE_0_LOAD; 7991 tg->shares = NICE_0_LOAD;
7938 7992
7939 for_each_possible_cpu(i) { 7993 for_each_possible_cpu(i) {
7940 rq = cpu_rq(i); 7994 rq = cpu_rq(i);
7941 7995
7942 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 7996 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
7943 GFP_KERNEL, cpu_to_node(i)); 7997 GFP_KERNEL, cpu_to_node(i));
7944 if (!cfs_rq) 7998 if (!cfs_rq)
7945 goto err; 7999 goto err;
7946 8000
7947 se = kzalloc_node(sizeof(struct sched_entity), 8001 se = kzalloc_node(sizeof(struct sched_entity),
7948 GFP_KERNEL, cpu_to_node(i)); 8002 GFP_KERNEL, cpu_to_node(i));
7949 if (!se) 8003 if (!se)
7950 goto err_free_rq; 8004 goto err_free_rq;
7951 8005
7952 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8006 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
7953 } 8007 }
7954 8008
7955 return 1; 8009 return 1;
7956 8010
7957 err_free_rq: 8011 err_free_rq:
7958 kfree(cfs_rq); 8012 kfree(cfs_rq);
7959 err: 8013 err:
7960 return 0; 8014 return 0;
7961 } 8015 }
7962 8016
7963 static inline void register_fair_sched_group(struct task_group *tg, int cpu) 8017 static inline void register_fair_sched_group(struct task_group *tg, int cpu)
7964 { 8018 {
7965 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, 8019 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
7966 &cpu_rq(cpu)->leaf_cfs_rq_list); 8020 &cpu_rq(cpu)->leaf_cfs_rq_list);
7967 } 8021 }
7968 8022
7969 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8023 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
7970 { 8024 {
7971 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8025 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
7972 } 8026 }
7973 #else /* !CONFIG_FAIR_GROUP_SCHED */ 8027 #else /* !CONFIG_FAIR_GROUP_SCHED */
7974 static inline void free_fair_sched_group(struct task_group *tg) 8028 static inline void free_fair_sched_group(struct task_group *tg)
7975 { 8029 {
7976 } 8030 }
7977 8031
7978 static inline 8032 static inline
7979 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 8033 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
7980 { 8034 {
7981 return 1; 8035 return 1;
7982 } 8036 }
7983 8037
7984 static inline void register_fair_sched_group(struct task_group *tg, int cpu) 8038 static inline void register_fair_sched_group(struct task_group *tg, int cpu)
7985 { 8039 {
7986 } 8040 }
7987 8041
7988 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8042 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
7989 { 8043 {
7990 } 8044 }
7991 #endif /* CONFIG_FAIR_GROUP_SCHED */ 8045 #endif /* CONFIG_FAIR_GROUP_SCHED */
7992 8046
7993 #ifdef CONFIG_RT_GROUP_SCHED 8047 #ifdef CONFIG_RT_GROUP_SCHED
7994 static void free_rt_sched_group(struct task_group *tg) 8048 static void free_rt_sched_group(struct task_group *tg)
7995 { 8049 {
7996 int i; 8050 int i;
7997 8051
7998 destroy_rt_bandwidth(&tg->rt_bandwidth); 8052 destroy_rt_bandwidth(&tg->rt_bandwidth);
7999 8053
8000 for_each_possible_cpu(i) { 8054 for_each_possible_cpu(i) {
8001 if (tg->rt_rq) 8055 if (tg->rt_rq)
8002 kfree(tg->rt_rq[i]); 8056 kfree(tg->rt_rq[i]);
8003 if (tg->rt_se) 8057 if (tg->rt_se)
8004 kfree(tg->rt_se[i]); 8058 kfree(tg->rt_se[i]);
8005 } 8059 }
8006 8060
8007 kfree(tg->rt_rq); 8061 kfree(tg->rt_rq);
8008 kfree(tg->rt_se); 8062 kfree(tg->rt_se);
8009 } 8063 }
8010 8064
8011 static 8065 static
8012 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8066 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8013 { 8067 {
8014 struct rt_rq *rt_rq; 8068 struct rt_rq *rt_rq;
8015 struct sched_rt_entity *rt_se; 8069 struct sched_rt_entity *rt_se;
8016 struct rq *rq; 8070 struct rq *rq;
8017 int i; 8071 int i;
8018 8072
8019 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8073 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8020 if (!tg->rt_rq) 8074 if (!tg->rt_rq)
8021 goto err; 8075 goto err;
8022 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); 8076 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8023 if (!tg->rt_se) 8077 if (!tg->rt_se)
8024 goto err; 8078 goto err;
8025 8079
8026 init_rt_bandwidth(&tg->rt_bandwidth, 8080 init_rt_bandwidth(&tg->rt_bandwidth,
8027 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8081 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8028 8082
8029 for_each_possible_cpu(i) { 8083 for_each_possible_cpu(i) {
8030 rq = cpu_rq(i); 8084 rq = cpu_rq(i);
8031 8085
8032 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8086 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8033 GFP_KERNEL, cpu_to_node(i)); 8087 GFP_KERNEL, cpu_to_node(i));
8034 if (!rt_rq) 8088 if (!rt_rq)
8035 goto err; 8089 goto err;
8036 8090
8037 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 8091 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8038 GFP_KERNEL, cpu_to_node(i)); 8092 GFP_KERNEL, cpu_to_node(i));
8039 if (!rt_se) 8093 if (!rt_se)
8040 goto err_free_rq; 8094 goto err_free_rq;
8041 8095
8042 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8096 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
8043 } 8097 }
8044 8098
8045 return 1; 8099 return 1;
8046 8100
8047 err_free_rq: 8101 err_free_rq:
8048 kfree(rt_rq); 8102 kfree(rt_rq);
8049 err: 8103 err:
8050 return 0; 8104 return 0;
8051 } 8105 }
8052 8106
8053 static inline void register_rt_sched_group(struct task_group *tg, int cpu) 8107 static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8054 { 8108 {
8055 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, 8109 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8056 &cpu_rq(cpu)->leaf_rt_rq_list); 8110 &cpu_rq(cpu)->leaf_rt_rq_list);
8057 } 8111 }
8058 8112
8059 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 8113 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8060 { 8114 {
8061 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); 8115 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8062 } 8116 }
8063 #else /* !CONFIG_RT_GROUP_SCHED */ 8117 #else /* !CONFIG_RT_GROUP_SCHED */
8064 static inline void free_rt_sched_group(struct task_group *tg) 8118 static inline void free_rt_sched_group(struct task_group *tg)
8065 { 8119 {
8066 } 8120 }
8067 8121
8068 static inline 8122 static inline
8069 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 8123 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8070 { 8124 {
8071 return 1; 8125 return 1;
8072 } 8126 }
8073 8127
8074 static inline void register_rt_sched_group(struct task_group *tg, int cpu) 8128 static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8075 { 8129 {
8076 } 8130 }
8077 8131
8078 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 8132 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8079 { 8133 {
8080 } 8134 }
8081 #endif /* CONFIG_RT_GROUP_SCHED */ 8135 #endif /* CONFIG_RT_GROUP_SCHED */
8082 8136
8083 #ifdef CONFIG_CGROUP_SCHED 8137 #ifdef CONFIG_CGROUP_SCHED
8084 static void free_sched_group(struct task_group *tg) 8138 static void free_sched_group(struct task_group *tg)
8085 { 8139 {
8086 free_fair_sched_group(tg); 8140 free_fair_sched_group(tg);
8087 free_rt_sched_group(tg); 8141 free_rt_sched_group(tg);
8088 kfree(tg); 8142 kfree(tg);
8089 } 8143 }
8090 8144
8091 /* allocate runqueue etc for a new task group */ 8145 /* allocate runqueue etc for a new task group */
8092 struct task_group *sched_create_group(struct task_group *parent) 8146 struct task_group *sched_create_group(struct task_group *parent)
8093 { 8147 {
8094 struct task_group *tg; 8148 struct task_group *tg;
8095 unsigned long flags; 8149 unsigned long flags;
8096 int i; 8150 int i;
8097 8151
8098 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8152 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8099 if (!tg) 8153 if (!tg)
8100 return ERR_PTR(-ENOMEM); 8154 return ERR_PTR(-ENOMEM);
8101 8155
8102 if (!alloc_fair_sched_group(tg, parent)) 8156 if (!alloc_fair_sched_group(tg, parent))
8103 goto err; 8157 goto err;
8104 8158
8105 if (!alloc_rt_sched_group(tg, parent)) 8159 if (!alloc_rt_sched_group(tg, parent))
8106 goto err; 8160 goto err;
8107 8161
8108 spin_lock_irqsave(&task_group_lock, flags); 8162 spin_lock_irqsave(&task_group_lock, flags);
8109 for_each_possible_cpu(i) { 8163 for_each_possible_cpu(i) {
8110 register_fair_sched_group(tg, i); 8164 register_fair_sched_group(tg, i);
8111 register_rt_sched_group(tg, i); 8165 register_rt_sched_group(tg, i);
8112 } 8166 }
8113 list_add_rcu(&tg->list, &task_groups); 8167 list_add_rcu(&tg->list, &task_groups);
8114 8168
8115 WARN_ON(!parent); /* root should already exist */ 8169 WARN_ON(!parent); /* root should already exist */
8116 8170
8117 tg->parent = parent; 8171 tg->parent = parent;
8118 INIT_LIST_HEAD(&tg->children); 8172 INIT_LIST_HEAD(&tg->children);
8119 list_add_rcu(&tg->siblings, &parent->children); 8173 list_add_rcu(&tg->siblings, &parent->children);
8120 spin_unlock_irqrestore(&task_group_lock, flags); 8174 spin_unlock_irqrestore(&task_group_lock, flags);
8121 8175
8122 return tg; 8176 return tg;
8123 8177
8124 err: 8178 err:
8125 free_sched_group(tg); 8179 free_sched_group(tg);
8126 return ERR_PTR(-ENOMEM); 8180 return ERR_PTR(-ENOMEM);
8127 } 8181 }
8128 8182
8129 /* rcu callback to free various structures associated with a task group */ 8183 /* rcu callback to free various structures associated with a task group */
8130 static void free_sched_group_rcu(struct rcu_head *rhp) 8184 static void free_sched_group_rcu(struct rcu_head *rhp)
8131 { 8185 {
8132 /* now it should be safe to free those cfs_rqs */ 8186 /* now it should be safe to free those cfs_rqs */
8133 free_sched_group(container_of(rhp, struct task_group, rcu)); 8187 free_sched_group(container_of(rhp, struct task_group, rcu));
8134 } 8188 }
8135 8189
8136 /* Destroy runqueue etc associated with a task group */ 8190 /* Destroy runqueue etc associated with a task group */
8137 void sched_destroy_group(struct task_group *tg) 8191 void sched_destroy_group(struct task_group *tg)
8138 { 8192 {
8139 unsigned long flags; 8193 unsigned long flags;
8140 int i; 8194 int i;
8141 8195
8142 spin_lock_irqsave(&task_group_lock, flags); 8196 spin_lock_irqsave(&task_group_lock, flags);
8143 for_each_possible_cpu(i) { 8197 for_each_possible_cpu(i) {
8144 unregister_fair_sched_group(tg, i); 8198 unregister_fair_sched_group(tg, i);
8145 unregister_rt_sched_group(tg, i); 8199 unregister_rt_sched_group(tg, i);
8146 } 8200 }
8147 list_del_rcu(&tg->list); 8201 list_del_rcu(&tg->list);
8148 list_del_rcu(&tg->siblings); 8202 list_del_rcu(&tg->siblings);
8149 spin_unlock_irqrestore(&task_group_lock, flags); 8203 spin_unlock_irqrestore(&task_group_lock, flags);
8150 8204
8151 /* wait for possible concurrent references to cfs_rqs to complete */ 8205 /* wait for possible concurrent references to cfs_rqs to complete */
8152 call_rcu(&tg->rcu, free_sched_group_rcu); 8206 call_rcu(&tg->rcu, free_sched_group_rcu);
8153 } 8207 }
8154 8208
8155 /* change task's runqueue when it moves between groups. 8209 /* change task's runqueue when it moves between groups.
8156 * The caller of this function should have put the task in its new group 8210 * The caller of this function should have put the task in its new group
8157 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 8211 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
8158 * reflect its new group. 8212 * reflect its new group.
8159 */ 8213 */
8160 void sched_move_task(struct task_struct *tsk) 8214 void sched_move_task(struct task_struct *tsk)
8161 { 8215 {
8162 int on_rq, running; 8216 int on_rq, running;
8163 unsigned long flags; 8217 unsigned long flags;
8164 struct rq *rq; 8218 struct rq *rq;
8165 8219
8166 rq = task_rq_lock(tsk, &flags); 8220 rq = task_rq_lock(tsk, &flags);
8167 8221
8168 running = task_current(rq, tsk); 8222 running = task_current(rq, tsk);
8169 on_rq = tsk->se.on_rq; 8223 on_rq = tsk->se.on_rq;
8170 8224
8171 if (on_rq) 8225 if (on_rq)
8172 dequeue_task(rq, tsk, 0); 8226 dequeue_task(rq, tsk, 0);
8173 if (unlikely(running)) 8227 if (unlikely(running))
8174 tsk->sched_class->put_prev_task(rq, tsk); 8228 tsk->sched_class->put_prev_task(rq, tsk);
8175 8229
8176 set_task_rq(tsk, task_cpu(tsk)); 8230 set_task_rq(tsk, task_cpu(tsk));
8177 8231
8178 #ifdef CONFIG_FAIR_GROUP_SCHED 8232 #ifdef CONFIG_FAIR_GROUP_SCHED
8179 if (tsk->sched_class->moved_group) 8233 if (tsk->sched_class->moved_group)
8180 tsk->sched_class->moved_group(tsk, on_rq); 8234 tsk->sched_class->moved_group(tsk, on_rq);
8181 #endif 8235 #endif
8182 8236
8183 if (unlikely(running)) 8237 if (unlikely(running))
8184 tsk->sched_class->set_curr_task(rq); 8238 tsk->sched_class->set_curr_task(rq);
8185 if (on_rq) 8239 if (on_rq)
8186 enqueue_task(rq, tsk, 0); 8240 enqueue_task(rq, tsk, 0);
8187 8241
8188 task_rq_unlock(rq, &flags); 8242 task_rq_unlock(rq, &flags);
8189 } 8243 }
8190 #endif /* CONFIG_CGROUP_SCHED */ 8244 #endif /* CONFIG_CGROUP_SCHED */
8191 8245
8192 #ifdef CONFIG_FAIR_GROUP_SCHED 8246 #ifdef CONFIG_FAIR_GROUP_SCHED
8193 static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8247 static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8194 { 8248 {
8195 struct cfs_rq *cfs_rq = se->cfs_rq; 8249 struct cfs_rq *cfs_rq = se->cfs_rq;
8196 int on_rq; 8250 int on_rq;
8197 8251
8198 on_rq = se->on_rq; 8252 on_rq = se->on_rq;
8199 if (on_rq) 8253 if (on_rq)
8200 dequeue_entity(cfs_rq, se, 0); 8254 dequeue_entity(cfs_rq, se, 0);
8201 8255
8202 se->load.weight = shares; 8256 se->load.weight = shares;
8203 se->load.inv_weight = 0; 8257 se->load.inv_weight = 0;
8204 8258
8205 if (on_rq) 8259 if (on_rq)
8206 enqueue_entity(cfs_rq, se, 0); 8260 enqueue_entity(cfs_rq, se, 0);
8207 } 8261 }
8208 8262
8209 static void set_se_shares(struct sched_entity *se, unsigned long shares) 8263 static void set_se_shares(struct sched_entity *se, unsigned long shares)
8210 { 8264 {
8211 struct cfs_rq *cfs_rq = se->cfs_rq; 8265 struct cfs_rq *cfs_rq = se->cfs_rq;
8212 struct rq *rq = cfs_rq->rq; 8266 struct rq *rq = cfs_rq->rq;
8213 unsigned long flags; 8267 unsigned long flags;
8214 8268
8215 raw_spin_lock_irqsave(&rq->lock, flags); 8269 raw_spin_lock_irqsave(&rq->lock, flags);
8216 __set_se_shares(se, shares); 8270 __set_se_shares(se, shares);
8217 raw_spin_unlock_irqrestore(&rq->lock, flags); 8271 raw_spin_unlock_irqrestore(&rq->lock, flags);
8218 } 8272 }
8219 8273
8220 static DEFINE_MUTEX(shares_mutex); 8274 static DEFINE_MUTEX(shares_mutex);
8221 8275
8222 int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8276 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8223 { 8277 {
8224 int i; 8278 int i;
8225 unsigned long flags; 8279 unsigned long flags;
8226 8280
8227 /* 8281 /*
8228 * We can't change the weight of the root cgroup. 8282 * We can't change the weight of the root cgroup.
8229 */ 8283 */
8230 if (!tg->se[0]) 8284 if (!tg->se[0])
8231 return -EINVAL; 8285 return -EINVAL;
8232 8286
8233 if (shares < MIN_SHARES) 8287 if (shares < MIN_SHARES)
8234 shares = MIN_SHARES; 8288 shares = MIN_SHARES;
8235 else if (shares > MAX_SHARES) 8289 else if (shares > MAX_SHARES)
8236 shares = MAX_SHARES; 8290 shares = MAX_SHARES;
8237 8291
8238 mutex_lock(&shares_mutex); 8292 mutex_lock(&shares_mutex);
8239 if (tg->shares == shares) 8293 if (tg->shares == shares)
8240 goto done; 8294 goto done;
8241 8295
8242 spin_lock_irqsave(&task_group_lock, flags); 8296 spin_lock_irqsave(&task_group_lock, flags);
8243 for_each_possible_cpu(i) 8297 for_each_possible_cpu(i)
8244 unregister_fair_sched_group(tg, i); 8298 unregister_fair_sched_group(tg, i);
8245 list_del_rcu(&tg->siblings); 8299 list_del_rcu(&tg->siblings);
8246 spin_unlock_irqrestore(&task_group_lock, flags); 8300 spin_unlock_irqrestore(&task_group_lock, flags);
8247 8301
8248 /* wait for any ongoing reference to this group to finish */ 8302 /* wait for any ongoing reference to this group to finish */
8249 synchronize_sched(); 8303 synchronize_sched();
8250 8304
8251 /* 8305 /*
8252 * Now we are free to modify the group's share on each cpu 8306 * Now we are free to modify the group's share on each cpu
8253 * w/o tripping rebalance_share or load_balance_fair. 8307 * w/o tripping rebalance_share or load_balance_fair.
8254 */ 8308 */
8255 tg->shares = shares; 8309 tg->shares = shares;
8256 for_each_possible_cpu(i) { 8310 for_each_possible_cpu(i) {
8257 /* 8311 /*
8258 * force a rebalance 8312 * force a rebalance
8259 */ 8313 */
8260 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8314 cfs_rq_set_shares(tg->cfs_rq[i], 0);
8261 set_se_shares(tg->se[i], shares); 8315 set_se_shares(tg->se[i], shares);
8262 } 8316 }
8263 8317
8264 /* 8318 /*
8265 * Enable load balance activity on this group, by inserting it back on 8319 * Enable load balance activity on this group, by inserting it back on
8266 * each cpu's rq->leaf_cfs_rq_list. 8320 * each cpu's rq->leaf_cfs_rq_list.
8267 */ 8321 */
8268 spin_lock_irqsave(&task_group_lock, flags); 8322 spin_lock_irqsave(&task_group_lock, flags);
8269 for_each_possible_cpu(i) 8323 for_each_possible_cpu(i)
8270 register_fair_sched_group(tg, i); 8324 register_fair_sched_group(tg, i);
8271 list_add_rcu(&tg->siblings, &tg->parent->children); 8325 list_add_rcu(&tg->siblings, &tg->parent->children);
8272 spin_unlock_irqrestore(&task_group_lock, flags); 8326 spin_unlock_irqrestore(&task_group_lock, flags);
8273 done: 8327 done:
8274 mutex_unlock(&shares_mutex); 8328 mutex_unlock(&shares_mutex);
8275 return 0; 8329 return 0;
8276 } 8330 }
8277 8331
8278 unsigned long sched_group_shares(struct task_group *tg) 8332 unsigned long sched_group_shares(struct task_group *tg)
8279 { 8333 {
8280 return tg->shares; 8334 return tg->shares;
8281 } 8335 }
8282 #endif 8336 #endif
8283 8337
8284 #ifdef CONFIG_RT_GROUP_SCHED 8338 #ifdef CONFIG_RT_GROUP_SCHED
8285 /* 8339 /*
8286 * Ensure that the real time constraints are schedulable. 8340 * Ensure that the real time constraints are schedulable.
8287 */ 8341 */
8288 static DEFINE_MUTEX(rt_constraints_mutex); 8342 static DEFINE_MUTEX(rt_constraints_mutex);
8289 8343
8290 static unsigned long to_ratio(u64 period, u64 runtime) 8344 static unsigned long to_ratio(u64 period, u64 runtime)
8291 { 8345 {
8292 if (runtime == RUNTIME_INF) 8346 if (runtime == RUNTIME_INF)
8293 return 1ULL << 20; 8347 return 1ULL << 20;
8294 8348
8295 return div64_u64(runtime << 20, period); 8349 return div64_u64(runtime << 20, period);
8296 } 8350 }
8297 8351
8298 /* Must be called with tasklist_lock held */ 8352 /* Must be called with tasklist_lock held */
8299 static inline int tg_has_rt_tasks(struct task_group *tg) 8353 static inline int tg_has_rt_tasks(struct task_group *tg)
8300 { 8354 {
8301 struct task_struct *g, *p; 8355 struct task_struct *g, *p;
8302 8356
8303 do_each_thread(g, p) { 8357 do_each_thread(g, p) {
8304 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 8358 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
8305 return 1; 8359 return 1;
8306 } while_each_thread(g, p); 8360 } while_each_thread(g, p);
8307 8361
8308 return 0; 8362 return 0;
8309 } 8363 }
8310 8364
8311 struct rt_schedulable_data { 8365 struct rt_schedulable_data {
8312 struct task_group *tg; 8366 struct task_group *tg;
8313 u64 rt_period; 8367 u64 rt_period;
8314 u64 rt_runtime; 8368 u64 rt_runtime;
8315 }; 8369 };
8316 8370
8317 static int tg_schedulable(struct task_group *tg, void *data) 8371 static int tg_schedulable(struct task_group *tg, void *data)
8318 { 8372 {
8319 struct rt_schedulable_data *d = data; 8373 struct rt_schedulable_data *d = data;
8320 struct task_group *child; 8374 struct task_group *child;
8321 unsigned long total, sum = 0; 8375 unsigned long total, sum = 0;
8322 u64 period, runtime; 8376 u64 period, runtime;
8323 8377
8324 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 8378 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8325 runtime = tg->rt_bandwidth.rt_runtime; 8379 runtime = tg->rt_bandwidth.rt_runtime;
8326 8380
8327 if (tg == d->tg) { 8381 if (tg == d->tg) {
8328 period = d->rt_period; 8382 period = d->rt_period;
8329 runtime = d->rt_runtime; 8383 runtime = d->rt_runtime;
8330 } 8384 }
8331 8385
8332 /* 8386 /*
8333 * Cannot have more runtime than the period. 8387 * Cannot have more runtime than the period.
8334 */ 8388 */
8335 if (runtime > period && runtime != RUNTIME_INF) 8389 if (runtime > period && runtime != RUNTIME_INF)
8336 return -EINVAL; 8390 return -EINVAL;
8337 8391
8338 /* 8392 /*
8339 * Ensure we don't starve existing RT tasks. 8393 * Ensure we don't starve existing RT tasks.
8340 */ 8394 */
8341 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 8395 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8342 return -EBUSY; 8396 return -EBUSY;
8343 8397
8344 total = to_ratio(period, runtime); 8398 total = to_ratio(period, runtime);
8345 8399
8346 /* 8400 /*
8347 * Nobody can have more than the global setting allows. 8401 * Nobody can have more than the global setting allows.
8348 */ 8402 */
8349 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 8403 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8350 return -EINVAL; 8404 return -EINVAL;
8351 8405
8352 /* 8406 /*
8353 * The sum of our children's runtime should not exceed our own. 8407 * The sum of our children's runtime should not exceed our own.
8354 */ 8408 */
8355 list_for_each_entry_rcu(child, &tg->children, siblings) { 8409 list_for_each_entry_rcu(child, &tg->children, siblings) {
8356 period = ktime_to_ns(child->rt_bandwidth.rt_period); 8410 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8357 runtime = child->rt_bandwidth.rt_runtime; 8411 runtime = child->rt_bandwidth.rt_runtime;
8358 8412
8359 if (child == d->tg) { 8413 if (child == d->tg) {
8360 period = d->rt_period; 8414 period = d->rt_period;
8361 runtime = d->rt_runtime; 8415 runtime = d->rt_runtime;
8362 } 8416 }
8363 8417
8364 sum += to_ratio(period, runtime); 8418 sum += to_ratio(period, runtime);
8365 } 8419 }
8366 8420
8367 if (sum > total) 8421 if (sum > total)
8368 return -EINVAL; 8422 return -EINVAL;
8369 8423
8370 return 0; 8424 return 0;
8371 } 8425 }
8372 8426
8373 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8427 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8374 { 8428 {
8375 struct rt_schedulable_data data = { 8429 struct rt_schedulable_data data = {
8376 .tg = tg, 8430 .tg = tg,
8377 .rt_period = period, 8431 .rt_period = period,
8378 .rt_runtime = runtime, 8432 .rt_runtime = runtime,
8379 }; 8433 };
8380 8434
8381 return walk_tg_tree(tg_schedulable, tg_nop, &data); 8435 return walk_tg_tree(tg_schedulable, tg_nop, &data);
8382 } 8436 }
8383 8437
8384 static int tg_set_bandwidth(struct task_group *tg, 8438 static int tg_set_bandwidth(struct task_group *tg,
8385 u64 rt_period, u64 rt_runtime) 8439 u64 rt_period, u64 rt_runtime)
8386 { 8440 {
8387 int i, err = 0; 8441 int i, err = 0;
8388 8442
8389 mutex_lock(&rt_constraints_mutex); 8443 mutex_lock(&rt_constraints_mutex);
8390 read_lock(&tasklist_lock); 8444 read_lock(&tasklist_lock);
8391 err = __rt_schedulable(tg, rt_period, rt_runtime); 8445 err = __rt_schedulable(tg, rt_period, rt_runtime);
8392 if (err) 8446 if (err)
8393 goto unlock; 8447 goto unlock;
8394 8448
8395 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8449 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8396 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 8450 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
8397 tg->rt_bandwidth.rt_runtime = rt_runtime; 8451 tg->rt_bandwidth.rt_runtime = rt_runtime;
8398 8452
8399 for_each_possible_cpu(i) { 8453 for_each_possible_cpu(i) {
8400 struct rt_rq *rt_rq = tg->rt_rq[i]; 8454 struct rt_rq *rt_rq = tg->rt_rq[i];
8401 8455
8402 raw_spin_lock(&rt_rq->rt_runtime_lock); 8456 raw_spin_lock(&rt_rq->rt_runtime_lock);
8403 rt_rq->rt_runtime = rt_runtime; 8457 rt_rq->rt_runtime = rt_runtime;
8404 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8458 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8405 } 8459 }
8406 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8460 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8407 unlock: 8461 unlock:
8408 read_unlock(&tasklist_lock); 8462 read_unlock(&tasklist_lock);
8409 mutex_unlock(&rt_constraints_mutex); 8463 mutex_unlock(&rt_constraints_mutex);
8410 8464
8411 return err; 8465 return err;
8412 } 8466 }
8413 8467
8414 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 8468 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8415 { 8469 {
8416 u64 rt_runtime, rt_period; 8470 u64 rt_runtime, rt_period;
8417 8471
8418 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 8472 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8419 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 8473 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
8420 if (rt_runtime_us < 0) 8474 if (rt_runtime_us < 0)
8421 rt_runtime = RUNTIME_INF; 8475 rt_runtime = RUNTIME_INF;
8422 8476
8423 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8477 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8424 } 8478 }
8425 8479
8426 long sched_group_rt_runtime(struct task_group *tg) 8480 long sched_group_rt_runtime(struct task_group *tg)
8427 { 8481 {
8428 u64 rt_runtime_us; 8482 u64 rt_runtime_us;
8429 8483
8430 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 8484 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
8431 return -1; 8485 return -1;
8432 8486
8433 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 8487 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
8434 do_div(rt_runtime_us, NSEC_PER_USEC); 8488 do_div(rt_runtime_us, NSEC_PER_USEC);
8435 return rt_runtime_us; 8489 return rt_runtime_us;
8436 } 8490 }
8437 8491
8438 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 8492 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8439 { 8493 {
8440 u64 rt_runtime, rt_period; 8494 u64 rt_runtime, rt_period;
8441 8495
8442 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 8496 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
8443 rt_runtime = tg->rt_bandwidth.rt_runtime; 8497 rt_runtime = tg->rt_bandwidth.rt_runtime;
8444 8498
8445 if (rt_period == 0) 8499 if (rt_period == 0)
8446 return -EINVAL; 8500 return -EINVAL;
8447 8501
8448 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8502 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8449 } 8503 }
8450 8504
8451 long sched_group_rt_period(struct task_group *tg) 8505 long sched_group_rt_period(struct task_group *tg)
8452 { 8506 {
8453 u64 rt_period_us; 8507 u64 rt_period_us;
8454 8508
8455 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 8509 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
8456 do_div(rt_period_us, NSEC_PER_USEC); 8510 do_div(rt_period_us, NSEC_PER_USEC);
8457 return rt_period_us; 8511 return rt_period_us;
8458 } 8512 }
8459 8513
8460 static int sched_rt_global_constraints(void) 8514 static int sched_rt_global_constraints(void)
8461 { 8515 {
8462 u64 runtime, period; 8516 u64 runtime, period;
8463 int ret = 0; 8517 int ret = 0;
8464 8518
8465 if (sysctl_sched_rt_period <= 0) 8519 if (sysctl_sched_rt_period <= 0)
8466 return -EINVAL; 8520 return -EINVAL;
8467 8521
8468 runtime = global_rt_runtime(); 8522 runtime = global_rt_runtime();
8469 period = global_rt_period(); 8523 period = global_rt_period();
8470 8524
8471 /* 8525 /*
8472 * Sanity check on the sysctl variables. 8526 * Sanity check on the sysctl variables.
8473 */ 8527 */
8474 if (runtime > period && runtime != RUNTIME_INF) 8528 if (runtime > period && runtime != RUNTIME_INF)
8475 return -EINVAL; 8529 return -EINVAL;
8476 8530
8477 mutex_lock(&rt_constraints_mutex); 8531 mutex_lock(&rt_constraints_mutex);
8478 read_lock(&tasklist_lock); 8532 read_lock(&tasklist_lock);
8479 ret = __rt_schedulable(NULL, 0, 0); 8533 ret = __rt_schedulable(NULL, 0, 0);
8480 read_unlock(&tasklist_lock); 8534 read_unlock(&tasklist_lock);
8481 mutex_unlock(&rt_constraints_mutex); 8535 mutex_unlock(&rt_constraints_mutex);
8482 8536
8483 return ret; 8537 return ret;
8484 } 8538 }
8485 8539
8486 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 8540 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
8487 { 8541 {
8488 /* Don't accept realtime tasks when there is no way for them to run */ 8542 /* Don't accept realtime tasks when there is no way for them to run */
8489 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 8543 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
8490 return 0; 8544 return 0;
8491 8545
8492 return 1; 8546 return 1;
8493 } 8547 }
8494 8548
8495 #else /* !CONFIG_RT_GROUP_SCHED */ 8549 #else /* !CONFIG_RT_GROUP_SCHED */
8496 static int sched_rt_global_constraints(void) 8550 static int sched_rt_global_constraints(void)
8497 { 8551 {
8498 unsigned long flags; 8552 unsigned long flags;
8499 int i; 8553 int i;
8500 8554
8501 if (sysctl_sched_rt_period <= 0) 8555 if (sysctl_sched_rt_period <= 0)
8502 return -EINVAL; 8556 return -EINVAL;
8503 8557
8504 /* 8558 /*
8505 * There's always some RT tasks in the root group 8559 * There's always some RT tasks in the root group
8506 * -- migration, kstopmachine etc.. 8560 * -- migration, kstopmachine etc..
8507 */ 8561 */
8508 if (sysctl_sched_rt_runtime == 0) 8562 if (sysctl_sched_rt_runtime == 0)
8509 return -EBUSY; 8563 return -EBUSY;
8510 8564
8511 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 8565 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
8512 for_each_possible_cpu(i) { 8566 for_each_possible_cpu(i) {
8513 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 8567 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
8514 8568
8515 raw_spin_lock(&rt_rq->rt_runtime_lock); 8569 raw_spin_lock(&rt_rq->rt_runtime_lock);
8516 rt_rq->rt_runtime = global_rt_runtime(); 8570 rt_rq->rt_runtime = global_rt_runtime();
8517 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8571 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8518 } 8572 }
8519 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 8573 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
8520 8574
8521 return 0; 8575 return 0;
8522 } 8576 }
8523 #endif /* CONFIG_RT_GROUP_SCHED */ 8577 #endif /* CONFIG_RT_GROUP_SCHED */
8524 8578
8525 int sched_rt_handler(struct ctl_table *table, int write, 8579 int sched_rt_handler(struct ctl_table *table, int write,
8526 void __user *buffer, size_t *lenp, 8580 void __user *buffer, size_t *lenp,
8527 loff_t *ppos) 8581 loff_t *ppos)
8528 { 8582 {
8529 int ret; 8583 int ret;
8530 int old_period, old_runtime; 8584 int old_period, old_runtime;
8531 static DEFINE_MUTEX(mutex); 8585 static DEFINE_MUTEX(mutex);
8532 8586
8533 mutex_lock(&mutex); 8587 mutex_lock(&mutex);
8534 old_period = sysctl_sched_rt_period; 8588 old_period = sysctl_sched_rt_period;
8535 old_runtime = sysctl_sched_rt_runtime; 8589 old_runtime = sysctl_sched_rt_runtime;
8536 8590
8537 ret = proc_dointvec(table, write, buffer, lenp, ppos); 8591 ret = proc_dointvec(table, write, buffer, lenp, ppos);
8538 8592
8539 if (!ret && write) { 8593 if (!ret && write) {
8540 ret = sched_rt_global_constraints(); 8594 ret = sched_rt_global_constraints();
8541 if (ret) { 8595 if (ret) {
8542 sysctl_sched_rt_period = old_period; 8596 sysctl_sched_rt_period = old_period;
8543 sysctl_sched_rt_runtime = old_runtime; 8597 sysctl_sched_rt_runtime = old_runtime;
8544 } else { 8598 } else {
8545 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 8599 def_rt_bandwidth.rt_runtime = global_rt_runtime();
8546 def_rt_bandwidth.rt_period = 8600 def_rt_bandwidth.rt_period =
8547 ns_to_ktime(global_rt_period()); 8601 ns_to_ktime(global_rt_period());
8548 } 8602 }
8549 } 8603 }
8550 mutex_unlock(&mutex); 8604 mutex_unlock(&mutex);
8551 8605
8552 return ret; 8606 return ret;
8553 } 8607 }
8554 8608
8555 #ifdef CONFIG_CGROUP_SCHED 8609 #ifdef CONFIG_CGROUP_SCHED
8556 8610
8557 /* return corresponding task_group object of a cgroup */ 8611 /* return corresponding task_group object of a cgroup */
8558 static inline struct task_group *cgroup_tg(struct cgroup *cgrp) 8612 static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
8559 { 8613 {
8560 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 8614 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
8561 struct task_group, css); 8615 struct task_group, css);
8562 } 8616 }
8563 8617
8564 static struct cgroup_subsys_state * 8618 static struct cgroup_subsys_state *
8565 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) 8619 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8566 { 8620 {
8567 struct task_group *tg, *parent; 8621 struct task_group *tg, *parent;
8568 8622
8569 if (!cgrp->parent) { 8623 if (!cgrp->parent) {
8570 /* This is early initialization for the top cgroup */ 8624 /* This is early initialization for the top cgroup */
8571 return &init_task_group.css; 8625 return &init_task_group.css;
8572 } 8626 }
8573 8627
8574 parent = cgroup_tg(cgrp->parent); 8628 parent = cgroup_tg(cgrp->parent);
8575 tg = sched_create_group(parent); 8629 tg = sched_create_group(parent);
8576 if (IS_ERR(tg)) 8630 if (IS_ERR(tg))
8577 return ERR_PTR(-ENOMEM); 8631 return ERR_PTR(-ENOMEM);
8578 8632
8579 return &tg->css; 8633 return &tg->css;
8580 } 8634 }
8581 8635
8582 static void 8636 static void
8583 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 8637 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
8584 { 8638 {
8585 struct task_group *tg = cgroup_tg(cgrp); 8639 struct task_group *tg = cgroup_tg(cgrp);
8586 8640
8587 sched_destroy_group(tg); 8641 sched_destroy_group(tg);
8588 } 8642 }
8589 8643
8590 static int 8644 static int
8591 cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 8645 cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8592 { 8646 {
8593 #ifdef CONFIG_RT_GROUP_SCHED 8647 #ifdef CONFIG_RT_GROUP_SCHED
8594 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 8648 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
8595 return -EINVAL; 8649 return -EINVAL;
8596 #else 8650 #else
8597 /* We don't support RT-tasks being in separate groups */ 8651 /* We don't support RT-tasks being in separate groups */
8598 if (tsk->sched_class != &fair_sched_class) 8652 if (tsk->sched_class != &fair_sched_class)
8599 return -EINVAL; 8653 return -EINVAL;
8600 #endif 8654 #endif
8601 return 0; 8655 return 0;
8602 } 8656 }
8603 8657
8604 static int 8658 static int
8605 cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 8659 cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8606 struct task_struct *tsk, bool threadgroup) 8660 struct task_struct *tsk, bool threadgroup)
8607 { 8661 {
8608 int retval = cpu_cgroup_can_attach_task(cgrp, tsk); 8662 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
8609 if (retval) 8663 if (retval)
8610 return retval; 8664 return retval;
8611 if (threadgroup) { 8665 if (threadgroup) {
8612 struct task_struct *c; 8666 struct task_struct *c;
8613 rcu_read_lock(); 8667 rcu_read_lock();
8614 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 8668 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
8615 retval = cpu_cgroup_can_attach_task(cgrp, c); 8669 retval = cpu_cgroup_can_attach_task(cgrp, c);
8616 if (retval) { 8670 if (retval) {
8617 rcu_read_unlock(); 8671 rcu_read_unlock();
8618 return retval; 8672 return retval;
8619 } 8673 }
8620 } 8674 }
8621 rcu_read_unlock(); 8675 rcu_read_unlock();
8622 } 8676 }
8623 return 0; 8677 return 0;
8624 } 8678 }
8625 8679
8626 static void 8680 static void
8627 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 8681 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8628 struct cgroup *old_cont, struct task_struct *tsk, 8682 struct cgroup *old_cont, struct task_struct *tsk,
8629 bool threadgroup) 8683 bool threadgroup)
8630 { 8684 {
8631 sched_move_task(tsk); 8685 sched_move_task(tsk);
8632 if (threadgroup) { 8686 if (threadgroup) {
8633 struct task_struct *c; 8687 struct task_struct *c;
8634 rcu_read_lock(); 8688 rcu_read_lock();
8635 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 8689 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
8636 sched_move_task(c); 8690 sched_move_task(c);
8637 } 8691 }
8638 rcu_read_unlock(); 8692 rcu_read_unlock();
8639 } 8693 }
8640 } 8694 }
8641 8695
8642 #ifdef CONFIG_FAIR_GROUP_SCHED 8696 #ifdef CONFIG_FAIR_GROUP_SCHED
8643 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 8697 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
8644 u64 shareval) 8698 u64 shareval)
8645 { 8699 {
8646 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 8700 return sched_group_set_shares(cgroup_tg(cgrp), shareval);
8647 } 8701 }
8648 8702
8649 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 8703 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8650 { 8704 {
8651 struct task_group *tg = cgroup_tg(cgrp); 8705 struct task_group *tg = cgroup_tg(cgrp);
8652 8706
8653 return (u64) tg->shares; 8707 return (u64) tg->shares;
8654 } 8708 }
8655 #endif /* CONFIG_FAIR_GROUP_SCHED */ 8709 #endif /* CONFIG_FAIR_GROUP_SCHED */
8656 8710
8657 #ifdef CONFIG_RT_GROUP_SCHED 8711 #ifdef CONFIG_RT_GROUP_SCHED
8658 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 8712 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
8659 s64 val) 8713 s64 val)
8660 { 8714 {
8661 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 8715 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
8662 } 8716 }
8663 8717
8664 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 8718 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
8665 { 8719 {
8666 return sched_group_rt_runtime(cgroup_tg(cgrp)); 8720 return sched_group_rt_runtime(cgroup_tg(cgrp));
8667 } 8721 }
8668 8722
8669 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 8723 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
8670 u64 rt_period_us) 8724 u64 rt_period_us)
8671 { 8725 {
8672 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 8726 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
8673 } 8727 }
8674 8728
8675 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 8729 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
8676 { 8730 {
8677 return sched_group_rt_period(cgroup_tg(cgrp)); 8731 return sched_group_rt_period(cgroup_tg(cgrp));
8678 } 8732 }
8679 #endif /* CONFIG_RT_GROUP_SCHED */ 8733 #endif /* CONFIG_RT_GROUP_SCHED */
8680 8734
8681 static struct cftype cpu_files[] = { 8735 static struct cftype cpu_files[] = {
8682 #ifdef CONFIG_FAIR_GROUP_SCHED 8736 #ifdef CONFIG_FAIR_GROUP_SCHED
8683 { 8737 {
8684 .name = "shares", 8738 .name = "shares",
8685 .read_u64 = cpu_shares_read_u64, 8739 .read_u64 = cpu_shares_read_u64,
8686 .write_u64 = cpu_shares_write_u64, 8740 .write_u64 = cpu_shares_write_u64,
8687 }, 8741 },
8688 #endif 8742 #endif
8689 #ifdef CONFIG_RT_GROUP_SCHED 8743 #ifdef CONFIG_RT_GROUP_SCHED
8690 { 8744 {
8691 .name = "rt_runtime_us", 8745 .name = "rt_runtime_us",
8692 .read_s64 = cpu_rt_runtime_read, 8746 .read_s64 = cpu_rt_runtime_read,
8693 .write_s64 = cpu_rt_runtime_write, 8747 .write_s64 = cpu_rt_runtime_write,
8694 }, 8748 },
8695 { 8749 {
8696 .name = "rt_period_us", 8750 .name = "rt_period_us",
8697 .read_u64 = cpu_rt_period_read_uint, 8751 .read_u64 = cpu_rt_period_read_uint,
8698 .write_u64 = cpu_rt_period_write_uint, 8752 .write_u64 = cpu_rt_period_write_uint,
8699 }, 8753 },
8700 #endif 8754 #endif
8701 }; 8755 };
8702 8756
8703 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 8757 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
8704 { 8758 {
8705 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); 8759 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
8706 } 8760 }
8707 8761
8708 struct cgroup_subsys cpu_cgroup_subsys = { 8762 struct cgroup_subsys cpu_cgroup_subsys = {
8709 .name = "cpu", 8763 .name = "cpu",
8710 .create = cpu_cgroup_create, 8764 .create = cpu_cgroup_create,
8711 .destroy = cpu_cgroup_destroy, 8765 .destroy = cpu_cgroup_destroy,
8712 .can_attach = cpu_cgroup_can_attach, 8766 .can_attach = cpu_cgroup_can_attach,
8713 .attach = cpu_cgroup_attach, 8767 .attach = cpu_cgroup_attach,
8714 .populate = cpu_cgroup_populate, 8768 .populate = cpu_cgroup_populate,
8715 .subsys_id = cpu_cgroup_subsys_id, 8769 .subsys_id = cpu_cgroup_subsys_id,
8716 .early_init = 1, 8770 .early_init = 1,
8717 }; 8771 };
8718 8772
8719 #endif /* CONFIG_CGROUP_SCHED */ 8773 #endif /* CONFIG_CGROUP_SCHED */
8720 8774
8721 #ifdef CONFIG_CGROUP_CPUACCT 8775 #ifdef CONFIG_CGROUP_CPUACCT
8722 8776
8723 /* 8777 /*
8724 * CPU accounting code for task groups. 8778 * CPU accounting code for task groups.
8725 * 8779 *
8726 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh 8780 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
8727 * (balbir@in.ibm.com). 8781 * (balbir@in.ibm.com).
8728 */ 8782 */
8729 8783
8730 /* track cpu usage of a group of tasks and its child groups */ 8784 /* track cpu usage of a group of tasks and its child groups */
8731 struct cpuacct { 8785 struct cpuacct {
8732 struct cgroup_subsys_state css; 8786 struct cgroup_subsys_state css;
8733 /* cpuusage holds pointer to a u64-type object on every cpu */ 8787 /* cpuusage holds pointer to a u64-type object on every cpu */
8734 u64 __percpu *cpuusage; 8788 u64 __percpu *cpuusage;
8735 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 8789 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
8736 struct cpuacct *parent; 8790 struct cpuacct *parent;
8737 }; 8791 };
8738 8792
8739 struct cgroup_subsys cpuacct_subsys; 8793 struct cgroup_subsys cpuacct_subsys;
8740 8794
8741 /* return cpu accounting group corresponding to this container */ 8795 /* return cpu accounting group corresponding to this container */
8742 static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) 8796 static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
8743 { 8797 {
8744 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), 8798 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
8745 struct cpuacct, css); 8799 struct cpuacct, css);
8746 } 8800 }
8747 8801
8748 /* return cpu accounting group to which this task belongs */ 8802 /* return cpu accounting group to which this task belongs */
8749 static inline struct cpuacct *task_ca(struct task_struct *tsk) 8803 static inline struct cpuacct *task_ca(struct task_struct *tsk)
8750 { 8804 {
8751 return container_of(task_subsys_state(tsk, cpuacct_subsys_id), 8805 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
8752 struct cpuacct, css); 8806 struct cpuacct, css);
8753 } 8807 }
8754 8808
8755 /* create a new cpu accounting group */ 8809 /* create a new cpu accounting group */
8756 static struct cgroup_subsys_state *cpuacct_create( 8810 static struct cgroup_subsys_state *cpuacct_create(
8757 struct cgroup_subsys *ss, struct cgroup *cgrp) 8811 struct cgroup_subsys *ss, struct cgroup *cgrp)
8758 { 8812 {
8759 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 8813 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
8760 int i; 8814 int i;
8761 8815
8762 if (!ca) 8816 if (!ca)
8763 goto out; 8817 goto out;
8764 8818
8765 ca->cpuusage = alloc_percpu(u64); 8819 ca->cpuusage = alloc_percpu(u64);
8766 if (!ca->cpuusage) 8820 if (!ca->cpuusage)
8767 goto out_free_ca; 8821 goto out_free_ca;
8768 8822
8769 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 8823 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
8770 if (percpu_counter_init(&ca->cpustat[i], 0)) 8824 if (percpu_counter_init(&ca->cpustat[i], 0))
8771 goto out_free_counters; 8825 goto out_free_counters;
8772 8826
8773 if (cgrp->parent) 8827 if (cgrp->parent)
8774 ca->parent = cgroup_ca(cgrp->parent); 8828 ca->parent = cgroup_ca(cgrp->parent);
8775 8829
8776 return &ca->css; 8830 return &ca->css;
8777 8831
8778 out_free_counters: 8832 out_free_counters:
8779 while (--i >= 0) 8833 while (--i >= 0)
8780 percpu_counter_destroy(&ca->cpustat[i]); 8834 percpu_counter_destroy(&ca->cpustat[i]);
8781 free_percpu(ca->cpuusage); 8835 free_percpu(ca->cpuusage);
8782 out_free_ca: 8836 out_free_ca:
8783 kfree(ca); 8837 kfree(ca);
8784 out: 8838 out:
8785 return ERR_PTR(-ENOMEM); 8839 return ERR_PTR(-ENOMEM);
8786 } 8840 }
8787 8841
8788 /* destroy an existing cpu accounting group */ 8842 /* destroy an existing cpu accounting group */
8789 static void 8843 static void
8790 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 8844 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
8791 { 8845 {
8792 struct cpuacct *ca = cgroup_ca(cgrp); 8846 struct cpuacct *ca = cgroup_ca(cgrp);
8793 int i; 8847 int i;
8794 8848
8795 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 8849 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
8796 percpu_counter_destroy(&ca->cpustat[i]); 8850 percpu_counter_destroy(&ca->cpustat[i]);
8797 free_percpu(ca->cpuusage); 8851 free_percpu(ca->cpuusage);
8798 kfree(ca); 8852 kfree(ca);
8799 } 8853 }
8800 8854
8801 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) 8855 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
8802 { 8856 {
8803 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8857 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8804 u64 data; 8858 u64 data;
8805 8859
8806 #ifndef CONFIG_64BIT 8860 #ifndef CONFIG_64BIT
8807 /* 8861 /*
8808 * Take rq->lock to make 64-bit read safe on 32-bit platforms. 8862 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
8809 */ 8863 */
8810 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 8864 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8811 data = *cpuusage; 8865 data = *cpuusage;
8812 raw_spin_unlock_irq(&cpu_rq(cpu)->lock); 8866 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8813 #else 8867 #else
8814 data = *cpuusage; 8868 data = *cpuusage;
8815 #endif 8869 #endif
8816 8870
8817 return data; 8871 return data;
8818 } 8872 }
8819 8873
8820 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) 8874 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
8821 { 8875 {
8822 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8876 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8823 8877
8824 #ifndef CONFIG_64BIT 8878 #ifndef CONFIG_64BIT
8825 /* 8879 /*
8826 * Take rq->lock to make 64-bit write safe on 32-bit platforms. 8880 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
8827 */ 8881 */
8828 raw_spin_lock_irq(&cpu_rq(cpu)->lock); 8882 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
8829 *cpuusage = val; 8883 *cpuusage = val;
8830 raw_spin_unlock_irq(&cpu_rq(cpu)->lock); 8884 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
8831 #else 8885 #else
8832 *cpuusage = val; 8886 *cpuusage = val;
8833 #endif 8887 #endif
8834 } 8888 }
8835 8889
8836 /* return total cpu usage (in nanoseconds) of a group */ 8890 /* return total cpu usage (in nanoseconds) of a group */
8837 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 8891 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
8838 { 8892 {
8839 struct cpuacct *ca = cgroup_ca(cgrp); 8893 struct cpuacct *ca = cgroup_ca(cgrp);
8840 u64 totalcpuusage = 0; 8894 u64 totalcpuusage = 0;
8841 int i; 8895 int i;
8842 8896
8843 for_each_present_cpu(i) 8897 for_each_present_cpu(i)
8844 totalcpuusage += cpuacct_cpuusage_read(ca, i); 8898 totalcpuusage += cpuacct_cpuusage_read(ca, i);
8845 8899
8846 return totalcpuusage; 8900 return totalcpuusage;
8847 } 8901 }
8848 8902
8849 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, 8903 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
8850 u64 reset) 8904 u64 reset)
8851 { 8905 {
8852 struct cpuacct *ca = cgroup_ca(cgrp); 8906 struct cpuacct *ca = cgroup_ca(cgrp);
8853 int err = 0; 8907 int err = 0;
8854 int i; 8908 int i;
8855 8909
8856 if (reset) { 8910 if (reset) {
8857 err = -EINVAL; 8911 err = -EINVAL;
8858 goto out; 8912 goto out;
8859 } 8913 }
8860 8914
8861 for_each_present_cpu(i) 8915 for_each_present_cpu(i)
8862 cpuacct_cpuusage_write(ca, i, 0); 8916 cpuacct_cpuusage_write(ca, i, 0);
8863 8917
8864 out: 8918 out:
8865 return err; 8919 return err;
8866 } 8920 }
8867 8921
8868 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, 8922 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
8869 struct seq_file *m) 8923 struct seq_file *m)
8870 { 8924 {
8871 struct cpuacct *ca = cgroup_ca(cgroup); 8925 struct cpuacct *ca = cgroup_ca(cgroup);
8872 u64 percpu; 8926 u64 percpu;
8873 int i; 8927 int i;
8874 8928
8875 for_each_present_cpu(i) { 8929 for_each_present_cpu(i) {
8876 percpu = cpuacct_cpuusage_read(ca, i); 8930 percpu = cpuacct_cpuusage_read(ca, i);
8877 seq_printf(m, "%llu ", (unsigned long long) percpu); 8931 seq_printf(m, "%llu ", (unsigned long long) percpu);
8878 } 8932 }
8879 seq_printf(m, "\n"); 8933 seq_printf(m, "\n");
8880 return 0; 8934 return 0;
8881 } 8935 }
8882 8936
8883 static const char *cpuacct_stat_desc[] = { 8937 static const char *cpuacct_stat_desc[] = {
8884 [CPUACCT_STAT_USER] = "user", 8938 [CPUACCT_STAT_USER] = "user",
8885 [CPUACCT_STAT_SYSTEM] = "system", 8939 [CPUACCT_STAT_SYSTEM] = "system",
8886 }; 8940 };
8887 8941
8888 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 8942 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
8889 struct cgroup_map_cb *cb) 8943 struct cgroup_map_cb *cb)
8890 { 8944 {
8891 struct cpuacct *ca = cgroup_ca(cgrp); 8945 struct cpuacct *ca = cgroup_ca(cgrp);
8892 int i; 8946 int i;
8893 8947
8894 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { 8948 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
8895 s64 val = percpu_counter_read(&ca->cpustat[i]); 8949 s64 val = percpu_counter_read(&ca->cpustat[i]);
8896 val = cputime64_to_clock_t(val); 8950 val = cputime64_to_clock_t(val);
8897 cb->fill(cb, cpuacct_stat_desc[i], val); 8951 cb->fill(cb, cpuacct_stat_desc[i], val);
8898 } 8952 }
8899 return 0; 8953 return 0;
8900 } 8954 }
8901 8955
8902 static struct cftype files[] = { 8956 static struct cftype files[] = {
8903 { 8957 {
8904 .name = "usage", 8958 .name = "usage",
8905 .read_u64 = cpuusage_read, 8959 .read_u64 = cpuusage_read,
8906 .write_u64 = cpuusage_write, 8960 .write_u64 = cpuusage_write,
8907 }, 8961 },
8908 { 8962 {
8909 .name = "usage_percpu", 8963 .name = "usage_percpu",
8910 .read_seq_string = cpuacct_percpu_seq_read, 8964 .read_seq_string = cpuacct_percpu_seq_read,
8911 }, 8965 },
8912 { 8966 {
8913 .name = "stat", 8967 .name = "stat",
8914 .read_map = cpuacct_stats_show, 8968 .read_map = cpuacct_stats_show,
8915 }, 8969 },
8916 }; 8970 };
8917 8971
8918 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 8972 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
8919 { 8973 {
8920 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); 8974 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
8921 } 8975 }
8922 8976
8923 /* 8977 /*
8924 * charge this task's execution time to its accounting group. 8978 * charge this task's execution time to its accounting group.
8925 * 8979 *
8926 * called with rq->lock held. 8980 * called with rq->lock held.
8927 */ 8981 */
8928 static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 8982 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8929 { 8983 {
8930 struct cpuacct *ca; 8984 struct cpuacct *ca;
8931 int cpu; 8985 int cpu;
8932 8986
8933 if (unlikely(!cpuacct_subsys.active)) 8987 if (unlikely(!cpuacct_subsys.active))
8934 return; 8988 return;
8935 8989
8936 cpu = task_cpu(tsk); 8990 cpu = task_cpu(tsk);
8937 8991
8938 rcu_read_lock(); 8992 rcu_read_lock();
8939 8993
8940 ca = task_ca(tsk); 8994 ca = task_ca(tsk);
8941 8995
8942 for (; ca; ca = ca->parent) { 8996 for (; ca; ca = ca->parent) {
8943 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8997 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
8944 *cpuusage += cputime; 8998 *cpuusage += cputime;
8945 } 8999 }
8946 9000
8947 rcu_read_unlock(); 9001 rcu_read_unlock();
8948 } 9002 }
8949 9003
8950 /* 9004 /*
8951 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large 9005 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
8952 * in cputime_t units. As a result, cpuacct_update_stats calls 9006 * in cputime_t units. As a result, cpuacct_update_stats calls
8953 * percpu_counter_add with values large enough to always overflow the 9007 * percpu_counter_add with values large enough to always overflow the
8954 * per cpu batch limit causing bad SMP scalability. 9008 * per cpu batch limit causing bad SMP scalability.
8955 * 9009 *
8956 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we 9010 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
8957 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled 9011 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
8958 * and enabled. We cap it at INT_MAX which is the largest allowed batch value. 9012 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
8959 */ 9013 */
8960 #ifdef CONFIG_SMP 9014 #ifdef CONFIG_SMP
8961 #define CPUACCT_BATCH \ 9015 #define CPUACCT_BATCH \
8962 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) 9016 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
8963 #else 9017 #else
8964 #define CPUACCT_BATCH 0 9018 #define CPUACCT_BATCH 0
8965 #endif 9019 #endif
8966 9020
8967 /* 9021 /*
8968 * Charge the system/user time to the task's accounting group. 9022 * Charge the system/user time to the task's accounting group.
8969 */ 9023 */
8970 static void cpuacct_update_stats(struct task_struct *tsk, 9024 static void cpuacct_update_stats(struct task_struct *tsk,
8971 enum cpuacct_stat_index idx, cputime_t val) 9025 enum cpuacct_stat_index idx, cputime_t val)
8972 { 9026 {
8973 struct cpuacct *ca; 9027 struct cpuacct *ca;
8974 int batch = CPUACCT_BATCH; 9028 int batch = CPUACCT_BATCH;
8975 9029
8976 if (unlikely(!cpuacct_subsys.active)) 9030 if (unlikely(!cpuacct_subsys.active))
8977 return; 9031 return;
8978 9032
8979 rcu_read_lock(); 9033 rcu_read_lock();
8980 ca = task_ca(tsk); 9034 ca = task_ca(tsk);
8981 9035
8982 do { 9036 do {
8983 __percpu_counter_add(&ca->cpustat[idx], val, batch); 9037 __percpu_counter_add(&ca->cpustat[idx], val, batch);
8984 ca = ca->parent; 9038 ca = ca->parent;
8985 } while (ca); 9039 } while (ca);
8986 rcu_read_unlock(); 9040 rcu_read_unlock();
8987 } 9041 }
8988 9042
8989 struct cgroup_subsys cpuacct_subsys = { 9043 struct cgroup_subsys cpuacct_subsys = {
8990 .name = "cpuacct", 9044 .name = "cpuacct",
8991 .create = cpuacct_create, 9045 .create = cpuacct_create,
8992 .destroy = cpuacct_destroy, 9046 .destroy = cpuacct_destroy,
8993 .populate = cpuacct_populate, 9047 .populate = cpuacct_populate,
8994 .subsys_id = cpuacct_subsys_id, 9048 .subsys_id = cpuacct_subsys_id,
8995 }; 9049 };
8996 #endif /* CONFIG_CGROUP_CPUACCT */ 9050 #endif /* CONFIG_CGROUP_CPUACCT */
8997 9051
8998 #ifndef CONFIG_SMP 9052 #ifndef CONFIG_SMP
8999 9053
9000 int rcu_expedited_torture_stats(char *page) 9054 int rcu_expedited_torture_stats(char *page)
9001 { 9055 {
9002 return 0; 9056 return 0;
9003 } 9057 }
9004 EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); 9058 EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9005 9059
9006 void synchronize_sched_expedited(void) 9060 void synchronize_sched_expedited(void)
9007 { 9061 {
9008 } 9062 }
9009 EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 9063 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9010 9064
9011 #else /* #ifndef CONFIG_SMP */ 9065 #else /* #ifndef CONFIG_SMP */
9012 9066
9013 static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); 9067 static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
9014 static DEFINE_MUTEX(rcu_sched_expedited_mutex); 9068 static DEFINE_MUTEX(rcu_sched_expedited_mutex);
9015 9069
9016 #define RCU_EXPEDITED_STATE_POST -2 9070 #define RCU_EXPEDITED_STATE_POST -2
9017 #define RCU_EXPEDITED_STATE_IDLE -1 9071 #define RCU_EXPEDITED_STATE_IDLE -1
9018 9072
9019 static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 9073 static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9020 9074
9021 int rcu_expedited_torture_stats(char *page) 9075 int rcu_expedited_torture_stats(char *page)
9022 { 9076 {
9023 int cnt = 0; 9077 int cnt = 0;
9024 int cpu; 9078 int cpu;
9025 9079
9026 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); 9080 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
9027 for_each_online_cpu(cpu) { 9081 for_each_online_cpu(cpu) {
9028 cnt += sprintf(&page[cnt], " %d:%d", 9082 cnt += sprintf(&page[cnt], " %d:%d",
9029 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); 9083 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
9030 } 9084 }
9031 cnt += sprintf(&page[cnt], "\n"); 9085 cnt += sprintf(&page[cnt], "\n");
9032 return cnt; 9086 return cnt;
9033 } 9087 }
9034 EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); 9088 EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9035 9089
9036 static long synchronize_sched_expedited_count; 9090 static long synchronize_sched_expedited_count;
9037 9091
9038 /* 9092 /*
9039 * Wait for an rcu-sched grace period to elapse, but use "big hammer" 9093 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
9040 * approach to force grace period to end quickly. This consumes 9094 * approach to force grace period to end quickly. This consumes
9041 * significant time on all CPUs, and is thus not recommended for 9095 * significant time on all CPUs, and is thus not recommended for
9042 * any sort of common-case code. 9096 * any sort of common-case code.
9043 * 9097 *
9044 * Note that it is illegal to call this function while holding any 9098 * Note that it is illegal to call this function while holding any
9045 * lock that is acquired by a CPU-hotplug notifier. Failing to 9099 * lock that is acquired by a CPU-hotplug notifier. Failing to
9046 * observe this restriction will result in deadlock. 9100 * observe this restriction will result in deadlock.
9047 */ 9101 */
9048 void synchronize_sched_expedited(void) 9102 void synchronize_sched_expedited(void)
9049 { 9103 {
9050 int cpu; 9104 int cpu;
9051 unsigned long flags; 9105 unsigned long flags;
9052 bool need_full_sync = 0; 9106 bool need_full_sync = 0;
9053 struct rq *rq; 9107 struct rq *rq;
9054 struct migration_req *req; 9108 struct migration_req *req;
9055 long snap; 9109 long snap;
9056 int trycount = 0; 9110 int trycount = 0;
9057 9111
9058 smp_mb(); /* ensure prior mod happens before capturing snap. */ 9112 smp_mb(); /* ensure prior mod happens before capturing snap. */
9059 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; 9113 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
9060 get_online_cpus(); 9114 get_online_cpus();
9061 while (!mutex_trylock(&rcu_sched_expedited_mutex)) { 9115 while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
9062 put_online_cpus(); 9116 put_online_cpus();
9063 if (trycount++ < 10) 9117 if (trycount++ < 10)
9064 udelay(trycount * num_online_cpus()); 9118 udelay(trycount * num_online_cpus());
9065 else { 9119 else {
9066 synchronize_sched(); 9120 synchronize_sched();
9067 return; 9121 return;
9068 } 9122 }
9069 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { 9123 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
9070 smp_mb(); /* ensure test happens before caller kfree */ 9124 smp_mb(); /* ensure test happens before caller kfree */
9071 return; 9125 return;
9072 } 9126 }
9073 get_online_cpus(); 9127 get_online_cpus();
9074 } 9128 }
9075 rcu_expedited_state = RCU_EXPEDITED_STATE_POST; 9129 rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
9076 for_each_online_cpu(cpu) { 9130 for_each_online_cpu(cpu) {
9077 rq = cpu_rq(cpu); 9131 rq = cpu_rq(cpu);
9078 req = &per_cpu(rcu_migration_req, cpu); 9132 req = &per_cpu(rcu_migration_req, cpu);
9079 init_completion(&req->done); 9133 init_completion(&req->done);
9080 req->task = NULL; 9134 req->task = NULL;
9081 req->dest_cpu = RCU_MIGRATION_NEED_QS; 9135 req->dest_cpu = RCU_MIGRATION_NEED_QS;
9082 raw_spin_lock_irqsave(&rq->lock, flags); 9136 raw_spin_lock_irqsave(&rq->lock, flags);
9083 list_add(&req->list, &rq->migration_queue); 9137 list_add(&req->list, &rq->migration_queue);
9084 raw_spin_unlock_irqrestore(&rq->lock, flags); 9138 raw_spin_unlock_irqrestore(&rq->lock, flags);
9085 wake_up_process(rq->migration_thread); 9139 wake_up_process(rq->migration_thread);
9086 } 9140 }
9087 for_each_online_cpu(cpu) { 9141 for_each_online_cpu(cpu) {
9088 rcu_expedited_state = cpu; 9142 rcu_expedited_state = cpu;
9089 req = &per_cpu(rcu_migration_req, cpu); 9143 req = &per_cpu(rcu_migration_req, cpu);
9090 rq = cpu_rq(cpu); 9144 rq = cpu_rq(cpu);
9091 wait_for_completion(&req->done); 9145 wait_for_completion(&req->done);
9092 raw_spin_lock_irqsave(&rq->lock, flags); 9146 raw_spin_lock_irqsave(&rq->lock, flags);
9093 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) 9147 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
9094 need_full_sync = 1; 9148 need_full_sync = 1;
9095 req->dest_cpu = RCU_MIGRATION_IDLE; 9149 req->dest_cpu = RCU_MIGRATION_IDLE;
9096 raw_spin_unlock_irqrestore(&rq->lock, flags); 9150 raw_spin_unlock_irqrestore(&rq->lock, flags);
9097 } 9151 }
9098 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 9152 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9099 synchronize_sched_expedited_count++; 9153 synchronize_sched_expedited_count++;
9100 mutex_unlock(&rcu_sched_expedited_mutex); 9154 mutex_unlock(&rcu_sched_expedited_mutex);
9101 put_online_cpus(); 9155 put_online_cpus();
9102 if (need_full_sync) 9156 if (need_full_sync)
9103 synchronize_sched(); 9157 synchronize_sched();
9104 } 9158 }
9105 EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 9159 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
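The comment above synchronize_sched_expedited() carries the two rules that matter to callers: the primitive burns time on every CPU, so it is only for slow paths, and it must not be invoked while holding any lock that a CPU-hotplug notifier also takes, since the function itself calls get_online_cpus(). As a minimal, hypothetical illustration (my_entry, my_list and my_list_lock are made-up names, not part of this patch), a caller that wants rcu-sched readers gone quickly before freeing a list entry might look like this:

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct my_entry {				/* illustrative structure */
	struct list_head list;
	int data;
};

static LIST_HEAD(my_list);
static DEFINE_SPINLOCK(my_list_lock);		/* plain lock, never taken by a hotplug notifier */

/* Unlink an entry and free it once all rcu-sched readers are done. */
static void remove_and_free_entry(struct my_entry *e)
{
	spin_lock(&my_list_lock);
	list_del_rcu(&e->list);
	spin_unlock(&my_list_lock);

	/*
	 * Expedited grace period: expensive on all CPUs, and illegal here
	 * if my_list_lock were also acquired by a CPU-hotplug notifier.
	 */
	synchronize_sched_expedited();
	kfree(e);
}

Callers that cannot rule out such a lock dependency, or that sit on common-case paths, should stick with plain synchronize_sched().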
kernel/sched_idletask.c
1 /* 1 /*
2 * idle-task scheduling class. 2 * idle-task scheduling class.
3 * 3 *
4 * (NOTE: these are not related to SCHED_IDLE tasks which are 4 * (NOTE: these are not related to SCHED_IDLE tasks which are
5 * handled in sched_fair.c) 5 * handled in sched_fair.c)
6 */ 6 */
7 7
8 #ifdef CONFIG_SMP 8 #ifdef CONFIG_SMP
9 static int 9 static int
10 select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) 10 select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
11 { 11 {
12 return task_cpu(p); /* IDLE tasks are never migrated */ 12 return task_cpu(p); /* IDLE tasks are never migrated */
13 } 13 }
14 #endif /* CONFIG_SMP */ 14 #endif /* CONFIG_SMP */
15 /* 15 /*
16 * Idle tasks are unconditionally rescheduled: 16 * Idle tasks are unconditionally rescheduled:
17 */ 17 */
18 static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) 18 static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
19 { 19 {
20 resched_task(rq->idle); 20 resched_task(rq->idle);
21 } 21 }
22 22
23 static struct task_struct *pick_next_task_idle(struct rq *rq) 23 static struct task_struct *pick_next_task_idle(struct rq *rq)
24 { 24 {
25 schedstat_inc(rq, sched_goidle); 25 schedstat_inc(rq, sched_goidle);
26 /* adjust the active tasks as we might go into a long sleep */ 26 calc_load_account_idle(rq);
27 calc_load_account_active(rq);
28 return rq->idle; 27 return rq->idle;
29 } 28 }
30 29
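The only change to this file is in pick_next_task_idle() above: instead of folding the runqueue's load delta straight into the global count via calc_load_account_active(), a CPU going idle now hands the delta to calc_load_account_idle(), which parks it in a separate accumulator until the next LOAD_FREQ sample. The sketch below illustrates that idea; calc_load_account_idle() is the helper actually called above, while the accumulator and the fold helper are paraphrased from the changelog and the kernel/sched.c part of this patch, so treat them as illustrative rather than verbatim:

/* Deltas from CPUs going idle, folded in at the next LOAD_FREQ sample. */
static atomic_long_t calc_load_tasks_idle;

static void calc_load_account_idle(struct rq *this_rq)
{
	long delta = calc_load_fold_active(this_rq);	/* this CPU's nr_active delta */

	if (delta)
		atomic_long_add(delta, &calc_load_tasks_idle);	/* park it; leave the global count alone */
}

static long calc_load_fold_idle(void)
{
	long delta = 0;

	/* A racy read is tolerable: at worst the fold lands one sample later. */
	if (atomic_long_read(&calc_load_tasks_idle))
		delta = atomic_long_xchg(&calc_load_tasks_idle, 0);

	return delta;
}

This is what restores strict LOAD_FREQ-period sampling: calc_load_tasks is no longer decremented the moment a CPU goes NO_HZ idle, only when the next sample is actually taken.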
31 /* 30 /*
32 * It is not legal to sleep in the idle task - print a warning 31 * It is not legal to sleep in the idle task - print a warning
33 * message if some code attempts to do it: 32 * message if some code attempts to do it:
34 */ 33 */
35 static void 34 static void
36 dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) 35 dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
37 { 36 {
38 raw_spin_unlock_irq(&rq->lock); 37 raw_spin_unlock_irq(&rq->lock);
39 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 38 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
40 dump_stack(); 39 dump_stack();
41 raw_spin_lock_irq(&rq->lock); 40 raw_spin_lock_irq(&rq->lock);
42 } 41 }
43 42
44 static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) 43 static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
45 { 44 {
46 } 45 }
47 46
48 static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 47 static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
49 { 48 {
50 } 49 }
51 50
52 static void set_curr_task_idle(struct rq *rq) 51 static void set_curr_task_idle(struct rq *rq)
53 { 52 {
54 } 53 }
55 54
56 static void switched_to_idle(struct rq *rq, struct task_struct *p, 55 static void switched_to_idle(struct rq *rq, struct task_struct *p,
57 int running) 56 int running)
58 { 57 {
59 /* Can this actually happen?? */ 58 /* Can this actually happen?? */
60 if (running) 59 if (running)
61 resched_task(rq->curr); 60 resched_task(rq->curr);
62 else 61 else
63 check_preempt_curr(rq, p, 0); 62 check_preempt_curr(rq, p, 0);
64 } 63 }
65 64
66 static void prio_changed_idle(struct rq *rq, struct task_struct *p, 65 static void prio_changed_idle(struct rq *rq, struct task_struct *p,
67 int oldprio, int running) 66 int oldprio, int running)
68 { 67 {
69 /* This can happen for hotplug CPUs */ 68 /* This can happen for hotplug CPUs */
70 69
71 /* 70 /*
72 * Reschedule if we are currently running on this runqueue and 71 * Reschedule if we are currently running on this runqueue and
73 * our priority decreased, or if we are not currently running on 72 * our priority decreased, or if we are not currently running on
74 * this runqueue and our priority is higher than the current's 73 * this runqueue and our priority is higher than the current's
75 */ 74 */
76 if (running) { 75 if (running) {
77 if (p->prio > oldprio) 76 if (p->prio > oldprio)
78 resched_task(rq->curr); 77 resched_task(rq->curr);
79 } else 78 } else
80 check_preempt_curr(rq, p, 0); 79 check_preempt_curr(rq, p, 0);
81 } 80 }
82 81
83 static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) 82 static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
84 { 83 {
85 return 0; 84 return 0;
86 } 85 }
87 86
88 /* 87 /*
89 * Simple, special scheduling class for the per-CPU idle tasks: 88 * Simple, special scheduling class for the per-CPU idle tasks:
90 */ 89 */
91 static const struct sched_class idle_sched_class = { 90 static const struct sched_class idle_sched_class = {
92 /* .next is NULL */ 91 /* .next is NULL */
93 /* no enqueue/yield_task for idle tasks */ 92 /* no enqueue/yield_task for idle tasks */
94 93
95 /* dequeue is not valid, we print a debug message there: */ 94 /* dequeue is not valid, we print a debug message there: */
96 .dequeue_task = dequeue_task_idle, 95 .dequeue_task = dequeue_task_idle,
97 96
98 .check_preempt_curr = check_preempt_curr_idle, 97 .check_preempt_curr = check_preempt_curr_idle,
99 98
100 .pick_next_task = pick_next_task_idle, 99 .pick_next_task = pick_next_task_idle,
101 .put_prev_task = put_prev_task_idle, 100 .put_prev_task = put_prev_task_idle,
102 101
103 #ifdef CONFIG_SMP 102 #ifdef CONFIG_SMP
104 .select_task_rq = select_task_rq_idle, 103 .select_task_rq = select_task_rq_idle,
105 #endif 104 #endif
106 105
107 .set_curr_task = set_curr_task_idle, 106 .set_curr_task = set_curr_task_idle,
108 .task_tick = task_tick_idle, 107 .task_tick = task_tick_idle,
109 108
110 .get_rr_interval = get_rr_interval_idle, 109 .get_rr_interval = get_rr_interval_idle,
111 110
112 .prio_changed = prio_changed_idle, 111 .prio_changed = prio_changed_idle,
113 .switched_to = switched_to_idle, 112 .switched_to = switched_to_idle,
114 113
115 /* no .task_new for idle tasks */ 114 /* no .task_new for idle tasks */
116 }; 115 };
117 116
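For orientation, idle_sched_class sits at the tail of the scheduler-class chain (its .next is NULL, as the first comment in the initializer notes), so the core pick loop in kernel/sched.c can rely on pick_next_task_idle() producing rq->idle once every other class has come up empty. A simplified sketch of that core loop, included here only for context and not part of this patch:

static inline struct task_struct *pick_next_task(struct rq *rq)
{
	const struct sched_class *class;
	struct task_struct *p;

	/* Walk the classes highest-priority first; the idle class is last. */
	for (class = sched_class_highest; class; class = class->next) {
		p = class->pick_next_task(rq);
		if (p)
			return p;
	}

	BUG();	/* never reached: the idle class always returns rq->idle */
}

Because that fallback runs exactly when a CPU is about to go idle, pick_next_task_idle() is a natural hook for the load-accounting change made by this patch.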