Commit 56992309ccbe71f4321ddd50ee2f76f91b412c1a

Authored by Eric W. Biederman
1 parent ab09203e30

sysctl kernel: Remove binary sysctl logic

Now that sys_sysctl is a generic wrapper around /proc/sys, the .ctl_name and
.strategy members of sysctl tables are dead code.  Remove them.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
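
For context, a sketch of the kind of cleanup this commit performs on a sysctl
table entry. The entry below is illustrative and not taken from the files
changed here; the field names follow the pre-2.6.33 struct ctl_table:

        /* before: binary-sysctl plumbing still present (illustrative entry) */
        {
                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "example_knob",
                .data           = &example_knob,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
                .strategy       = sysctl_intvec,
        },

        /* after: only the /proc/sys path remains */
        {
                .procname       = "example_knob",
                .data           = &example_knob,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },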

Showing 3 changed files with 3 additions and 38 deletions

kernel/sched.c

/*
 *  kernel/sched.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 *
 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *              make semaphores SMP safe
 *  1998-11-19  Implemented schedule_timeout() and related stuff
 *              by Andrea Arcangeli
 *  2002-01-04  New ultra-scalable O(1) scheduler by Ingo Molnar:
 *              hybrid priority-list and round-robin design with
 *              an array-switch method of distributing timeslices
 *              and per-CPU runqueues.  Cleanups and useful suggestions
 *              by Davide Libenzi, preemptible kernel bits by Robert Love.
 *  2003-09-03  Interactivity tuning by Con Kolivas.
 *  2004-04-02  Scheduler domains code by Nick Piggin
 *  2007-04-15  Work begun on replacing all interactivity tuning with a
 *              fair scheduling design by Con Kolivas.
 *  2007-05-05  Load balancing (smp-nice) and other improvements
 *              by Peter Williams
 *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
 *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
 *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
 *              Thomas Gleixner, Mike Kravetz
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>

#include <asm/tlb.h>
#include <asm/irq_regs.h>

#include "sched_cpupri.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 */
#define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)

/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters,
 * it's a [ 0 ... 39 ] range.
 */
#define USER_PRIO(p)            ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p)       USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO           (USER_PRIO(MAX_PRIO))

/*
 * Helpers for converting nanosecond timing to jiffy resolution
 */
#define NS_TO_JIFFIES(TIME)     ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))

#define NICE_0_LOAD             SCHED_LOAD_SCALE
#define NICE_0_SHIFT            SCHED_LOAD_SHIFT

/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
 * Timeslices get refilled after they expire.
 */
#define DEF_TIMESLICE           (100 * HZ / 1000)

/*
 * single value that denotes runtime == period, ie unlimited time.
 */
#define RUNTIME_INF     ((u64)~0ULL)

static inline int rt_policy(int policy)
{
        if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
                return 1;
        return 0;
}

static inline int task_has_rt_policy(struct task_struct *p)
{
        return rt_policy(p->policy);
}

/*
 * This is the priority-queue data structure of the RT scheduling class:
 */
struct rt_prio_array {
        DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
        struct list_head queue[MAX_RT_PRIO];
};

struct rt_bandwidth {
        /* nests inside the rq lock: */
        spinlock_t              rt_runtime_lock;
        ktime_t                 rt_period;
        u64                     rt_runtime;
        struct hrtimer          rt_period_timer;
};

static struct rt_bandwidth def_rt_bandwidth;

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
        struct rt_bandwidth *rt_b =
                container_of(timer, struct rt_bandwidth, rt_period_timer);
        ktime_t now;
        int overrun;
        int idle = 0;

        for (;;) {
                now = hrtimer_cb_get_time(timer);
                overrun = hrtimer_forward(timer, now, rt_b->rt_period);

                if (!overrun)
                        break;

                idle = do_sched_rt_period_timer(rt_b, overrun);
        }

        return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

static
void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
        rt_b->rt_period = ns_to_ktime(period);
        rt_b->rt_runtime = runtime;

        spin_lock_init(&rt_b->rt_runtime_lock);

        hrtimer_init(&rt_b->rt_period_timer,
                        CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        rt_b->rt_period_timer.function = sched_rt_period_timer;
}

static inline int rt_bandwidth_enabled(void)
{
        return sysctl_sched_rt_runtime >= 0;
}

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
        ktime_t now;

        if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
                return;

        if (hrtimer_active(&rt_b->rt_period_timer))
                return;

        spin_lock(&rt_b->rt_runtime_lock);
        for (;;) {
                unsigned long delta;
                ktime_t soft, hard;

                if (hrtimer_active(&rt_b->rt_period_timer))
                        break;

                now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
                hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);

                soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
                hard = hrtimer_get_expires(&rt_b->rt_period_timer);
                delta = ktime_to_ns(ktime_sub(hard, soft));
                __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
                                HRTIMER_MODE_ABS_PINNED, 0);
        }
        spin_unlock(&rt_b->rt_runtime_lock);
}

#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
        hrtimer_cancel(&rt_b->rt_period_timer);
}
#endif

/*
 * sched_domains_mutex serializes calls to arch_init_sched_domains,
 * detach_destroy_domains and partition_sched_domains.
 */
static DEFINE_MUTEX(sched_domains_mutex);

#ifdef CONFIG_GROUP_SCHED

#include <linux/cgroup.h>

struct cfs_rq;

static LIST_HEAD(task_groups);

/* task group related information */
struct task_group {
#ifdef CONFIG_CGROUP_SCHED
        struct cgroup_subsys_state css;
#endif

#ifdef CONFIG_USER_SCHED
        uid_t uid;
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
        /* schedulable entities of this group on each cpu */
        struct sched_entity **se;
        /* runqueue "owned" by this group on each cpu */
        struct cfs_rq **cfs_rq;
        unsigned long shares;
#endif

#ifdef CONFIG_RT_GROUP_SCHED
        struct sched_rt_entity **rt_se;
        struct rt_rq **rt_rq;

        struct rt_bandwidth rt_bandwidth;
#endif

        struct rcu_head rcu;
        struct list_head list;

        struct task_group *parent;
        struct list_head siblings;
        struct list_head children;
};

#ifdef CONFIG_USER_SCHED

/* Helper function to pass uid information to create_sched_user() */
void set_tg_uid(struct user_struct *user)
{
        user->tg->uid = user->uid;
}

/*
 * Root task group.
 *      Every UID task group (including init_task_group aka UID-0) will
 *      be a child to this group.
 */
struct task_group root_task_group;

#ifdef CONFIG_FAIR_GROUP_SCHED
/* Default task group's sched entity on each cpu */
static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
/* Default task group's cfs_rq on each cpu */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
#endif /* CONFIG_FAIR_GROUP_SCHED */

#ifdef CONFIG_RT_GROUP_SCHED
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
#endif /* CONFIG_RT_GROUP_SCHED */
#else /* !CONFIG_USER_SCHED */
#define root_task_group init_task_group
#endif /* CONFIG_USER_SCHED */

/* task_group_lock serializes add/remove of task groups and also changes to
 * a task group's cpu shares.
 */
static DEFINE_SPINLOCK(task_group_lock);

#ifdef CONFIG_SMP
static int root_task_group_empty(void)
{
        return list_empty(&root_task_group.children);
}
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_USER_SCHED
# define INIT_TASK_GROUP_LOAD   (2*NICE_0_LOAD)
#else /* !CONFIG_USER_SCHED */
# define INIT_TASK_GROUP_LOAD   NICE_0_LOAD
#endif /* CONFIG_USER_SCHED */

/*
 * A weight of 0 or 1 can cause arithmetics problems.
 * A weight of a cfs_rq is the sum of weights of which entities
 * are queued on this cfs_rq, so a weight of a entity should not be
 * too large, so as the shares value of a task group.
 * (The default weight is 1024 - so there's no practical
 *  limitation from this.)
 */
#define MIN_SHARES      2
#define MAX_SHARES      (1UL << 18)

static int init_task_group_load = INIT_TASK_GROUP_LOAD;
#endif

/* Default task group.
 *      Every task in system belong to this group at bootup.
 */
struct task_group init_task_group;

/* return group to which a task belongs */
static inline struct task_group *task_group(struct task_struct *p)
{
        struct task_group *tg;

#ifdef CONFIG_USER_SCHED
        rcu_read_lock();
        tg = __task_cred(p)->user->tg;
        rcu_read_unlock();
#elif defined(CONFIG_CGROUP_SCHED)
        tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
                                struct task_group, css);
#else
        tg = &init_task_group;
#endif
        return tg;
}

/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
        p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
        p->se.parent = task_group(p)->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
        p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
        p->rt.parent = task_group(p)->rt_se[cpu];
#endif
}

#else

static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
static inline struct task_group *task_group(struct task_struct *p)
{
        return NULL;
}

#endif  /* CONFIG_GROUP_SCHED */

/* CFS-related fields in a runqueue */
struct cfs_rq {
        struct load_weight load;
        unsigned long nr_running;

        u64 exec_clock;
        u64 min_vruntime;

        struct rb_root tasks_timeline;
        struct rb_node *rb_leftmost;

        struct list_head tasks;
        struct list_head *balance_iterator;

        /*
         * 'curr' points to currently running entity on this cfs_rq.
         * It is set to NULL otherwise (i.e when none are currently running).
         */
        struct sched_entity *curr, *next, *last;

        unsigned int nr_spread_over;

#ifdef CONFIG_FAIR_GROUP_SCHED
        struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */

        /*
         * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
         * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
         * (like users, containers etc.)
         *
         * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
         * list is used during load balance.
         */
        struct list_head leaf_cfs_rq_list;
        struct task_group *tg;  /* group that "owns" this runqueue */

#ifdef CONFIG_SMP
        /*
         * the part of load.weight contributed by tasks
         */
        unsigned long task_weight;

        /*
         *   h_load = weight * f(tg)
         *
         * Where f(tg) is the recursive weight fraction assigned to
         * this group.
         */
        unsigned long h_load;

        /*
         * this cpu's part of tg->shares
         */
        unsigned long shares;

        /*
         * load.weight at the time we set shares
         */
        unsigned long rq_weight;
#endif
#endif
};

/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
        struct rt_prio_array active;
        unsigned long rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
        struct {
                int curr; /* highest queued rt task prio */
#ifdef CONFIG_SMP
                int next; /* next highest */
#endif
        } highest_prio;
#endif
#ifdef CONFIG_SMP
        unsigned long rt_nr_migratory;
        unsigned long rt_nr_total;
        int overloaded;
        struct plist_head pushable_tasks;
#endif
        int rt_throttled;
        u64 rt_time;
        u64 rt_runtime;
        /* Nests inside the rq lock: */
        spinlock_t rt_runtime_lock;

#ifdef CONFIG_RT_GROUP_SCHED
        unsigned long rt_nr_boosted;

        struct rq *rq;
        struct list_head leaf_rt_rq_list;
        struct task_group *tg;
        struct sched_rt_entity *rt_se;
#endif
};

#ifdef CONFIG_SMP

/*
 * We add the notion of a root-domain which will be used to define per-domain
 * variables. Each exclusive cpuset essentially defines an island domain by
 * fully partitioning the member cpus from any other cpuset. Whenever a new
 * exclusive cpuset is created, we also create and attach a new root-domain
 * object.
 *
 */
struct root_domain {
        atomic_t refcount;
        cpumask_var_t span;
        cpumask_var_t online;

        /*
         * The "RT overload" flag: it gets set if a CPU has more than
         * one runnable RT task.
         */
        cpumask_var_t rto_mask;
        atomic_t rto_count;
#ifdef CONFIG_SMP
        struct cpupri cpupri;
#endif
};

/*
 * By default the system creates a single root-domain with all cpus as
 * members (mimicking the global state we have today).
 */
static struct root_domain def_root_domain;

#endif

/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */
struct rq {
        /* runqueue lock: */
        spinlock_t lock;

        /*
         * nr_running and cpu_load should be in the same cacheline because
         * remote CPUs use both these fields when doing load calculation.
         */
        unsigned long nr_running;
        #define CPU_LOAD_IDX_MAX 5
        unsigned long cpu_load[CPU_LOAD_IDX_MAX];
#ifdef CONFIG_NO_HZ
        unsigned long last_tick_seen;
        unsigned char in_nohz_recently;
#endif
        /* capture load from *all* tasks on this cpu: */
        struct load_weight load;
        unsigned long nr_load_updates;
        u64 nr_switches;
        u64 nr_migrations_in;

        struct cfs_rq cfs;
        struct rt_rq rt;

#ifdef CONFIG_FAIR_GROUP_SCHED
        /* list of leaf cfs_rq on this cpu: */
        struct list_head leaf_cfs_rq_list;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
        struct list_head leaf_rt_rq_list;
#endif

        /*
         * This is part of a global counter where only the total sum
         * over all CPUs matters. A task can increase this counter on
         * one CPU and if it got migrated afterwards it may decrease
         * it on another CPU. Always updated under the runqueue lock:
         */
        unsigned long nr_uninterruptible;

        struct task_struct *curr, *idle;
        unsigned long next_balance;
        struct mm_struct *prev_mm;

        u64 clock;

        atomic_t nr_iowait;

#ifdef CONFIG_SMP
        struct root_domain *rd;
        struct sched_domain *sd;

        unsigned char idle_at_tick;
        /* For active balancing */
        int post_schedule;
        int active_balance;
        int push_cpu;
        /* cpu of this runqueue: */
        int cpu;
        int online;

        unsigned long avg_load_per_task;

        struct task_struct *migration_thread;
        struct list_head migration_queue;

        u64 rt_avg;
        u64 age_stamp;
#endif

        /* calc_load related fields */
        unsigned long calc_load_update;
        long calc_load_active;

#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
        int hrtick_csd_pending;
        struct call_single_data hrtick_csd;
#endif
        struct hrtimer hrtick_timer;
#endif

#ifdef CONFIG_SCHEDSTATS
        /* latency stats */
        struct sched_info rq_sched_info;
        unsigned long long rq_cpu_time;
        /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */

        /* sys_sched_yield() stats */
        unsigned int yld_count;

        /* schedule() stats */
        unsigned int sched_switch;
        unsigned int sched_count;
        unsigned int sched_goidle;

        /* try_to_wake_up() stats */
        unsigned int ttwu_count;
        unsigned int ttwu_local;

        /* BKL stats */
        unsigned int bkl_count;
#endif
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static inline
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
        rq->curr->sched_class->check_preempt_curr(rq, p, flags);
}

static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
        return rq->cpu;
#else
        return 0;
#endif
}

/*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * See detach_destroy_domains: synchronize_sched for details.
 *
 * The domain tree of any CPU may only be accessed from within
 * preempt-disabled sections.
 */
#define for_each_domain(cpu, __sd) \
        for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

#define cpu_rq(cpu)             (&per_cpu(runqueues, (cpu)))
#define this_rq()               (&__get_cpu_var(runqueues))
#define task_rq(p)              cpu_rq(task_cpu(p))
#define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
#define raw_rq()                (&__raw_get_cpu_var(runqueues))

inline void update_rq_clock(struct rq *rq)
{
        rq->clock = sched_clock_cpu(cpu_of(rq));
}

/*
 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
 */
#ifdef CONFIG_SCHED_DEBUG
# define const_debug __read_mostly
#else
# define const_debug static const
#endif

/**
 * runqueue_is_locked
 * @cpu: the processor in question.
 *
 * Returns true if the current cpu runqueue is locked.
 * This interface allows printk to be called with the runqueue lock
 * held and know whether or not it is OK to wake up the klogd.
 */
int runqueue_is_locked(int cpu)
{
        return spin_is_locked(&cpu_rq(cpu)->lock);
}

/*
 * Debugging: various feature bits
 */

#define SCHED_FEAT(name, enabled)       \
        __SCHED_FEAT_##name ,

enum {
#include "sched_features.h"
};

#undef SCHED_FEAT

#define SCHED_FEAT(name, enabled)       \
        (1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "sched_features.h"
        0;

#undef SCHED_FEAT

#ifdef CONFIG_SCHED_DEBUG
#define SCHED_FEAT(name, enabled)       \
        #name ,

static __read_mostly char *sched_feat_names[] = {
#include "sched_features.h"
        NULL
};

#undef SCHED_FEAT

static int sched_feat_show(struct seq_file *m, void *v)
{
        int i;

        for (i = 0; sched_feat_names[i]; i++) {
                if (!(sysctl_sched_features & (1UL << i)))
                        seq_puts(m, "NO_");
                seq_printf(m, "%s ", sched_feat_names[i]);
        }
        seq_puts(m, "\n");

        return 0;
}

static ssize_t
sched_feat_write(struct file *filp, const char __user *ubuf,
                size_t cnt, loff_t *ppos)
{
        char buf[64];
        char *cmp = buf;
        int neg = 0;
        int i;

        if (cnt > 63)
                cnt = 63;

        if (copy_from_user(&buf, ubuf, cnt))
                return -EFAULT;

        buf[cnt] = 0;

        if (strncmp(buf, "NO_", 3) == 0) {
                neg = 1;
                cmp += 3;
        }

        for (i = 0; sched_feat_names[i]; i++) {
                int len = strlen(sched_feat_names[i]);

                if (strncmp(cmp, sched_feat_names[i], len) == 0) {
                        if (neg)
                                sysctl_sched_features &= ~(1UL << i);
                        else
                                sysctl_sched_features |= (1UL << i);
                        break;
                }
        }

        if (!sched_feat_names[i])
                return -EINVAL;

        filp->f_pos += cnt;

        return cnt;
}

static int sched_feat_open(struct inode *inode, struct file *filp)
{
        return single_open(filp, sched_feat_show, NULL);
}

static const struct file_operations sched_feat_fops = {
        .open           = sched_feat_open,
        .write          = sched_feat_write,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};

static __init int sched_init_debug(void)
{
        debugfs_create_file("sched_features", 0644, NULL, NULL,
                        &sched_feat_fops);

        return 0;
}
late_initcall(sched_init_debug);

#endif

#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * ratelimit for updating the group shares.
 * default: 0.25ms
 */
unsigned int sysctl_sched_shares_ratelimit = 250000;

/*
 * Inject some fuzzyness into changing the per-cpu group shares
 * this avoids remote rq-locks at the expense of fairness.
 * default: 4
 */
unsigned int sysctl_sched_shares_thresh = 4;

/*
 * period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

/*
 * period over which we measure -rt task cpu usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

static __read_mostly int scheduler_running;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

static inline u64 global_rt_period(void)
{
        return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
}

static inline u64 global_rt_runtime(void)
{
        if (sysctl_sched_rt_runtime < 0)
                return RUNTIME_INF;

        return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}

#ifndef prepare_arch_switch
# define prepare_arch_switch(next)      do { } while (0)
#endif
#ifndef finish_arch_switch
# define finish_arch_switch(prev)       do { } while (0)
#endif

static inline int task_current(struct rq *rq, struct task_struct *p)
{
        return rq->curr == p;
}

#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline int task_running(struct rq *rq, struct task_struct *p)
{
        return task_current(rq, p);
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_DEBUG_SPINLOCK
        /* this is a valid case when another task releases the spinlock */
        rq->lock.owner = current;
#endif
        /*
         * If we are tracking spinlock dependencies then we have to
         * fix up the runqueue lock - which gets 'carried over' from
         * prev into current:
         */
        spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);

        spin_unlock_irq(&rq->lock);
}

#else /* __ARCH_WANT_UNLOCKED_CTXSW */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
        return p->oncpu;
#else
        return task_current(rq, p);
#endif
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
        /*
         * We can optimise this out completely for !SMP, because the
         * SMP rebalancing from interrupt is the only thing that cares
         * here.
         */
        next->oncpu = 1;
#endif
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
        spin_unlock_irq(&rq->lock);
#else
        spin_unlock(&rq->lock);
#endif
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
        /*
         * After ->oncpu is cleared, the task can be moved to a different CPU.
         * We must ensure this doesn't happen until the switch is completely
         * finished.
         */
        smp_wmb();
        prev->oncpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
        local_irq_enable();
#endif
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */

/*
 * __task_rq_lock - lock the runqueue a given task resides on.
 * Must be called interrupts disabled.
 */
static inline struct rq *__task_rq_lock(struct task_struct *p)
        __acquires(rq->lock)
{
        for (;;) {
                struct rq *rq = task_rq(p);
                spin_lock(&rq->lock);
                if (likely(rq == task_rq(p)))
                        return rq;
                spin_unlock(&rq->lock);
        }
}

/*
 * task_rq_lock - lock the runqueue a given task resides on and disable
 * interrupts. Note the ordering: we can safely lookup the task_rq without
 * explicitly disabling preemption.
 */
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
        __acquires(rq->lock)
{
        struct rq *rq;

        for (;;) {
                local_irq_save(*flags);
                rq = task_rq(p);
                spin_lock(&rq->lock);
                if (likely(rq == task_rq(p)))
                        return rq;
                spin_unlock_irqrestore(&rq->lock, *flags);
        }
}

void task_rq_unlock_wait(struct task_struct *p)
{
        struct rq *rq = task_rq(p);

        smp_mb(); /* spin-unlock-wait is not a full memory barrier */
        spin_unlock_wait(&rq->lock);
}

static void __task_rq_unlock(struct rq *rq)
        __releases(rq->lock)
{
        spin_unlock(&rq->lock);
}

static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
        __releases(rq->lock)
{
        spin_unlock_irqrestore(&rq->lock, *flags);
}

/*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */
static struct rq *this_rq_lock(void)
        __acquires(rq->lock)
{
        struct rq *rq;

        local_irq_disable();
        rq = this_rq();
        spin_lock(&rq->lock);

        return rq;
}

#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 *
 * Its all a bit involved since we cannot program an hrt while holding the
 * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
 * reschedule event.
 *
 * When we get rescheduled we reprogram the hrtick_timer outside of the
 * rq->lock.
 */

/*
 * Use hrtick when:
 *  - enabled by features
 *  - hrtimer is actually high res
 */
static inline int hrtick_enabled(struct rq *rq)
{
        if (!sched_feat(HRTICK))
                return 0;
        if (!cpu_active(cpu_of(rq)))
                return 0;
        return hrtimer_is_hres_active(&rq->hrtick_timer);
}

static void hrtick_clear(struct rq *rq)
{
        if (hrtimer_active(&rq->hrtick_timer))
                hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
        struct rq *rq = container_of(timer, struct rq, hrtick_timer);

        WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

        spin_lock(&rq->lock);
        update_rq_clock(rq);
        rq->curr->sched_class->task_tick(rq, rq->curr, 1);
        spin_unlock(&rq->lock);

        return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP
/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
        struct rq *rq = arg;

        spin_lock(&rq->lock);
        hrtimer_restart(&rq->hrtick_timer);
        rq->hrtick_csd_pending = 0;
        spin_unlock(&rq->lock);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
static void hrtick_start(struct rq *rq, u64 delay)
{
        struct hrtimer *timer = &rq->hrtick_timer;
        ktime_t time = ktime_add_ns(timer->base->get_time(), delay);

        hrtimer_set_expires(timer, time);

        if (rq == this_rq()) {
                hrtimer_restart(timer);
        } else if (!rq->hrtick_csd_pending) {
                __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
                rq->hrtick_csd_pending = 1;
        }
}

static int
hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
        int cpu = (int)(long)hcpu;

        switch (action) {
        case CPU_UP_CANCELED:
        case CPU_UP_CANCELED_FROZEN:
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                hrtick_clear(cpu_rq(cpu));
                return NOTIFY_OK;
        }

        return NOTIFY_DONE;
}

static __init void init_hrtick(void)
{
        hotcpu_notifier(hotplug_hrtick, 0);
}
#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
static void hrtick_start(struct rq *rq, u64 delay)
{
        __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
                        HRTIMER_MODE_REL_PINNED, 0);
}

static inline void init_hrtick(void)
{
}
1134 #endif /* CONFIG_SMP */ 1134 #endif /* CONFIG_SMP */
1135 1135
1136 static void init_rq_hrtick(struct rq *rq) 1136 static void init_rq_hrtick(struct rq *rq)
1137 { 1137 {
1138 #ifdef CONFIG_SMP 1138 #ifdef CONFIG_SMP
1139 rq->hrtick_csd_pending = 0; 1139 rq->hrtick_csd_pending = 0;
1140 1140
1141 rq->hrtick_csd.flags = 0; 1141 rq->hrtick_csd.flags = 0;
1142 rq->hrtick_csd.func = __hrtick_start; 1142 rq->hrtick_csd.func = __hrtick_start;
1143 rq->hrtick_csd.info = rq; 1143 rq->hrtick_csd.info = rq;
1144 #endif 1144 #endif
1145 1145
1146 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1146 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1147 rq->hrtick_timer.function = hrtick; 1147 rq->hrtick_timer.function = hrtick;
1148 } 1148 }
1149 #else /* CONFIG_SCHED_HRTICK */ 1149 #else /* CONFIG_SCHED_HRTICK */
1150 static inline void hrtick_clear(struct rq *rq) 1150 static inline void hrtick_clear(struct rq *rq)
1151 { 1151 {
1152 } 1152 }
1153 1153
1154 static inline void init_rq_hrtick(struct rq *rq) 1154 static inline void init_rq_hrtick(struct rq *rq)
1155 { 1155 {
1156 } 1156 }
1157 1157
1158 static inline void init_hrtick(void) 1158 static inline void init_hrtick(void)
1159 { 1159 {
1160 } 1160 }
1161 #endif /* CONFIG_SCHED_HRTICK */ 1161 #endif /* CONFIG_SCHED_HRTICK */
1162 1162
1163 /* 1163 /*
1164 * resched_task - mark a task 'to be rescheduled now'. 1164 * resched_task - mark a task 'to be rescheduled now'.
1165 * 1165 *
1166 * On UP this means the setting of the need_resched flag, on SMP it 1166 * On UP this means the setting of the need_resched flag, on SMP it
1167 * might also involve a cross-CPU call to trigger the scheduler on 1167 * might also involve a cross-CPU call to trigger the scheduler on
1168 * the target CPU. 1168 * the target CPU.
1169 */ 1169 */
1170 #ifdef CONFIG_SMP 1170 #ifdef CONFIG_SMP
1171 1171
1172 #ifndef tsk_is_polling 1172 #ifndef tsk_is_polling
1173 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 1173 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1174 #endif 1174 #endif
1175 1175
1176 static void resched_task(struct task_struct *p) 1176 static void resched_task(struct task_struct *p)
1177 { 1177 {
1178 int cpu; 1178 int cpu;
1179 1179
1180 assert_spin_locked(&task_rq(p)->lock); 1180 assert_spin_locked(&task_rq(p)->lock);
1181 1181
1182 if (test_tsk_need_resched(p)) 1182 if (test_tsk_need_resched(p))
1183 return; 1183 return;
1184 1184
1185 set_tsk_need_resched(p); 1185 set_tsk_need_resched(p);
1186 1186
1187 cpu = task_cpu(p); 1187 cpu = task_cpu(p);
1188 if (cpu == smp_processor_id()) 1188 if (cpu == smp_processor_id())
1189 return; 1189 return;
1190 1190
1191 /* NEED_RESCHED must be visible before we test polling */ 1191 /* NEED_RESCHED must be visible before we test polling */
1192 smp_mb(); 1192 smp_mb();
1193 if (!tsk_is_polling(p)) 1193 if (!tsk_is_polling(p))
1194 smp_send_reschedule(cpu); 1194 smp_send_reschedule(cpu);
1195 } 1195 }
1196 1196
1197 static void resched_cpu(int cpu) 1197 static void resched_cpu(int cpu)
1198 { 1198 {
1199 struct rq *rq = cpu_rq(cpu); 1199 struct rq *rq = cpu_rq(cpu);
1200 unsigned long flags; 1200 unsigned long flags;
1201 1201
1202 if (!spin_trylock_irqsave(&rq->lock, flags)) 1202 if (!spin_trylock_irqsave(&rq->lock, flags))
1203 return; 1203 return;
1204 resched_task(cpu_curr(cpu)); 1204 resched_task(cpu_curr(cpu));
1205 spin_unlock_irqrestore(&rq->lock, flags); 1205 spin_unlock_irqrestore(&rq->lock, flags);
1206 } 1206 }
1207 1207
1208 #ifdef CONFIG_NO_HZ 1208 #ifdef CONFIG_NO_HZ
1209 /* 1209 /*
1210 * When add_timer_on() enqueues a timer into the timer wheel of an 1210 * When add_timer_on() enqueues a timer into the timer wheel of an
1211 * idle CPU then this timer might expire before the next timer event 1211 * idle CPU then this timer might expire before the next timer event
1212 * which is scheduled to wake up that CPU. In case of a completely 1212 * which is scheduled to wake up that CPU. In case of a completely
1213 * idle system the next event might even be infinite time into the 1213 * idle system the next event might even be infinite time into the
1214 * future. wake_up_idle_cpu() ensures that the CPU is woken up and 1214 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1215 * leaves the inner idle loop so the newly added timer is taken into 1215 * leaves the inner idle loop so the newly added timer is taken into
1216 * account when the CPU goes back to idle and evaluates the timer 1216 * account when the CPU goes back to idle and evaluates the timer
1217 * wheel for the next timer event. 1217 * wheel for the next timer event.
1218 */ 1218 */
1219 void wake_up_idle_cpu(int cpu) 1219 void wake_up_idle_cpu(int cpu)
1220 { 1220 {
1221 struct rq *rq = cpu_rq(cpu); 1221 struct rq *rq = cpu_rq(cpu);
1222 1222
1223 if (cpu == smp_processor_id()) 1223 if (cpu == smp_processor_id())
1224 return; 1224 return;
1225 1225
1226 /* 1226 /*
1227 * This is safe, as this function is called with the timer 1227 * This is safe, as this function is called with the timer
1228 * wheel base lock of (cpu) held. When the CPU is on the way 1228 * wheel base lock of (cpu) held. When the CPU is on the way
1229 * to idle and has not yet set rq->curr to idle then it will 1229 * to idle and has not yet set rq->curr to idle then it will
1230 * be serialized on the timer wheel base lock and take the new 1230 * be serialized on the timer wheel base lock and take the new
1231 * timer into account automatically. 1231 * timer into account automatically.
1232 */ 1232 */
1233 if (rq->curr != rq->idle) 1233 if (rq->curr != rq->idle)
1234 return; 1234 return;
1235 1235
1236 /* 1236 /*
1237 * We can set TIF_RESCHED on the idle task of the other CPU 1237 * We can set TIF_RESCHED on the idle task of the other CPU
1238 * lockless. The worst case is that the other CPU runs the 1238 * lockless. The worst case is that the other CPU runs the
1239 * idle task through an additional NOOP schedule() 1239 * idle task through an additional NOOP schedule()
1240 */ 1240 */
1241 set_tsk_need_resched(rq->idle); 1241 set_tsk_need_resched(rq->idle);
1242 1242
1243 /* NEED_RESCHED must be visible before we test polling */ 1243 /* NEED_RESCHED must be visible before we test polling */
1244 smp_mb(); 1244 smp_mb();
1245 if (!tsk_is_polling(rq->idle)) 1245 if (!tsk_is_polling(rq->idle))
1246 smp_send_reschedule(cpu); 1246 smp_send_reschedule(cpu);
1247 } 1247 }
1248 #endif /* CONFIG_NO_HZ */ 1248 #endif /* CONFIG_NO_HZ */
1249 1249
1250 static u64 sched_avg_period(void) 1250 static u64 sched_avg_period(void)
1251 { 1251 {
1252 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; 1252 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1253 } 1253 }
1254 1254
1255 static void sched_avg_update(struct rq *rq) 1255 static void sched_avg_update(struct rq *rq)
1256 { 1256 {
1257 s64 period = sched_avg_period(); 1257 s64 period = sched_avg_period();
1258 1258
1259 while ((s64)(rq->clock - rq->age_stamp) > period) { 1259 while ((s64)(rq->clock - rq->age_stamp) > period) {
1260 rq->age_stamp += period; 1260 rq->age_stamp += period;
1261 rq->rt_avg /= 2; 1261 rq->rt_avg /= 2;
1262 } 1262 }
1263 } 1263 }
1264 1264
1265 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1265 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1266 { 1266 {
1267 rq->rt_avg += rt_delta; 1267 rq->rt_avg += rt_delta;
1268 sched_avg_update(rq); 1268 sched_avg_update(rq);
1269 } 1269 }
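
[Editor's illustration, not part of this diff.] sched_avg_update() halves rq->rt_avg once per elapsed averaging period, so RT time observed long ago contributes geometrically less. A small userspace model of that decay; the period and the initial numbers are made up, only the halving loop mirrors the kernel logic.

#include <stdio.h>

int main(void)
{
        /* Illustrative values, not the kernel defaults. */
        long long period    = 500000000LL;   /* 500 ms in ns          */
        long long clock     = 2600000000LL;  /* "now"                 */
        long long age_stamp = 1000000000LL;  /* time of last update   */
        long long rt_avg    = 400000000LL;   /* accumulated RT time   */

        /* Same loop shape as sched_avg_update(). */
        while (clock - age_stamp > period) {
                age_stamp += period;
                rt_avg /= 2;
        }

        /* Three full periods elapsed, so rt_avg was halved three times. */
        printf("rt_avg after aging: %lld\n", rt_avg);  /* 400000000/8 = 50000000 */
        return 0;
}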
1270 1270
1271 #else /* !CONFIG_SMP */ 1271 #else /* !CONFIG_SMP */
1272 static void resched_task(struct task_struct *p) 1272 static void resched_task(struct task_struct *p)
1273 { 1273 {
1274 assert_spin_locked(&task_rq(p)->lock); 1274 assert_spin_locked(&task_rq(p)->lock);
1275 set_tsk_need_resched(p); 1275 set_tsk_need_resched(p);
1276 } 1276 }
1277 1277
1278 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1278 static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1279 { 1279 {
1280 } 1280 }
1281 #endif /* CONFIG_SMP */ 1281 #endif /* CONFIG_SMP */
1282 1282
1283 #if BITS_PER_LONG == 32 1283 #if BITS_PER_LONG == 32
1284 # define WMULT_CONST (~0UL) 1284 # define WMULT_CONST (~0UL)
1285 #else 1285 #else
1286 # define WMULT_CONST (1UL << 32) 1286 # define WMULT_CONST (1UL << 32)
1287 #endif 1287 #endif
1288 1288
1289 #define WMULT_SHIFT 32 1289 #define WMULT_SHIFT 32
1290 1290
1291 /* 1291 /*
1292 * Shift right and round: 1292 * Shift right and round:
1293 */ 1293 */
1294 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 1294 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1295 1295
1296 /* 1296 /*
1297 * delta *= weight / lw 1297 * delta *= weight / lw
1298 */ 1298 */
1299 static unsigned long 1299 static unsigned long
1300 calc_delta_mine(unsigned long delta_exec, unsigned long weight, 1300 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1301 struct load_weight *lw) 1301 struct load_weight *lw)
1302 { 1302 {
1303 u64 tmp; 1303 u64 tmp;
1304 1304
1305 if (!lw->inv_weight) { 1305 if (!lw->inv_weight) {
1306 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) 1306 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1307 lw->inv_weight = 1; 1307 lw->inv_weight = 1;
1308 else 1308 else
1309 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) 1309 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1310 / (lw->weight+1); 1310 / (lw->weight+1);
1311 } 1311 }
1312 1312
1313 tmp = (u64)delta_exec * weight; 1313 tmp = (u64)delta_exec * weight;
1314 /* 1314 /*
1315 * Check whether we'd overflow the 64-bit multiplication: 1315 * Check whether we'd overflow the 64-bit multiplication:
1316 */ 1316 */
1317 if (unlikely(tmp > WMULT_CONST)) 1317 if (unlikely(tmp > WMULT_CONST))
1318 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, 1318 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1319 WMULT_SHIFT/2); 1319 WMULT_SHIFT/2);
1320 else 1320 else
1321 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); 1321 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1322 1322
1323 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 1323 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1324 } 1324 }
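
[Editor's illustration, not part of this diff.] For readers new to the fixed-point trick: lw->inv_weight approximates 2^32 / lw->weight, so the division delta * weight / lw->weight becomes a multiply plus a rounded right shift (SRR). A standalone sketch with invented inputs that mirrors only the arithmetic of calc_delta_mine(); it prints the exact quotient next to the fixed-point result so the two can be compared.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define WMULT_CONST (1ULL << 32)
#define WMULT_SHIFT 32
/* Shift right and round, same as the SRR() macro above. */
#define SRR(x, y) (((x) + (1ULL << ((y) - 1))) >> (y))

int main(void)
{
        /* Illustrative inputs: 3 ms of runtime for a nice-0 task on a
         * queue whose total load weight happens to be 3071 (made up). */
        uint64_t delta_exec = 3000000;
        uint64_t weight     = 1024;
        uint64_t lw_weight  = 3071;

        /* inv_weight ~= 2^32 / lw_weight, computed the way calc_delta_mine() does. */
        uint64_t inv_weight = 1 + (WMULT_CONST - lw_weight / 2) / (lw_weight + 1);

        uint64_t tmp = delta_exec * weight;
        uint64_t fixed_point;

        if (tmp > WMULT_CONST)
                fixed_point = SRR(SRR(tmp, WMULT_SHIFT / 2) * inv_weight,
                                  WMULT_SHIFT / 2);
        else
                fixed_point = SRR(tmp * inv_weight, WMULT_SHIFT);

        printf("exact  : %" PRIu64 "\n", delta_exec * weight / lw_weight);
        printf("approx : %" PRIu64 "\n", fixed_point);
        return 0;
}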
1325 1325
1326 static inline void update_load_add(struct load_weight *lw, unsigned long inc) 1326 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1327 { 1327 {
1328 lw->weight += inc; 1328 lw->weight += inc;
1329 lw->inv_weight = 0; 1329 lw->inv_weight = 0;
1330 } 1330 }
1331 1331
1332 static inline void update_load_sub(struct load_weight *lw, unsigned long dec) 1332 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1333 { 1333 {
1334 lw->weight -= dec; 1334 lw->weight -= dec;
1335 lw->inv_weight = 0; 1335 lw->inv_weight = 0;
1336 } 1336 }
1337 1337
1338 /* 1338 /*
1339 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1339 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1340 * of tasks with abnormal "nice" values across CPUs the contribution that 1340 * of tasks with abnormal "nice" values across CPUs the contribution that
1341 * each task makes to its run queue's load is weighted according to its 1341 * each task makes to its run queue's load is weighted according to its
1342 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a 1342 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1343 * scaled version of the new time slice allocation that they receive on time 1343 * scaled version of the new time slice allocation that they receive on time
1344 * slice expiry etc. 1344 * slice expiry etc.
1345 */ 1345 */
1346 1346
1347 #define WEIGHT_IDLEPRIO 3 1347 #define WEIGHT_IDLEPRIO 3
1348 #define WMULT_IDLEPRIO 1431655765 1348 #define WMULT_IDLEPRIO 1431655765
1349 1349
1350 /* 1350 /*
1351 * Nice levels are multiplicative, with a gentle 10% change for every 1351 * Nice levels are multiplicative, with a gentle 10% change for every
1352 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to 1352 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1353 * nice 1, it will get ~10% less CPU time than another CPU-bound task 1353 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1354 * that remained on nice 0. 1354 * that remained on nice 0.
1355 * 1355 *
1356 * The "10% effect" is relative and cumulative: from _any_ nice level, 1356 * The "10% effect" is relative and cumulative: from _any_ nice level,
1357 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level 1357 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1358 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. 1358 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1359 * If a task goes up by ~10% and another task goes down by ~10% then 1359 * If a task goes up by ~10% and another task goes down by ~10% then
1360 * the relative distance between them is ~25%.) 1360 * the relative distance between them is ~25%.)
1361 */ 1361 */
1362 static const int prio_to_weight[40] = { 1362 static const int prio_to_weight[40] = {
1363 /* -20 */ 88761, 71755, 56483, 46273, 36291, 1363 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1364 /* -15 */ 29154, 23254, 18705, 14949, 11916, 1364 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1365 /* -10 */ 9548, 7620, 6100, 4904, 3906, 1365 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1366 /* -5 */ 3121, 2501, 1991, 1586, 1277, 1366 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1367 /* 0 */ 1024, 820, 655, 526, 423, 1367 /* 0 */ 1024, 820, 655, 526, 423,
1368 /* 5 */ 335, 272, 215, 172, 137, 1368 /* 5 */ 335, 272, 215, 172, 137,
1369 /* 10 */ 110, 87, 70, 56, 45, 1369 /* 10 */ 110, 87, 70, 56, 45,
1370 /* 15 */ 36, 29, 23, 18, 15, 1370 /* 15 */ 36, 29, 23, 18, 15,
1371 }; 1371 };
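
[Editor's illustration, not part of this diff.] To make the "10% effect" concrete: adjacent entries in prio_to_weight[] differ by roughly a factor of 1.25, so a nice-0 task (weight 1024) sharing a CPU with a nice-1 task (weight 820) gets about 1024 / (1024 + 820), i.e. ~55% of the CPU. A quick userspace check; the table values are copied from above, the rest is illustrative.

#include <stdio.h>

static const int prio_to_weight[40] = {
 /* -20 */ 88761, 71755, 56483, 46273, 36291,
 /* -15 */ 29154, 23254, 18705, 14949, 11916,
 /* -10 */  9548,  7620,  6100,  4904,  3906,
 /*  -5 */  3121,  2501,  1991,  1586,  1277,
 /*   0 */  1024,   820,   655,   526,   423,
 /*   5 */   335,   272,   215,   172,   137,
 /*  10 */   110,    87,    70,    56,    45,
 /*  15 */    36,    29,    23,    18,    15,
};

int main(void)
{
        int nice0 = prio_to_weight[20];   /* nice  0 -> 1024 */
        int nice1 = prio_to_weight[21];   /* nice +1 ->  820 */

        printf("weight ratio nice0/nice1    : %.3f\n", (double)nice0 / nice1);
        printf("CPU share of the nice-0 task: %.1f%%\n",
               100.0 * nice0 / (nice0 + nice1));
        return 0;
}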
1372 1372
1373 /* 1373 /*
1374 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. 1374 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1375 * 1375 *
1376 * In cases where the weight does not change often, we can use the 1376 * In cases where the weight does not change often, we can use the
1377 * precalculated inverse to speed up arithmetics by turning divisions 1377 * precalculated inverse to speed up arithmetics by turning divisions
1378 * into multiplications: 1378 * into multiplications:
1379 */ 1379 */
1380 static const u32 prio_to_wmult[40] = { 1380 static const u32 prio_to_wmult[40] = {
1381 /* -20 */ 48388, 59856, 76040, 92818, 118348, 1381 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1382 /* -15 */ 147320, 184698, 229616, 287308, 360437, 1382 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1383 /* -10 */ 449829, 563644, 704093, 875809, 1099582, 1383 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1384 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, 1384 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1385 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, 1385 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1386 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, 1386 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1387 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, 1387 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1388 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1388 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1389 }; 1389 };
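
[Editor's illustration, not part of this diff.] These entries are simply 2^32 / prio_to_weight[i]; for example 2^32 / 1024 = 4194304, exactly the nice-0 entry. A throwaway check for a few pairs copied from the two tables above.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main(void)
{
        /* (weight, wmult) pairs copied from prio_to_weight[] / prio_to_wmult[]. */
        struct { uint32_t weight, wmult; } pairs[] = {
                { 88761,     48388 },   /* nice -20 */
                {  1024,   4194304 },   /* nice   0 */
                {    15, 286331153 },   /* nice +19 */
        };

        for (unsigned i = 0; i < sizeof(pairs) / sizeof(pairs[0]); i++)
                printf("weight %6u: 2^32/weight = %10" PRIu64 ", table = %10u\n",
                       pairs[i].weight,
                       ((uint64_t)1 << 32) / pairs[i].weight,
                       pairs[i].wmult);
        return 0;
}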
1390 1390
1391 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); 1391 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1392 1392
1393 /* 1393 /*
1394 * runqueue iterator, to support SMP load-balancing between different 1394 * runqueue iterator, to support SMP load-balancing between different
1395 * scheduling classes, without having to expose their internal data 1395 * scheduling classes, without having to expose their internal data
1396 * structures to the load-balancing proper: 1396 * structures to the load-balancing proper:
1397 */ 1397 */
1398 struct rq_iterator { 1398 struct rq_iterator {
1399 void *arg; 1399 void *arg;
1400 struct task_struct *(*start)(void *); 1400 struct task_struct *(*start)(void *);
1401 struct task_struct *(*next)(void *); 1401 struct task_struct *(*next)(void *);
1402 }; 1402 };
1403 1403
1404 #ifdef CONFIG_SMP 1404 #ifdef CONFIG_SMP
1405 static unsigned long 1405 static unsigned long
1406 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 1406 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1407 unsigned long max_load_move, struct sched_domain *sd, 1407 unsigned long max_load_move, struct sched_domain *sd,
1408 enum cpu_idle_type idle, int *all_pinned, 1408 enum cpu_idle_type idle, int *all_pinned,
1409 int *this_best_prio, struct rq_iterator *iterator); 1409 int *this_best_prio, struct rq_iterator *iterator);
1410 1410
1411 static int 1411 static int
1412 iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, 1412 iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1413 struct sched_domain *sd, enum cpu_idle_type idle, 1413 struct sched_domain *sd, enum cpu_idle_type idle,
1414 struct rq_iterator *iterator); 1414 struct rq_iterator *iterator);
1415 #endif 1415 #endif
1416 1416
1417 /* Time spent by the tasks of the cpu accounting group executing in ... */ 1417 /* Time spent by the tasks of the cpu accounting group executing in ... */
1418 enum cpuacct_stat_index { 1418 enum cpuacct_stat_index {
1419 CPUACCT_STAT_USER, /* ... user mode */ 1419 CPUACCT_STAT_USER, /* ... user mode */
1420 CPUACCT_STAT_SYSTEM, /* ... kernel mode */ 1420 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1421 1421
1422 CPUACCT_STAT_NSTATS, 1422 CPUACCT_STAT_NSTATS,
1423 }; 1423 };
1424 1424
1425 #ifdef CONFIG_CGROUP_CPUACCT 1425 #ifdef CONFIG_CGROUP_CPUACCT
1426 static void cpuacct_charge(struct task_struct *tsk, u64 cputime); 1426 static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1427 static void cpuacct_update_stats(struct task_struct *tsk, 1427 static void cpuacct_update_stats(struct task_struct *tsk,
1428 enum cpuacct_stat_index idx, cputime_t val); 1428 enum cpuacct_stat_index idx, cputime_t val);
1429 #else 1429 #else
1430 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1430 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1431 static inline void cpuacct_update_stats(struct task_struct *tsk, 1431 static inline void cpuacct_update_stats(struct task_struct *tsk,
1432 enum cpuacct_stat_index idx, cputime_t val) {} 1432 enum cpuacct_stat_index idx, cputime_t val) {}
1433 #endif 1433 #endif
1434 1434
1435 static inline void inc_cpu_load(struct rq *rq, unsigned long load) 1435 static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1436 { 1436 {
1437 update_load_add(&rq->load, load); 1437 update_load_add(&rq->load, load);
1438 } 1438 }
1439 1439
1440 static inline void dec_cpu_load(struct rq *rq, unsigned long load) 1440 static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1441 { 1441 {
1442 update_load_sub(&rq->load, load); 1442 update_load_sub(&rq->load, load);
1443 } 1443 }
1444 1444
1445 #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) 1445 #if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1446 typedef int (*tg_visitor)(struct task_group *, void *); 1446 typedef int (*tg_visitor)(struct task_group *, void *);
1447 1447
1448 /* 1448 /*
1449 * Iterate the full tree, calling @down when first entering a node and @up when 1449 * Iterate the full tree, calling @down when first entering a node and @up when
1450 * leaving it for the final time. 1450 * leaving it for the final time.
1451 */ 1451 */
1452 static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) 1452 static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1453 { 1453 {
1454 struct task_group *parent, *child; 1454 struct task_group *parent, *child;
1455 int ret; 1455 int ret;
1456 1456
1457 rcu_read_lock(); 1457 rcu_read_lock();
1458 parent = &root_task_group; 1458 parent = &root_task_group;
1459 down: 1459 down:
1460 ret = (*down)(parent, data); 1460 ret = (*down)(parent, data);
1461 if (ret) 1461 if (ret)
1462 goto out_unlock; 1462 goto out_unlock;
1463 list_for_each_entry_rcu(child, &parent->children, siblings) { 1463 list_for_each_entry_rcu(child, &parent->children, siblings) {
1464 parent = child; 1464 parent = child;
1465 goto down; 1465 goto down;
1466 1466
1467 up: 1467 up:
1468 continue; 1468 continue;
1469 } 1469 }
1470 ret = (*up)(parent, data); 1470 ret = (*up)(parent, data);
1471 if (ret) 1471 if (ret)
1472 goto out_unlock; 1472 goto out_unlock;
1473 1473
1474 child = parent; 1474 child = parent;
1475 parent = parent->parent; 1475 parent = parent->parent;
1476 if (parent) 1476 if (parent)
1477 goto up; 1477 goto up;
1478 out_unlock: 1478 out_unlock:
1479 rcu_read_unlock(); 1479 rcu_read_unlock();
1480 1480
1481 return ret; 1481 return ret;
1482 } 1482 }
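
[Editor's illustration, not part of this diff.] walk_tg_tree() is an iterative depth-first traversal: @down fires when a group is first entered, @up when it is left for the last time, and the goto-based loop avoids recursion on a kernel stack. A recursive userspace equivalent over a toy tree; struct node and the sample tree are invented for illustration only.

#include <stdio.h>
#include <stddef.h>

struct node {
        const char *name;
        struct node *children[4];   /* NULL-terminated list of children */
};

typedef int (*visitor)(struct node *n, void *data);

/* Recursive equivalent of walk_tg_tree(): @down on entry, @up on final exit. */
static int walk_tree(struct node *n, visitor down, visitor up, void *data)
{
        int ret = down(n, data);

        if (ret)
                return ret;

        for (int i = 0; n->children[i]; i++) {
                ret = walk_tree(n->children[i], down, up, data);
                if (ret)
                        return ret;
        }
        return up(n, data);
}

static int print_down(struct node *n, void *data) { printf("down %s\n", n->name); return 0; }
static int print_up(struct node *n, void *data)   { printf("up   %s\n", n->name); return 0; }

int main(void)
{
        struct node leaf1 = { "leaf1", { NULL } };
        struct node leaf2 = { "leaf2", { NULL } };
        struct node mid   = { "mid",   { &leaf1, NULL } };
        struct node root  = { "root",  { &mid, &leaf2, NULL } };

        return walk_tree(&root, print_down, print_up, NULL);
}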
1483 1483
1484 static int tg_nop(struct task_group *tg, void *data) 1484 static int tg_nop(struct task_group *tg, void *data)
1485 { 1485 {
1486 return 0; 1486 return 0;
1487 } 1487 }
1488 #endif 1488 #endif
1489 1489
1490 #ifdef CONFIG_SMP 1490 #ifdef CONFIG_SMP
1491 /* Used instead of source_load when we know the type == 0 */ 1491 /* Used instead of source_load when we know the type == 0 */
1492 static unsigned long weighted_cpuload(const int cpu) 1492 static unsigned long weighted_cpuload(const int cpu)
1493 { 1493 {
1494 return cpu_rq(cpu)->load.weight; 1494 return cpu_rq(cpu)->load.weight;
1495 } 1495 }
1496 1496
1497 /* 1497 /*
1498 * Return a low guess at the load of a migration-source cpu weighted 1498 * Return a low guess at the load of a migration-source cpu weighted
1499 * according to the scheduling class and "nice" value. 1499 * according to the scheduling class and "nice" value.
1500 * 1500 *
1501 * We want to under-estimate the load of migration sources, to 1501 * We want to under-estimate the load of migration sources, to
1502 * balance conservatively. 1502 * balance conservatively.
1503 */ 1503 */
1504 static unsigned long source_load(int cpu, int type) 1504 static unsigned long source_load(int cpu, int type)
1505 { 1505 {
1506 struct rq *rq = cpu_rq(cpu); 1506 struct rq *rq = cpu_rq(cpu);
1507 unsigned long total = weighted_cpuload(cpu); 1507 unsigned long total = weighted_cpuload(cpu);
1508 1508
1509 if (type == 0 || !sched_feat(LB_BIAS)) 1509 if (type == 0 || !sched_feat(LB_BIAS))
1510 return total; 1510 return total;
1511 1511
1512 return min(rq->cpu_load[type-1], total); 1512 return min(rq->cpu_load[type-1], total);
1513 } 1513 }
1514 1514
1515 /* 1515 /*
1516 * Return a high guess at the load of a migration-target cpu weighted 1516 * Return a high guess at the load of a migration-target cpu weighted
1517 * according to the scheduling class and "nice" value. 1517 * according to the scheduling class and "nice" value.
1518 */ 1518 */
1519 static unsigned long target_load(int cpu, int type) 1519 static unsigned long target_load(int cpu, int type)
1520 { 1520 {
1521 struct rq *rq = cpu_rq(cpu); 1521 struct rq *rq = cpu_rq(cpu);
1522 unsigned long total = weighted_cpuload(cpu); 1522 unsigned long total = weighted_cpuload(cpu);
1523 1523
1524 if (type == 0 || !sched_feat(LB_BIAS)) 1524 if (type == 0 || !sched_feat(LB_BIAS))
1525 return total; 1525 return total;
1526 1526
1527 return max(rq->cpu_load[type-1], total); 1527 return max(rq->cpu_load[type-1], total);
1528 } 1528 }
1529 1529
1530 static struct sched_group *group_of(int cpu) 1530 static struct sched_group *group_of(int cpu)
1531 { 1531 {
1532 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); 1532 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
1533 1533
1534 if (!sd) 1534 if (!sd)
1535 return NULL; 1535 return NULL;
1536 1536
1537 return sd->groups; 1537 return sd->groups;
1538 } 1538 }
1539 1539
1540 static unsigned long power_of(int cpu) 1540 static unsigned long power_of(int cpu)
1541 { 1541 {
1542 struct sched_group *group = group_of(cpu); 1542 struct sched_group *group = group_of(cpu);
1543 1543
1544 if (!group) 1544 if (!group)
1545 return SCHED_LOAD_SCALE; 1545 return SCHED_LOAD_SCALE;
1546 1546
1547 return group->cpu_power; 1547 return group->cpu_power;
1548 } 1548 }
1549 1549
1550 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1550 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1551 1551
1552 static unsigned long cpu_avg_load_per_task(int cpu) 1552 static unsigned long cpu_avg_load_per_task(int cpu)
1553 { 1553 {
1554 struct rq *rq = cpu_rq(cpu); 1554 struct rq *rq = cpu_rq(cpu);
1555 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 1555 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1556 1556
1557 if (nr_running) 1557 if (nr_running)
1558 rq->avg_load_per_task = rq->load.weight / nr_running; 1558 rq->avg_load_per_task = rq->load.weight / nr_running;
1559 else 1559 else
1560 rq->avg_load_per_task = 0; 1560 rq->avg_load_per_task = 0;
1561 1561
1562 return rq->avg_load_per_task; 1562 return rq->avg_load_per_task;
1563 } 1563 }
1564 1564
1565 #ifdef CONFIG_FAIR_GROUP_SCHED 1565 #ifdef CONFIG_FAIR_GROUP_SCHED
1566 1566
1567 static __read_mostly unsigned long *update_shares_data; 1567 static __read_mostly unsigned long *update_shares_data;
1568 1568
1569 static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1569 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1570 1570
1571 /* 1571 /*
1572 * Calculate and set the cpu's group shares. 1572 * Calculate and set the cpu's group shares.
1573 */ 1573 */
1574 static void update_group_shares_cpu(struct task_group *tg, int cpu, 1574 static void update_group_shares_cpu(struct task_group *tg, int cpu,
1575 unsigned long sd_shares, 1575 unsigned long sd_shares,
1576 unsigned long sd_rq_weight, 1576 unsigned long sd_rq_weight,
1577 unsigned long *usd_rq_weight) 1577 unsigned long *usd_rq_weight)
1578 { 1578 {
1579 unsigned long shares, rq_weight; 1579 unsigned long shares, rq_weight;
1580 int boost = 0; 1580 int boost = 0;
1581 1581
1582 rq_weight = usd_rq_weight[cpu]; 1582 rq_weight = usd_rq_weight[cpu];
1583 if (!rq_weight) { 1583 if (!rq_weight) {
1584 boost = 1; 1584 boost = 1;
1585 rq_weight = NICE_0_LOAD; 1585 rq_weight = NICE_0_LOAD;
1586 } 1586 }
1587 1587
1588 /* 1588 /*
1589 * \Sum_j shares_j * rq_weight_i 1589 * \Sum_j shares_j * rq_weight_i
1590 * shares_i = ----------------------------- 1590 * shares_i = -----------------------------
1591 * \Sum_j rq_weight_j 1591 * \Sum_j rq_weight_j
1592 */ 1592 */
1593 shares = (sd_shares * rq_weight) / sd_rq_weight; 1593 shares = (sd_shares * rq_weight) / sd_rq_weight;
1594 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); 1594 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1595 1595
1596 if (abs(shares - tg->se[cpu]->load.weight) > 1596 if (abs(shares - tg->se[cpu]->load.weight) >
1597 sysctl_sched_shares_thresh) { 1597 sysctl_sched_shares_thresh) {
1598 struct rq *rq = cpu_rq(cpu); 1598 struct rq *rq = cpu_rq(cpu);
1599 unsigned long flags; 1599 unsigned long flags;
1600 1600
1601 spin_lock_irqsave(&rq->lock, flags); 1601 spin_lock_irqsave(&rq->lock, flags);
1602 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; 1602 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1603 tg->cfs_rq[cpu]->shares = boost ? 0 : shares; 1603 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1604 __set_se_shares(tg->se[cpu], shares); 1604 __set_se_shares(tg->se[cpu], shares);
1605 spin_unlock_irqrestore(&rq->lock, flags); 1605 spin_unlock_irqrestore(&rq->lock, flags);
1606 } 1606 }
1607 } 1607 }
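
[Editor's illustration, not part of this diff.] A numeric example of the shares formula in the comment above: with a group entitled to sd_shares = 1024 and per-cpu rq weights of 3072 and 1024 (made-up values), the busier cpu receives three quarters of the group's shares. The clamp bounds below are placeholders, not the kernel's MIN_SHARES/MAX_SHARES.

#include <stdio.h>

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
        unsigned long sd_shares    = 1024;               /* group's total shares     */
        unsigned long rq_weight[2] = { 3072, 1024 };     /* per-cpu runqueue weights */
        unsigned long sd_rq_weight = rq_weight[0] + rq_weight[1];

        for (int cpu = 0; cpu < 2; cpu++) {
                /* shares_i = sd_shares * rq_weight_i / \Sum_j rq_weight_j */
                unsigned long shares = sd_shares * rq_weight[cpu] / sd_rq_weight;

                shares = clamp_ul(shares, 2, 1UL << 18);  /* placeholder bounds */
                printf("cpu%d: shares = %lu\n", cpu, shares);  /* 768 and 256 */
        }
        return 0;
}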
1608 1608
1609 /* 1609 /*
1610 * Re-compute the task group's per-cpu shares over the given domain. 1610 * Re-compute the task group's per-cpu shares over the given domain.
1611 * This needs to be done in a bottom-up fashion because the rq weight of a 1611 * This needs to be done in a bottom-up fashion because the rq weight of a
1612 * parent group depends on the shares of its child groups. 1612 * parent group depends on the shares of its child groups.
1613 */ 1613 */
1614 static int tg_shares_up(struct task_group *tg, void *data) 1614 static int tg_shares_up(struct task_group *tg, void *data)
1615 { 1615 {
1616 unsigned long weight, rq_weight = 0, shares = 0; 1616 unsigned long weight, rq_weight = 0, shares = 0;
1617 unsigned long *usd_rq_weight; 1617 unsigned long *usd_rq_weight;
1618 struct sched_domain *sd = data; 1618 struct sched_domain *sd = data;
1619 unsigned long flags; 1619 unsigned long flags;
1620 int i; 1620 int i;
1621 1621
1622 if (!tg->se[0]) 1622 if (!tg->se[0])
1623 return 0; 1623 return 0;
1624 1624
1625 local_irq_save(flags); 1625 local_irq_save(flags);
1626 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); 1626 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1627 1627
1628 for_each_cpu(i, sched_domain_span(sd)) { 1628 for_each_cpu(i, sched_domain_span(sd)) {
1629 weight = tg->cfs_rq[i]->load.weight; 1629 weight = tg->cfs_rq[i]->load.weight;
1630 usd_rq_weight[i] = weight; 1630 usd_rq_weight[i] = weight;
1631 1631
1632 /* 1632 /*
1633 * If there are currently no tasks on the cpu pretend there 1633 * If there are currently no tasks on the cpu pretend there
1634 * is one of average load so that when a new task gets to 1634 * is one of average load so that when a new task gets to
1635 * run here it will not get delayed by group starvation. 1635 * run here it will not get delayed by group starvation.
1636 */ 1636 */
1637 if (!weight) 1637 if (!weight)
1638 weight = NICE_0_LOAD; 1638 weight = NICE_0_LOAD;
1639 1639
1640 rq_weight += weight; 1640 rq_weight += weight;
1641 shares += tg->cfs_rq[i]->shares; 1641 shares += tg->cfs_rq[i]->shares;
1642 } 1642 }
1643 1643
1644 if ((!shares && rq_weight) || shares > tg->shares) 1644 if ((!shares && rq_weight) || shares > tg->shares)
1645 shares = tg->shares; 1645 shares = tg->shares;
1646 1646
1647 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) 1647 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1648 shares = tg->shares; 1648 shares = tg->shares;
1649 1649
1650 for_each_cpu(i, sched_domain_span(sd)) 1650 for_each_cpu(i, sched_domain_span(sd))
1651 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); 1651 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1652 1652
1653 local_irq_restore(flags); 1653 local_irq_restore(flags);
1654 1654
1655 return 0; 1655 return 0;
1656 } 1656 }
1657 1657
1658 /* 1658 /*
1659 * Compute the cpu's hierarchical load factor for each task group. 1659 * Compute the cpu's hierarchical load factor for each task group.
1660 * This needs to be done in a top-down fashion because the load of a child 1660 * This needs to be done in a top-down fashion because the load of a child
1661 * group is a fraction of its parent's load. 1661 * group is a fraction of its parent's load.
1662 */ 1662 */
1663 static int tg_load_down(struct task_group *tg, void *data) 1663 static int tg_load_down(struct task_group *tg, void *data)
1664 { 1664 {
1665 unsigned long load; 1665 unsigned long load;
1666 long cpu = (long)data; 1666 long cpu = (long)data;
1667 1667
1668 if (!tg->parent) { 1668 if (!tg->parent) {
1669 load = cpu_rq(cpu)->load.weight; 1669 load = cpu_rq(cpu)->load.weight;
1670 } else { 1670 } else {
1671 load = tg->parent->cfs_rq[cpu]->h_load; 1671 load = tg->parent->cfs_rq[cpu]->h_load;
1672 load *= tg->cfs_rq[cpu]->shares; 1672 load *= tg->cfs_rq[cpu]->shares;
1673 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1673 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1674 } 1674 }
1675 1675
1676 tg->cfs_rq[cpu]->h_load = load; 1676 tg->cfs_rq[cpu]->h_load = load;
1677 1677
1678 return 0; 1678 return 0;
1679 } 1679 }
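
[Editor's illustration, not part of this diff.] A worked example for the top-down pass above, with all numbers invented: if the root runqueue weight on a cpu is 3072, a child group's shares on that cpu are 512, and the parent cfs_rq carries a load weight of 2048, the child's hierarchical load comes out to 3072 * 512 / 2049 = 767.

#include <stdio.h>

int main(void)
{
        unsigned long root_weight   = 3072;          /* cpu_rq(cpu)->load.weight             */
        unsigned long parent_h_load = root_weight;   /* root group: h_load = rq weight       */
        unsigned long child_shares  = 512;           /* tg->cfs_rq[cpu]->shares              */
        unsigned long parent_weight = 2048;          /* tg->parent->cfs_rq[cpu]->load.weight */

        /* Same arithmetic as tg_load_down() for a non-root group. */
        unsigned long h_load = parent_h_load * child_shares / (parent_weight + 1);

        printf("child h_load = %lu\n", h_load);      /* 3072*512/2049 = 767 */
        return 0;
}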
1680 1680
1681 static void update_shares(struct sched_domain *sd) 1681 static void update_shares(struct sched_domain *sd)
1682 { 1682 {
1683 s64 elapsed; 1683 s64 elapsed;
1684 u64 now; 1684 u64 now;
1685 1685
1686 if (root_task_group_empty()) 1686 if (root_task_group_empty())
1687 return; 1687 return;
1688 1688
1689 now = cpu_clock(raw_smp_processor_id()); 1689 now = cpu_clock(raw_smp_processor_id());
1690 elapsed = now - sd->last_update; 1690 elapsed = now - sd->last_update;
1691 1691
1692 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1692 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1693 sd->last_update = now; 1693 sd->last_update = now;
1694 walk_tg_tree(tg_nop, tg_shares_up, sd); 1694 walk_tg_tree(tg_nop, tg_shares_up, sd);
1695 } 1695 }
1696 } 1696 }
1697 1697
1698 static void update_shares_locked(struct rq *rq, struct sched_domain *sd) 1698 static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1699 { 1699 {
1700 if (root_task_group_empty()) 1700 if (root_task_group_empty())
1701 return; 1701 return;
1702 1702
1703 spin_unlock(&rq->lock); 1703 spin_unlock(&rq->lock);
1704 update_shares(sd); 1704 update_shares(sd);
1705 spin_lock(&rq->lock); 1705 spin_lock(&rq->lock);
1706 } 1706 }
1707 1707
1708 static void update_h_load(long cpu) 1708 static void update_h_load(long cpu)
1709 { 1709 {
1710 if (root_task_group_empty()) 1710 if (root_task_group_empty())
1711 return; 1711 return;
1712 1712
1713 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1713 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1714 } 1714 }
1715 1715
1716 #else 1716 #else
1717 1717
1718 static inline void update_shares(struct sched_domain *sd) 1718 static inline void update_shares(struct sched_domain *sd)
1719 { 1719 {
1720 } 1720 }
1721 1721
1722 static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) 1722 static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1723 { 1723 {
1724 } 1724 }
1725 1725
1726 #endif 1726 #endif
1727 1727
1728 #ifdef CONFIG_PREEMPT 1728 #ifdef CONFIG_PREEMPT
1729 1729
1730 static void double_rq_lock(struct rq *rq1, struct rq *rq2); 1730 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1731 1731
1732 /* 1732 /*
1733 * fair double_lock_balance: Safely acquires both rq->locks in a fair 1733 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1734 * way at the expense of forcing extra atomic operations in all 1734 * way at the expense of forcing extra atomic operations in all
1735 * invocations. This assures that the double_lock is acquired using the 1735 * invocations. This assures that the double_lock is acquired using the
1736 * same underlying policy as the spinlock_t on this architecture, which 1736 * same underlying policy as the spinlock_t on this architecture, which
1737 * reduces latency compared to the unfair variant below. However, it 1737 * reduces latency compared to the unfair variant below. However, it
1738 * also adds more overhead and therefore may reduce throughput. 1738 * also adds more overhead and therefore may reduce throughput.
1739 */ 1739 */
1740 static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1740 static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1741 __releases(this_rq->lock) 1741 __releases(this_rq->lock)
1742 __acquires(busiest->lock) 1742 __acquires(busiest->lock)
1743 __acquires(this_rq->lock) 1743 __acquires(this_rq->lock)
1744 { 1744 {
1745 spin_unlock(&this_rq->lock); 1745 spin_unlock(&this_rq->lock);
1746 double_rq_lock(this_rq, busiest); 1746 double_rq_lock(this_rq, busiest);
1747 1747
1748 return 1; 1748 return 1;
1749 } 1749 }
1750 1750
1751 #else 1751 #else
1752 /* 1752 /*
1753 * Unfair double_lock_balance: Optimizes throughput at the expense of 1753 * Unfair double_lock_balance: Optimizes throughput at the expense of
1754 * latency by eliminating extra atomic operations when the locks are 1754 * latency by eliminating extra atomic operations when the locks are
1755 * already in proper order on entry. This favors lower cpu-ids and will 1755 * already in proper order on entry. This favors lower cpu-ids and will
1756 * grant the double lock to lower cpus over higher ids under contention, 1756 * grant the double lock to lower cpus over higher ids under contention,
1757 * regardless of entry order into the function. 1757 * regardless of entry order into the function.
1758 */ 1758 */
1759 static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1759 static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1760 __releases(this_rq->lock) 1760 __releases(this_rq->lock)
1761 __acquires(busiest->lock) 1761 __acquires(busiest->lock)
1762 __acquires(this_rq->lock) 1762 __acquires(this_rq->lock)
1763 { 1763 {
1764 int ret = 0; 1764 int ret = 0;
1765 1765
1766 if (unlikely(!spin_trylock(&busiest->lock))) { 1766 if (unlikely(!spin_trylock(&busiest->lock))) {
1767 if (busiest < this_rq) { 1767 if (busiest < this_rq) {
1768 spin_unlock(&this_rq->lock); 1768 spin_unlock(&this_rq->lock);
1769 spin_lock(&busiest->lock); 1769 spin_lock(&busiest->lock);
1770 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); 1770 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
1771 ret = 1; 1771 ret = 1;
1772 } else 1772 } else
1773 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); 1773 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
1774 } 1774 }
1775 return ret; 1775 return ret;
1776 } 1776 }
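
[Editor's illustration, not part of this diff.] The unfair variant avoids AB/BA deadlock by using the runqueue's address as a global lock order: if the trylock fails and busiest sorts below this_rq, the already-held lock is dropped and both are retaken lowest-address first. A compact pthread sketch of the same address-ordering idea; the names and the two static locks are invented, and the pointer comparison mirrors the kernel's, not strictly portable C.

#include <pthread.h>
#include <stdio.h>

/* Two locks that different threads may need to hold together. */
static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

/* Acquire @wanted while already holding @held_first, always ending up with
 * the lower-address lock taken first so every thread uses the same order. */
static void lock_pair(pthread_mutex_t *held_first, pthread_mutex_t *wanted)
{
        if (pthread_mutex_trylock(wanted) == 0)
                return;                         /* fast path, no reordering needed */

        if (wanted < held_first) {
                pthread_mutex_unlock(held_first);
                pthread_mutex_lock(wanted);
                pthread_mutex_lock(held_first);
        } else {
                pthread_mutex_lock(wanted);
        }
}

int main(void)
{
        pthread_mutex_lock(&lock_a);
        lock_pair(&lock_a, &lock_b);
        printf("holding both locks\n");
        pthread_mutex_unlock(&lock_b);
        pthread_mutex_unlock(&lock_a);
        return 0;
}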
1777 1777
1778 #endif /* CONFIG_PREEMPT */ 1778 #endif /* CONFIG_PREEMPT */
1779 1779
1780 /* 1780 /*
1781 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1781 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1782 */ 1782 */
1783 static int double_lock_balance(struct rq *this_rq, struct rq *busiest) 1783 static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1784 { 1784 {
1785 if (unlikely(!irqs_disabled())) { 1785 if (unlikely(!irqs_disabled())) {
1786 /* printk() doesn't work well under rq->lock */ 1786 /* printk() doesn't work well under rq->lock */
1787 spin_unlock(&this_rq->lock); 1787 spin_unlock(&this_rq->lock);
1788 BUG_ON(1); 1788 BUG_ON(1);
1789 } 1789 }
1790 1790
1791 return _double_lock_balance(this_rq, busiest); 1791 return _double_lock_balance(this_rq, busiest);
1792 } 1792 }
1793 1793
1794 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1794 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1795 __releases(busiest->lock) 1795 __releases(busiest->lock)
1796 { 1796 {
1797 spin_unlock(&busiest->lock); 1797 spin_unlock(&busiest->lock);
1798 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1798 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1799 } 1799 }
1800 #endif 1800 #endif
1801 1801
1802 #ifdef CONFIG_FAIR_GROUP_SCHED 1802 #ifdef CONFIG_FAIR_GROUP_SCHED
1803 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) 1803 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1804 { 1804 {
1805 #ifdef CONFIG_SMP 1805 #ifdef CONFIG_SMP
1806 cfs_rq->shares = shares; 1806 cfs_rq->shares = shares;
1807 #endif 1807 #endif
1808 } 1808 }
1809 #endif 1809 #endif
1810 1810
1811 static void calc_load_account_active(struct rq *this_rq); 1811 static void calc_load_account_active(struct rq *this_rq);
1812 1812
1813 #include "sched_stats.h" 1813 #include "sched_stats.h"
1814 #include "sched_idletask.c" 1814 #include "sched_idletask.c"
1815 #include "sched_fair.c" 1815 #include "sched_fair.c"
1816 #include "sched_rt.c" 1816 #include "sched_rt.c"
1817 #ifdef CONFIG_SCHED_DEBUG 1817 #ifdef CONFIG_SCHED_DEBUG
1818 # include "sched_debug.c" 1818 # include "sched_debug.c"
1819 #endif 1819 #endif
1820 1820
1821 #define sched_class_highest (&rt_sched_class) 1821 #define sched_class_highest (&rt_sched_class)
1822 #define for_each_class(class) \ 1822 #define for_each_class(class) \
1823 for (class = sched_class_highest; class; class = class->next) 1823 for (class = sched_class_highest; class; class = class->next)
1824 1824
1825 static void inc_nr_running(struct rq *rq) 1825 static void inc_nr_running(struct rq *rq)
1826 { 1826 {
1827 rq->nr_running++; 1827 rq->nr_running++;
1828 } 1828 }
1829 1829
1830 static void dec_nr_running(struct rq *rq) 1830 static void dec_nr_running(struct rq *rq)
1831 { 1831 {
1832 rq->nr_running--; 1832 rq->nr_running--;
1833 } 1833 }
1834 1834
1835 static void set_load_weight(struct task_struct *p) 1835 static void set_load_weight(struct task_struct *p)
1836 { 1836 {
1837 if (task_has_rt_policy(p)) { 1837 if (task_has_rt_policy(p)) {
1838 p->se.load.weight = prio_to_weight[0] * 2; 1838 p->se.load.weight = prio_to_weight[0] * 2;
1839 p->se.load.inv_weight = prio_to_wmult[0] >> 1; 1839 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
1840 return; 1840 return;
1841 } 1841 }
1842 1842
1843 /* 1843 /*
1844 * SCHED_IDLE tasks get minimal weight: 1844 * SCHED_IDLE tasks get minimal weight:
1845 */ 1845 */
1846 if (p->policy == SCHED_IDLE) { 1846 if (p->policy == SCHED_IDLE) {
1847 p->se.load.weight = WEIGHT_IDLEPRIO; 1847 p->se.load.weight = WEIGHT_IDLEPRIO;
1848 p->se.load.inv_weight = WMULT_IDLEPRIO; 1848 p->se.load.inv_weight = WMULT_IDLEPRIO;
1849 return; 1849 return;
1850 } 1850 }
1851 1851
1852 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; 1852 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
1853 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1853 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1854 } 1854 }
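
[Editor's illustration, not part of this diff.] The index static_prio - MAX_RT_PRIO selects an entry in the tables above: assuming the usual priority layout of the time (MAX_RT_PRIO = 100, nice 0 at static_prio 120), nice -20 lands on index 0, nice 0 on index 20 (weight 1024), and nice +19 on index 39. A tiny sketch; treat the constants as assumptions rather than guaranteed values.

#include <stdio.h>

#define MAX_RT_PRIO 100                          /* assumption: kernel value at the time */
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)

int main(void)
{
        for (int nice = -20; nice <= 19; nice += 13) {
                int static_prio = NICE_TO_PRIO(nice);
                printf("nice %+3d -> static_prio %3d -> prio_to_weight[%d]\n",
                       nice, static_prio, static_prio - MAX_RT_PRIO);
        }
        return 0;
}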
1855 1855
1856 static void update_avg(u64 *avg, u64 sample) 1856 static void update_avg(u64 *avg, u64 sample)
1857 { 1857 {
1858 s64 diff = sample - *avg; 1858 s64 diff = sample - *avg;
1859 *avg += diff >> 3; 1859 *avg += diff >> 3;
1860 } 1860 }
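
[Editor's illustration, not part of this diff.] update_avg() is an exponential moving average with weight 1/8: each new sample moves the average one eighth of the way toward it. A quick numeric illustration with invented samples.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

static void update_avg(uint64_t *avg, uint64_t sample)
{
        int64_t diff = (int64_t)sample - (int64_t)*avg;
        *avg += diff >> 3;              /* move 1/8 of the way toward the sample */
}

int main(void)
{
        uint64_t avg = 0;
        uint64_t samples[] = { 800, 800, 800, 100 };  /* made-up wakeup latencies */

        for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                update_avg(&avg, samples[i]);
                printf("sample %4" PRIu64 " -> avg %4" PRIu64 "\n", samples[i], avg);
        }
        return 0;
}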
1861 1861
1862 static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1862 static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1863 { 1863 {
1864 if (wakeup) 1864 if (wakeup)
1865 p->se.start_runtime = p->se.sum_exec_runtime; 1865 p->se.start_runtime = p->se.sum_exec_runtime;
1866 1866
1867 sched_info_queued(p); 1867 sched_info_queued(p);
1868 p->sched_class->enqueue_task(rq, p, wakeup); 1868 p->sched_class->enqueue_task(rq, p, wakeup);
1869 p->se.on_rq = 1; 1869 p->se.on_rq = 1;
1870 } 1870 }
1871 1871
1872 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1872 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1873 { 1873 {
1874 if (sleep) { 1874 if (sleep) {
1875 if (p->se.last_wakeup) { 1875 if (p->se.last_wakeup) {
1876 update_avg(&p->se.avg_overlap, 1876 update_avg(&p->se.avg_overlap,
1877 p->se.sum_exec_runtime - p->se.last_wakeup); 1877 p->se.sum_exec_runtime - p->se.last_wakeup);
1878 p->se.last_wakeup = 0; 1878 p->se.last_wakeup = 0;
1879 } else { 1879 } else {
1880 update_avg(&p->se.avg_wakeup, 1880 update_avg(&p->se.avg_wakeup,
1881 sysctl_sched_wakeup_granularity); 1881 sysctl_sched_wakeup_granularity);
1882 } 1882 }
1883 } 1883 }
1884 1884
1885 sched_info_dequeued(p); 1885 sched_info_dequeued(p);
1886 p->sched_class->dequeue_task(rq, p, sleep); 1886 p->sched_class->dequeue_task(rq, p, sleep);
1887 p->se.on_rq = 0; 1887 p->se.on_rq = 0;
1888 } 1888 }
1889 1889
1890 /* 1890 /*
1891 * __normal_prio - return the priority that is based on the static prio 1891 * __normal_prio - return the priority that is based on the static prio
1892 */ 1892 */
1893 static inline int __normal_prio(struct task_struct *p) 1893 static inline int __normal_prio(struct task_struct *p)
1894 { 1894 {
1895 return p->static_prio; 1895 return p->static_prio;
1896 } 1896 }
1897 1897
1898 /* 1898 /*
1899 * Calculate the expected normal priority: i.e. priority 1899 * Calculate the expected normal priority: i.e. priority
1900 * without taking RT-inheritance into account. Might be 1900 * without taking RT-inheritance into account. Might be
1901 * boosted by interactivity modifiers. Changes upon fork, 1901 * boosted by interactivity modifiers. Changes upon fork,
1902 * setprio syscalls, and whenever the interactivity 1902 * setprio syscalls, and whenever the interactivity
1903 * estimator recalculates. 1903 * estimator recalculates.
1904 */ 1904 */
1905 static inline int normal_prio(struct task_struct *p) 1905 static inline int normal_prio(struct task_struct *p)
1906 { 1906 {
1907 int prio; 1907 int prio;
1908 1908
1909 if (task_has_rt_policy(p)) 1909 if (task_has_rt_policy(p))
1910 prio = MAX_RT_PRIO-1 - p->rt_priority; 1910 prio = MAX_RT_PRIO-1 - p->rt_priority;
1911 else 1911 else
1912 prio = __normal_prio(p); 1912 prio = __normal_prio(p);
1913 return prio; 1913 return prio;
1914 } 1914 }
1915 1915
1916 /* 1916 /*
1917 * Calculate the current priority, i.e. the priority 1917 * Calculate the current priority, i.e. the priority
1918 * taken into account by the scheduler. This value might 1918 * taken into account by the scheduler. This value might
1919 * be boosted by RT tasks, or might be boosted by 1919 * be boosted by RT tasks, or might be boosted by
1920 * interactivity modifiers. Will be RT if the task got 1920 * interactivity modifiers. Will be RT if the task got
1921 * RT-boosted. If not then it returns p->normal_prio. 1921 * RT-boosted. If not then it returns p->normal_prio.
1922 */ 1922 */
1923 static int effective_prio(struct task_struct *p) 1923 static int effective_prio(struct task_struct *p)
1924 { 1924 {
1925 p->normal_prio = normal_prio(p); 1925 p->normal_prio = normal_prio(p);
1926 /* 1926 /*
1927 * If we are RT tasks or we were boosted to RT priority, 1927 * If we are RT tasks or we were boosted to RT priority,
1928 * keep the priority unchanged. Otherwise, update priority 1928 * keep the priority unchanged. Otherwise, update priority
1929 * to the normal priority: 1929 * to the normal priority:
1930 */ 1930 */
1931 if (!rt_prio(p->prio)) 1931 if (!rt_prio(p->prio))
1932 return p->normal_prio; 1932 return p->normal_prio;
1933 return p->prio; 1933 return p->prio;
1934 } 1934 }
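
[Editor's illustration, not part of this diff.] Putting the two helpers together, and assuming the usual MAX_RT_PRIO = 100 layout: an RT task with rt_priority 99 ends up with prio 0, rt_priority 1 with prio 98, and a non-boosted SCHED_NORMAL task simply keeps its static_prio in the 100..139 range (lower prio numbers mean higher priority). A minimal sketch with a stripped-down stand-in for task_struct; the struct and its fields are illustrative only.

#include <stdio.h>

#define MAX_RT_PRIO 100   /* assumption: kernel value at the time */

struct toy_task {
        int has_rt_policy;  /* SCHED_FIFO / SCHED_RR?           */
        int rt_priority;    /* 1..99, higher = more important   */
        int static_prio;    /* 100..139 for SCHED_NORMAL/IDLE   */
};

/* Mirrors normal_prio(): a lower returned number is a higher priority. */
static int toy_normal_prio(const struct toy_task *p)
{
        if (p->has_rt_policy)
                return MAX_RT_PRIO - 1 - p->rt_priority;
        return p->static_prio;
}

int main(void)
{
        struct toy_task rt_hi  = { 1, 99, 0 };
        struct toy_task rt_lo  = { 1,  1, 0 };
        struct toy_task normal = { 0,  0, 120 };  /* nice 0 */

        printf("rt_priority 99 -> prio %d\n", toy_normal_prio(&rt_hi));   /* 0   */
        printf("rt_priority  1 -> prio %d\n", toy_normal_prio(&rt_lo));   /* 98  */
        printf("nice 0         -> prio %d\n", toy_normal_prio(&normal));  /* 120 */
        return 0;
}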
1935 1935
1936 /* 1936 /*
1937 * activate_task - move a task to the runqueue. 1937 * activate_task - move a task to the runqueue.
1938 */ 1938 */
1939 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) 1939 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1940 { 1940 {
1941 if (task_contributes_to_load(p)) 1941 if (task_contributes_to_load(p))
1942 rq->nr_uninterruptible--; 1942 rq->nr_uninterruptible--;
1943 1943
1944 enqueue_task(rq, p, wakeup); 1944 enqueue_task(rq, p, wakeup);
1945 inc_nr_running(rq); 1945 inc_nr_running(rq);
1946 } 1946 }
1947 1947
1948 /* 1948 /*
1949 * deactivate_task - remove a task from the runqueue. 1949 * deactivate_task - remove a task from the runqueue.
1950 */ 1950 */
1951 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) 1951 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1952 { 1952 {
1953 if (task_contributes_to_load(p)) 1953 if (task_contributes_to_load(p))
1954 rq->nr_uninterruptible++; 1954 rq->nr_uninterruptible++;
1955 1955
1956 dequeue_task(rq, p, sleep); 1956 dequeue_task(rq, p, sleep);
1957 dec_nr_running(rq); 1957 dec_nr_running(rq);
1958 } 1958 }
1959 1959
1960 /** 1960 /**
1961 * task_curr - is this task currently executing on a CPU? 1961 * task_curr - is this task currently executing on a CPU?
1962 * @p: the task in question. 1962 * @p: the task in question.
1963 */ 1963 */
1964 inline int task_curr(const struct task_struct *p) 1964 inline int task_curr(const struct task_struct *p)
1965 { 1965 {
1966 return cpu_curr(task_cpu(p)) == p; 1966 return cpu_curr(task_cpu(p)) == p;
1967 } 1967 }
1968 1968
1969 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1969 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1970 { 1970 {
1971 set_task_rq(p, cpu); 1971 set_task_rq(p, cpu);
1972 #ifdef CONFIG_SMP 1972 #ifdef CONFIG_SMP
1973 /* 1973 /*
1974 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be 1974 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1975 * successfully executed on another CPU. We must ensure that updates of 1975 * successfully executed on another CPU. We must ensure that updates of
1976 * per-task data have been completed by this moment. 1976 * per-task data have been completed by this moment.
1977 */ 1977 */
1978 smp_wmb(); 1978 smp_wmb();
1979 task_thread_info(p)->cpu = cpu; 1979 task_thread_info(p)->cpu = cpu;
1980 #endif 1980 #endif
1981 } 1981 }
1982 1982
1983 static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1983 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1984 const struct sched_class *prev_class, 1984 const struct sched_class *prev_class,
1985 int oldprio, int running) 1985 int oldprio, int running)
1986 { 1986 {
1987 if (prev_class != p->sched_class) { 1987 if (prev_class != p->sched_class) {
1988 if (prev_class->switched_from) 1988 if (prev_class->switched_from)
1989 prev_class->switched_from(rq, p, running); 1989 prev_class->switched_from(rq, p, running);
1990 p->sched_class->switched_to(rq, p, running); 1990 p->sched_class->switched_to(rq, p, running);
1991 } else 1991 } else
1992 p->sched_class->prio_changed(rq, p, oldprio, running); 1992 p->sched_class->prio_changed(rq, p, oldprio, running);
1993 } 1993 }
1994 1994
1995 #ifdef CONFIG_SMP 1995 #ifdef CONFIG_SMP
1996 /* 1996 /*
1997 * Is this task likely cache-hot: 1997 * Is this task likely cache-hot:
1998 */ 1998 */
1999 static int 1999 static int
2000 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 2000 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2001 { 2001 {
2002 s64 delta; 2002 s64 delta;
2003 2003
2004 /* 2004 /*
2005 * Buddy candidates are cache hot: 2005 * Buddy candidates are cache hot:
2006 */ 2006 */
2007 if (sched_feat(CACHE_HOT_BUDDY) && 2007 if (sched_feat(CACHE_HOT_BUDDY) &&
2008 (&p->se == cfs_rq_of(&p->se)->next || 2008 (&p->se == cfs_rq_of(&p->se)->next ||
2009 &p->se == cfs_rq_of(&p->se)->last)) 2009 &p->se == cfs_rq_of(&p->se)->last))
2010 return 1; 2010 return 1;
2011 2011
2012 if (p->sched_class != &fair_sched_class) 2012 if (p->sched_class != &fair_sched_class)
2013 return 0; 2013 return 0;
2014 2014
2015 if (sysctl_sched_migration_cost == -1) 2015 if (sysctl_sched_migration_cost == -1)
2016 return 1; 2016 return 1;
2017 if (sysctl_sched_migration_cost == 0) 2017 if (sysctl_sched_migration_cost == 0)
2018 return 0; 2018 return 0;
2019 2019
2020 delta = now - p->se.exec_start; 2020 delta = now - p->se.exec_start;
2021 2021
2022 return delta < (s64)sysctl_sched_migration_cost; 2022 return delta < (s64)sysctl_sched_migration_cost;
2023 } 2023 }
2024 2024
2025 2025
2026 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2026 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2027 { 2027 {
2028 int old_cpu = task_cpu(p); 2028 int old_cpu = task_cpu(p);
2029 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); 2029 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
2030 struct cfs_rq *old_cfsrq = task_cfs_rq(p), 2030 struct cfs_rq *old_cfsrq = task_cfs_rq(p),
2031 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); 2031 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu);
2032 u64 clock_offset; 2032 u64 clock_offset;
2033 2033
2034 clock_offset = old_rq->clock - new_rq->clock; 2034 clock_offset = old_rq->clock - new_rq->clock;
2035 2035
2036 trace_sched_migrate_task(p, new_cpu); 2036 trace_sched_migrate_task(p, new_cpu);
2037 2037
2038 #ifdef CONFIG_SCHEDSTATS 2038 #ifdef CONFIG_SCHEDSTATS
2039 if (p->se.wait_start) 2039 if (p->se.wait_start)
2040 p->se.wait_start -= clock_offset; 2040 p->se.wait_start -= clock_offset;
2041 if (p->se.sleep_start) 2041 if (p->se.sleep_start)
2042 p->se.sleep_start -= clock_offset; 2042 p->se.sleep_start -= clock_offset;
2043 if (p->se.block_start) 2043 if (p->se.block_start)
2044 p->se.block_start -= clock_offset; 2044 p->se.block_start -= clock_offset;
2045 #endif 2045 #endif
2046 if (old_cpu != new_cpu) { 2046 if (old_cpu != new_cpu) {
2047 p->se.nr_migrations++; 2047 p->se.nr_migrations++;
2048 new_rq->nr_migrations_in++; 2048 new_rq->nr_migrations_in++;
2049 #ifdef CONFIG_SCHEDSTATS 2049 #ifdef CONFIG_SCHEDSTATS
2050 if (task_hot(p, old_rq->clock, NULL)) 2050 if (task_hot(p, old_rq->clock, NULL))
2051 schedstat_inc(p, se.nr_forced2_migrations); 2051 schedstat_inc(p, se.nr_forced2_migrations);
2052 #endif 2052 #endif
2053 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 2053 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
2054 1, 1, NULL, 0); 2054 1, 1, NULL, 0);
2055 } 2055 }
2056 p->se.vruntime -= old_cfsrq->min_vruntime - 2056 p->se.vruntime -= old_cfsrq->min_vruntime -
2057 new_cfsrq->min_vruntime; 2057 new_cfsrq->min_vruntime;
2058 2058
2059 __set_task_cpu(p, new_cpu); 2059 __set_task_cpu(p, new_cpu);
2060 } 2060 }
2061 2061
2062 struct migration_req { 2062 struct migration_req {
2063 struct list_head list; 2063 struct list_head list;
2064 2064
2065 struct task_struct *task; 2065 struct task_struct *task;
2066 int dest_cpu; 2066 int dest_cpu;
2067 2067
2068 struct completion done; 2068 struct completion done;
2069 }; 2069 };
2070 2070
2071 /* 2071 /*
2072 * The task's runqueue lock must be held. 2072 * The task's runqueue lock must be held.
2073 * Returns true if you have to wait for the migration thread. 2073 * Returns true if you have to wait for the migration thread.
2074 */ 2074 */
2075 static int 2075 static int
2076 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) 2076 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2077 { 2077 {
2078 struct rq *rq = task_rq(p); 2078 struct rq *rq = task_rq(p);
2079 2079
2080 /* 2080 /*
2081 * If the task is not on a runqueue (and not running), then 2081 * If the task is not on a runqueue (and not running), then
2082 * it is sufficient to simply update the task's cpu field. 2082 * it is sufficient to simply update the task's cpu field.
2083 */ 2083 */
2084 if (!p->se.on_rq && !task_running(rq, p)) { 2084 if (!p->se.on_rq && !task_running(rq, p)) {
2085 set_task_cpu(p, dest_cpu); 2085 set_task_cpu(p, dest_cpu);
2086 return 0; 2086 return 0;
2087 } 2087 }
2088 2088
2089 init_completion(&req->done); 2089 init_completion(&req->done);
2090 req->task = p; 2090 req->task = p;
2091 req->dest_cpu = dest_cpu; 2091 req->dest_cpu = dest_cpu;
2092 list_add(&req->list, &rq->migration_queue); 2092 list_add(&req->list, &rq->migration_queue);
2093 2093
2094 return 1; 2094 return 1;
2095 } 2095 }
2096 2096
2097 /* 2097 /*
2098 * wait_task_context_switch - wait for a thread to complete at least one 2098 * wait_task_context_switch - wait for a thread to complete at least one
2099 * context switch. 2099 * context switch.
2100 * 2100 *
2101 * @p must not be current. 2101 * @p must not be current.
2102 */ 2102 */
2103 void wait_task_context_switch(struct task_struct *p) 2103 void wait_task_context_switch(struct task_struct *p)
2104 { 2104 {
2105 unsigned long nvcsw, nivcsw, flags; 2105 unsigned long nvcsw, nivcsw, flags;
2106 int running; 2106 int running;
2107 struct rq *rq; 2107 struct rq *rq;
2108 2108
2109 nvcsw = p->nvcsw; 2109 nvcsw = p->nvcsw;
2110 nivcsw = p->nivcsw; 2110 nivcsw = p->nivcsw;
2111 for (;;) { 2111 for (;;) {
2112 /* 2112 /*
2113 * The runqueue is assigned before the actual context 2113 * The runqueue is assigned before the actual context
2114 * switch. We need to take the runqueue lock. 2114 * switch. We need to take the runqueue lock.
2115 * 2115 *
2116 * We could check initially without the lock but it is 2116 * We could check initially without the lock but it is
2117 * very likely that we need to take the lock in every 2117 * very likely that we need to take the lock in every
2118 * iteration. 2118 * iteration.
2119 */ 2119 */
2120 rq = task_rq_lock(p, &flags); 2120 rq = task_rq_lock(p, &flags);
2121 running = task_running(rq, p); 2121 running = task_running(rq, p);
2122 task_rq_unlock(rq, &flags); 2122 task_rq_unlock(rq, &flags);
2123 2123
2124 if (likely(!running)) 2124 if (likely(!running))
2125 break; 2125 break;
2126 /* 2126 /*
2127 * The switch count is incremented before the actual 2127 * The switch count is incremented before the actual
2128 * context switch. We thus wait for two switches to be 2128 * context switch. We thus wait for two switches to be
2129 * sure at least one completed. 2129 * sure at least one completed.
2130 */ 2130 */
2131 if ((p->nvcsw - nvcsw) > 1) 2131 if ((p->nvcsw - nvcsw) > 1)
2132 break; 2132 break;
2133 if ((p->nivcsw - nivcsw) > 1) 2133 if ((p->nivcsw - nivcsw) > 1)
2134 break; 2134 break;
2135 2135
2136 cpu_relax(); 2136 cpu_relax();
2137 } 2137 }
2138 } 2138 }
2139 2139
2140 /* 2140 /*
2141 * wait_task_inactive - wait for a thread to unschedule. 2141 * wait_task_inactive - wait for a thread to unschedule.
2142 * 2142 *
2143 * If @match_state is nonzero, it's the @p->state value just checked and 2143 * If @match_state is nonzero, it's the @p->state value just checked and
2144 * not expected to change. If it changes, i.e. @p might have woken up, 2144 * not expected to change. If it changes, i.e. @p might have woken up,
2145 * then return zero. When we succeed in waiting for @p to be off its CPU, 2145 * then return zero. When we succeed in waiting for @p to be off its CPU,
2146 * we return a positive number (its total switch count). If a second call 2146 * we return a positive number (its total switch count). If a second call
2147 * a short while later returns the same number, the caller can be sure that 2147 * a short while later returns the same number, the caller can be sure that
2148 * @p has remained unscheduled the whole time. 2148 * @p has remained unscheduled the whole time.
2149 * 2149 *
2150 * The caller must ensure that the task *will* unschedule sometime soon, 2150 * The caller must ensure that the task *will* unschedule sometime soon,
2151 * else this function might spin for a *long* time. This function can't 2151 * else this function might spin for a *long* time. This function can't
2152 * be called with interrupts off, or it may introduce deadlock with 2152 * be called with interrupts off, or it may introduce deadlock with
2153 * smp_call_function() if an IPI is sent by the same process we are 2153 * smp_call_function() if an IPI is sent by the same process we are
2154 * waiting to become inactive. 2154 * waiting to become inactive.
2155 */ 2155 */
2156 unsigned long wait_task_inactive(struct task_struct *p, long match_state) 2156 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2157 { 2157 {
2158 unsigned long flags; 2158 unsigned long flags;
2159 int running, on_rq; 2159 int running, on_rq;
2160 unsigned long ncsw; 2160 unsigned long ncsw;
2161 struct rq *rq; 2161 struct rq *rq;
2162 2162
2163 for (;;) { 2163 for (;;) {
2164 /* 2164 /*
2165 * We do the initial early heuristics without holding 2165 * We do the initial early heuristics without holding
2166 * any task-queue locks at all. We'll only try to get 2166 * any task-queue locks at all. We'll only try to get
2167 * the runqueue lock when things look like they will 2167 * the runqueue lock when things look like they will
2168 * work out! 2168 * work out!
2169 */ 2169 */
2170 rq = task_rq(p); 2170 rq = task_rq(p);
2171 2171
2172 /* 2172 /*
2173 * If the task is actively running on another CPU 2173 * If the task is actively running on another CPU
2174 * still, just relax and busy-wait without holding 2174 * still, just relax and busy-wait without holding
2175 * any locks. 2175 * any locks.
2176 * 2176 *
2177 * NOTE! Since we don't hold any locks, it's not 2177 * NOTE! Since we don't hold any locks, it's not
2178 * even sure that "rq" stays as the right runqueue! 2178 * even sure that "rq" stays as the right runqueue!
2179 * But we don't care, since "task_running()" will 2179 * But we don't care, since "task_running()" will
2180 * return false if the runqueue has changed and p 2180 * return false if the runqueue has changed and p
2181 * is actually now running somewhere else! 2181 * is actually now running somewhere else!
2182 */ 2182 */
2183 while (task_running(rq, p)) { 2183 while (task_running(rq, p)) {
2184 if (match_state && unlikely(p->state != match_state)) 2184 if (match_state && unlikely(p->state != match_state))
2185 return 0; 2185 return 0;
2186 cpu_relax(); 2186 cpu_relax();
2187 } 2187 }
2188 2188
2189 /* 2189 /*
2190 * Ok, time to look more closely! We need the rq 2190 * Ok, time to look more closely! We need the rq
2191 * lock now, to be *sure*. If we're wrong, we'll 2191 * lock now, to be *sure*. If we're wrong, we'll
2192 * just go back and repeat. 2192 * just go back and repeat.
2193 */ 2193 */
2194 rq = task_rq_lock(p, &flags); 2194 rq = task_rq_lock(p, &flags);
2195 trace_sched_wait_task(rq, p); 2195 trace_sched_wait_task(rq, p);
2196 running = task_running(rq, p); 2196 running = task_running(rq, p);
2197 on_rq = p->se.on_rq; 2197 on_rq = p->se.on_rq;
2198 ncsw = 0; 2198 ncsw = 0;
2199 if (!match_state || p->state == match_state) 2199 if (!match_state || p->state == match_state)
2200 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2200 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2201 task_rq_unlock(rq, &flags); 2201 task_rq_unlock(rq, &flags);
2202 2202
2203 /* 2203 /*
2204 * If it changed from the expected state, bail out now. 2204 * If it changed from the expected state, bail out now.
2205 */ 2205 */
2206 if (unlikely(!ncsw)) 2206 if (unlikely(!ncsw))
2207 break; 2207 break;
2208 2208
2209 /* 2209 /*
2210 * Was it really running after all now that we 2210 * Was it really running after all now that we
2211 * checked with the proper locks actually held? 2211 * checked with the proper locks actually held?
2212 * 2212 *
2213 * Oops. Go back and try again... 2213 * Oops. Go back and try again...
2214 */ 2214 */
2215 if (unlikely(running)) { 2215 if (unlikely(running)) {
2216 cpu_relax(); 2216 cpu_relax();
2217 continue; 2217 continue;
2218 } 2218 }
2219 2219
2220 /* 2220 /*
2221 * It's not enough that it's not actively running, 2221 * It's not enough that it's not actively running,
2222 * it must be off the runqueue _entirely_, and not 2222 * it must be off the runqueue _entirely_, and not
2223 * preempted! 2223 * preempted!
2224 * 2224 *
2225 * So if it was still runnable (but just not actively 2225 * So if it was still runnable (but just not actively
2226 * running right now), it's preempted, and we should 2226 * running right now), it's preempted, and we should
2227 * yield - it could be a while. 2227 * yield - it could be a while.
2228 */ 2228 */
2229 if (unlikely(on_rq)) { 2229 if (unlikely(on_rq)) {
2230 schedule_timeout_uninterruptible(1); 2230 schedule_timeout_uninterruptible(1);
2231 continue; 2231 continue;
2232 } 2232 }
2233 2233
2234 /* 2234 /*
2235 * Ahh, all good. It wasn't running, and it wasn't 2235 * Ahh, all good. It wasn't running, and it wasn't
2236 * runnable, which means that it will never become 2236 * runnable, which means that it will never become
2237 * running in the future either. We're all done! 2237 * running in the future either. We're all done!
2238 */ 2238 */
2239 break; 2239 break;
2240 } 2240 }
2241 2241
2242 return ncsw; 2242 return ncsw;
2243 } 2243 }
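The double-call protocol described in the comment can be wrapped as in the sketch below; task_settled() is a hypothetical helper, not part of this file, and a real caller must still guarantee that @p will actually deschedule soon:

#include <linux/types.h>
#include <linux/sched.h>

/* Hypothetical helper illustrating the documented usage pattern. */
static bool task_settled(struct task_struct *p, long match_state)
{
	unsigned long ncsw;

	ncsw = wait_task_inactive(p, match_state);
	if (!ncsw)				/* state changed, @p may have woken */
		return false;

	schedule_timeout_uninterruptible(1);	/* "a second call a short while later" */

	/* same switch count -> @p stayed unscheduled the whole time */
	return wait_task_inactive(p, match_state) == ncsw;
}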
2244 2244
2245 /*** 2245 /***
2246 * kick_process - kick a running thread to enter/exit the kernel 2246 * kick_process - kick a running thread to enter/exit the kernel
2247 * @p: the to-be-kicked thread 2247 * @p: the to-be-kicked thread
2248 * 2248 *
2249 * Cause a process which is running on another CPU to enter 2249 * Cause a process which is running on another CPU to enter
2250 * kernel-mode, without any delay. (to get signals handled.) 2250 * kernel-mode, without any delay. (to get signals handled.)
2251 * 2251 *
2252 * NOTE: this function doesn't have to take the runqueue lock, 2252 * NOTE: this function doesn't have to take the runqueue lock,
2253 * because all it wants to ensure is that the remote task enters 2253 * because all it wants to ensure is that the remote task enters
2254 * the kernel. If the IPI races and the task has been migrated 2254 * the kernel. If the IPI races and the task has been migrated
2255 * to another CPU then no harm is done and the purpose has been 2255 * to another CPU then no harm is done and the purpose has been
2256 * achieved as well. 2256 * achieved as well.
2257 */ 2257 */
2258 void kick_process(struct task_struct *p) 2258 void kick_process(struct task_struct *p)
2259 { 2259 {
2260 int cpu; 2260 int cpu;
2261 2261
2262 preempt_disable(); 2262 preempt_disable();
2263 cpu = task_cpu(p); 2263 cpu = task_cpu(p);
2264 if ((cpu != smp_processor_id()) && task_curr(p)) 2264 if ((cpu != smp_processor_id()) && task_curr(p))
2265 smp_send_reschedule(cpu); 2265 smp_send_reschedule(cpu);
2266 preempt_enable(); 2266 preempt_enable();
2267 } 2267 }
2268 EXPORT_SYMBOL_GPL(kick_process); 2268 EXPORT_SYMBOL_GPL(kick_process);
2269 #endif /* CONFIG_SMP */ 2269 #endif /* CONFIG_SMP */
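A minimal sketch of the intended use, loosely modelled on the signal-delivery path (poke_task_sketch() is hypothetical; real signal code queues the signal first and also handles the non-running cases):

#include <linux/sched.h>

/* Hypothetical helper, loosely modelled on signal_wake_up(). */
static void poke_task_sketch(struct task_struct *t)
{
	set_tsk_thread_flag(t, TIF_SIGPENDING);	/* publish the pending work */
	kick_process(t);	/* if @t is running on another CPU, make it re-enter the kernel */
}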
2270 2270
2271 /** 2271 /**
2272 * task_oncpu_function_call - call a function on the cpu on which a task runs 2272 * task_oncpu_function_call - call a function on the cpu on which a task runs
2273 * @p: the task to evaluate 2273 * @p: the task to evaluate
2274 * @func: the function to be called 2274 * @func: the function to be called
2275 * @info: the function call argument 2275 * @info: the function call argument
2276 * 2276 *
2277 * Calls the function @func when the task is currently running. This might 2277 * Calls the function @func when the task is currently running. This might
2278 * be on the current CPU, in which case the function is called directly. 2278 * be on the current CPU, in which case the function is called directly.
2279 */ 2279 */
2280 void task_oncpu_function_call(struct task_struct *p, 2280 void task_oncpu_function_call(struct task_struct *p,
2281 void (*func) (void *info), void *info) 2281 void (*func) (void *info), void *info)
2282 { 2282 {
2283 int cpu; 2283 int cpu;
2284 2284
2285 preempt_disable(); 2285 preempt_disable();
2286 cpu = task_cpu(p); 2286 cpu = task_cpu(p);
2287 if (task_curr(p)) 2287 if (task_curr(p))
2288 smp_call_function_single(cpu, func, info, 1); 2288 smp_call_function_single(cpu, func, info, 1);
2289 preempt_enable(); 2289 preempt_enable();
2290 } 2290 }
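For illustration, a hedged sketch of a caller; report_cpu() and where_is_it_running() are hypothetical names, not part of the scheduler:

#include <linux/sched.h>
#include <linux/smp.h>

/* Hypothetical cross-call payload: record which CPU it ran on. */
static void report_cpu(void *info)
{
	*(int *)info = smp_processor_id();
}

static int where_is_it_running(struct task_struct *p)
{
	int cpu = -1;

	/* report_cpu() runs on p's CPU only if @p is currently on a CPU. */
	task_oncpu_function_call(p, report_cpu, &cpu);

	return cpu;	/* still -1 if @p was not running at the time */
}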
2291 2291
2292 /*** 2292 /***
2293 * try_to_wake_up - wake up a thread 2293 * try_to_wake_up - wake up a thread
2294 * @p: the to-be-woken-up thread 2294 * @p: the to-be-woken-up thread
2295 * @state: the mask of task states that can be woken 2295 * @state: the mask of task states that can be woken
2296 * @sync: do a synchronous wakeup? 2296 * @sync: do a synchronous wakeup?
2297 * 2297 *
2298 * Put it on the run-queue if it's not already there. The "current" 2298 * Put it on the run-queue if it's not already there. The "current"
2299 * thread is always on the run-queue (except when the actual 2299 * thread is always on the run-queue (except when the actual
2300 * re-schedule is in progress), and as such you're allowed to do 2300 * re-schedule is in progress), and as such you're allowed to do
2301 * the simpler "current->state = TASK_RUNNING" to mark yourself 2301 * the simpler "current->state = TASK_RUNNING" to mark yourself
2302 * runnable without the overhead of this. 2302 * runnable without the overhead of this.
2303 * 2303 *
2304 * returns failure only if the task is already active. 2304 * returns failure only if the task is already active.
2305 */ 2305 */
2306 static int try_to_wake_up(struct task_struct *p, unsigned int state, 2306 static int try_to_wake_up(struct task_struct *p, unsigned int state,
2307 int wake_flags) 2307 int wake_flags)
2308 { 2308 {
2309 int cpu, orig_cpu, this_cpu, success = 0; 2309 int cpu, orig_cpu, this_cpu, success = 0;
2310 unsigned long flags; 2310 unsigned long flags;
2311 struct rq *rq, *orig_rq; 2311 struct rq *rq, *orig_rq;
2312 2312
2313 if (!sched_feat(SYNC_WAKEUPS)) 2313 if (!sched_feat(SYNC_WAKEUPS))
2314 wake_flags &= ~WF_SYNC; 2314 wake_flags &= ~WF_SYNC;
2315 2315
2316 this_cpu = get_cpu(); 2316 this_cpu = get_cpu();
2317 2317
2318 smp_wmb(); 2318 smp_wmb();
2319 rq = orig_rq = task_rq_lock(p, &flags); 2319 rq = orig_rq = task_rq_lock(p, &flags);
2320 update_rq_clock(rq); 2320 update_rq_clock(rq);
2321 if (!(p->state & state)) 2321 if (!(p->state & state))
2322 goto out; 2322 goto out;
2323 2323
2324 if (p->se.on_rq) 2324 if (p->se.on_rq)
2325 goto out_running; 2325 goto out_running;
2326 2326
2327 cpu = task_cpu(p); 2327 cpu = task_cpu(p);
2328 orig_cpu = cpu; 2328 orig_cpu = cpu;
2329 2329
2330 #ifdef CONFIG_SMP 2330 #ifdef CONFIG_SMP
2331 if (unlikely(task_running(rq, p))) 2331 if (unlikely(task_running(rq, p)))
2332 goto out_activate; 2332 goto out_activate;
2333 2333
2334 /* 2334 /*
2335 * In order to handle concurrent wakeups and release the rq->lock 2335 * In order to handle concurrent wakeups and release the rq->lock
2336 * we put the task in TASK_WAKING state. 2336 * we put the task in TASK_WAKING state.
2337 * 2337 *
2338 * First fix up the nr_uninterruptible count: 2338 * First fix up the nr_uninterruptible count:
2339 */ 2339 */
2340 if (task_contributes_to_load(p)) 2340 if (task_contributes_to_load(p))
2341 rq->nr_uninterruptible--; 2341 rq->nr_uninterruptible--;
2342 p->state = TASK_WAKING; 2342 p->state = TASK_WAKING;
2343 task_rq_unlock(rq, &flags); 2343 task_rq_unlock(rq, &flags);
2344 2344
2345 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2345 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2346 if (cpu != orig_cpu) 2346 if (cpu != orig_cpu)
2347 set_task_cpu(p, cpu); 2347 set_task_cpu(p, cpu);
2348 2348
2349 rq = task_rq_lock(p, &flags); 2349 rq = task_rq_lock(p, &flags);
2350 2350
2351 if (rq != orig_rq) 2351 if (rq != orig_rq)
2352 update_rq_clock(rq); 2352 update_rq_clock(rq);
2353 2353
2354 WARN_ON(p->state != TASK_WAKING); 2354 WARN_ON(p->state != TASK_WAKING);
2355 cpu = task_cpu(p); 2355 cpu = task_cpu(p);
2356 2356
2357 #ifdef CONFIG_SCHEDSTATS 2357 #ifdef CONFIG_SCHEDSTATS
2358 schedstat_inc(rq, ttwu_count); 2358 schedstat_inc(rq, ttwu_count);
2359 if (cpu == this_cpu) 2359 if (cpu == this_cpu)
2360 schedstat_inc(rq, ttwu_local); 2360 schedstat_inc(rq, ttwu_local);
2361 else { 2361 else {
2362 struct sched_domain *sd; 2362 struct sched_domain *sd;
2363 for_each_domain(this_cpu, sd) { 2363 for_each_domain(this_cpu, sd) {
2364 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 2364 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2365 schedstat_inc(sd, ttwu_wake_remote); 2365 schedstat_inc(sd, ttwu_wake_remote);
2366 break; 2366 break;
2367 } 2367 }
2368 } 2368 }
2369 } 2369 }
2370 #endif /* CONFIG_SCHEDSTATS */ 2370 #endif /* CONFIG_SCHEDSTATS */
2371 2371
2372 out_activate: 2372 out_activate:
2373 #endif /* CONFIG_SMP */ 2373 #endif /* CONFIG_SMP */
2374 schedstat_inc(p, se.nr_wakeups); 2374 schedstat_inc(p, se.nr_wakeups);
2375 if (wake_flags & WF_SYNC) 2375 if (wake_flags & WF_SYNC)
2376 schedstat_inc(p, se.nr_wakeups_sync); 2376 schedstat_inc(p, se.nr_wakeups_sync);
2377 if (orig_cpu != cpu) 2377 if (orig_cpu != cpu)
2378 schedstat_inc(p, se.nr_wakeups_migrate); 2378 schedstat_inc(p, se.nr_wakeups_migrate);
2379 if (cpu == this_cpu) 2379 if (cpu == this_cpu)
2380 schedstat_inc(p, se.nr_wakeups_local); 2380 schedstat_inc(p, se.nr_wakeups_local);
2381 else 2381 else
2382 schedstat_inc(p, se.nr_wakeups_remote); 2382 schedstat_inc(p, se.nr_wakeups_remote);
2383 activate_task(rq, p, 1); 2383 activate_task(rq, p, 1);
2384 success = 1; 2384 success = 1;
2385 2385
2386 /* 2386 /*
2387 * Only attribute actual wakeups done by this task. 2387 * Only attribute actual wakeups done by this task.
2388 */ 2388 */
2389 if (!in_interrupt()) { 2389 if (!in_interrupt()) {
2390 struct sched_entity *se = &current->se; 2390 struct sched_entity *se = &current->se;
2391 u64 sample = se->sum_exec_runtime; 2391 u64 sample = se->sum_exec_runtime;
2392 2392
2393 if (se->last_wakeup) 2393 if (se->last_wakeup)
2394 sample -= se->last_wakeup; 2394 sample -= se->last_wakeup;
2395 else 2395 else
2396 sample -= se->start_runtime; 2396 sample -= se->start_runtime;
2397 update_avg(&se->avg_wakeup, sample); 2397 update_avg(&se->avg_wakeup, sample);
2398 2398
2399 se->last_wakeup = se->sum_exec_runtime; 2399 se->last_wakeup = se->sum_exec_runtime;
2400 } 2400 }
2401 2401
2402 out_running: 2402 out_running:
2403 trace_sched_wakeup(rq, p, success); 2403 trace_sched_wakeup(rq, p, success);
2404 check_preempt_curr(rq, p, wake_flags); 2404 check_preempt_curr(rq, p, wake_flags);
2405 2405
2406 p->state = TASK_RUNNING; 2406 p->state = TASK_RUNNING;
2407 #ifdef CONFIG_SMP 2407 #ifdef CONFIG_SMP
2408 if (p->sched_class->task_wake_up) 2408 if (p->sched_class->task_wake_up)
2409 p->sched_class->task_wake_up(rq, p); 2409 p->sched_class->task_wake_up(rq, p);
2410 #endif 2410 #endif
2411 out: 2411 out:
2412 task_rq_unlock(rq, &flags); 2412 task_rq_unlock(rq, &flags);
2413 put_cpu(); 2413 put_cpu();
2414 2414
2415 return success; 2415 return success;
2416 } 2416 }
2417 2417
2418 /** 2418 /**
2419 * wake_up_process - Wake up a specific process 2419 * wake_up_process - Wake up a specific process
2420 * @p: The process to be woken up. 2420 * @p: The process to be woken up.
2421 * 2421 *
2422 * Attempt to wake up the nominated process and move it to the set of runnable 2422 * Attempt to wake up the nominated process and move it to the set of runnable
2423 * processes. Returns 1 if the process was woken up, 0 if it was already 2423 * processes. Returns 1 if the process was woken up, 0 if it was already
2424 * running. 2424 * running.
2425 * 2425 *
2426 * It may be assumed that this function implies a write memory barrier before 2426 * It may be assumed that this function implies a write memory barrier before
2427 * changing the task state if and only if any tasks are woken up. 2427 * changing the task state if and only if any tasks are woken up.
2428 */ 2428 */
2429 int wake_up_process(struct task_struct *p) 2429 int wake_up_process(struct task_struct *p)
2430 { 2430 {
2431 return try_to_wake_up(p, TASK_ALL, 0); 2431 return try_to_wake_up(p, TASK_ALL, 0);
2432 } 2432 }
2433 EXPORT_SYMBOL(wake_up_process); 2433 EXPORT_SYMBOL(wake_up_process);
2434 2434
2435 int wake_up_state(struct task_struct *p, unsigned int state) 2435 int wake_up_state(struct task_struct *p, unsigned int state)
2436 { 2436 {
2437 return try_to_wake_up(p, state, 0); 2437 return try_to_wake_up(p, state, 0);
2438 } 2438 }
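These wakeup primitives pair with the usual prepare-to-wait pattern on the sleeping side. A minimal sketch, assuming the classic condition/flag idiom (wait_for_flag() is hypothetical; production code would normally use wait queues or completions instead):

#include <linux/sched.h>
#include <linux/errno.h>

/* Sleeper: publish our state *before* re-checking the condition. */
static int wait_for_flag(int *flag)
{
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (*flag)
			break;
		if (signal_pending(current)) {
			__set_current_state(TASK_RUNNING);
			return -ERESTARTSYS;
		}
		schedule();
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

/*
 * Waker: set the condition first, then call wake_up_process(sleeper).
 * The state/condition ordering in try_to_wake_up() then guarantees the
 * wakeup cannot be missed.
 */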
2439 2439
2440 /* 2440 /*
2441 * Perform scheduler related setup for a newly forked process p. 2441 * Perform scheduler related setup for a newly forked process p.
2442 * p is forked by current. 2442 * p is forked by current.
2443 * 2443 *
2444 * __sched_fork() is basic setup used by init_idle() too: 2444 * __sched_fork() is basic setup used by init_idle() too:
2445 */ 2445 */
2446 static void __sched_fork(struct task_struct *p) 2446 static void __sched_fork(struct task_struct *p)
2447 { 2447 {
2448 p->se.exec_start = 0; 2448 p->se.exec_start = 0;
2449 p->se.sum_exec_runtime = 0; 2449 p->se.sum_exec_runtime = 0;
2450 p->se.prev_sum_exec_runtime = 0; 2450 p->se.prev_sum_exec_runtime = 0;
2451 p->se.nr_migrations = 0; 2451 p->se.nr_migrations = 0;
2452 p->se.last_wakeup = 0; 2452 p->se.last_wakeup = 0;
2453 p->se.avg_overlap = 0; 2453 p->se.avg_overlap = 0;
2454 p->se.start_runtime = 0; 2454 p->se.start_runtime = 0;
2455 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2455 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2456 p->se.avg_running = 0; 2456 p->se.avg_running = 0;
2457 2457
2458 #ifdef CONFIG_SCHEDSTATS 2458 #ifdef CONFIG_SCHEDSTATS
2459 p->se.wait_start = 0; 2459 p->se.wait_start = 0;
2460 p->se.wait_max = 0; 2460 p->se.wait_max = 0;
2461 p->se.wait_count = 0; 2461 p->se.wait_count = 0;
2462 p->se.wait_sum = 0; 2462 p->se.wait_sum = 0;
2463 2463
2464 p->se.sleep_start = 0; 2464 p->se.sleep_start = 0;
2465 p->se.sleep_max = 0; 2465 p->se.sleep_max = 0;
2466 p->se.sum_sleep_runtime = 0; 2466 p->se.sum_sleep_runtime = 0;
2467 2467
2468 p->se.block_start = 0; 2468 p->se.block_start = 0;
2469 p->se.block_max = 0; 2469 p->se.block_max = 0;
2470 p->se.exec_max = 0; 2470 p->se.exec_max = 0;
2471 p->se.slice_max = 0; 2471 p->se.slice_max = 0;
2472 2472
2473 p->se.nr_migrations_cold = 0; 2473 p->se.nr_migrations_cold = 0;
2474 p->se.nr_failed_migrations_affine = 0; 2474 p->se.nr_failed_migrations_affine = 0;
2475 p->se.nr_failed_migrations_running = 0; 2475 p->se.nr_failed_migrations_running = 0;
2476 p->se.nr_failed_migrations_hot = 0; 2476 p->se.nr_failed_migrations_hot = 0;
2477 p->se.nr_forced_migrations = 0; 2477 p->se.nr_forced_migrations = 0;
2478 p->se.nr_forced2_migrations = 0; 2478 p->se.nr_forced2_migrations = 0;
2479 2479
2480 p->se.nr_wakeups = 0; 2480 p->se.nr_wakeups = 0;
2481 p->se.nr_wakeups_sync = 0; 2481 p->se.nr_wakeups_sync = 0;
2482 p->se.nr_wakeups_migrate = 0; 2482 p->se.nr_wakeups_migrate = 0;
2483 p->se.nr_wakeups_local = 0; 2483 p->se.nr_wakeups_local = 0;
2484 p->se.nr_wakeups_remote = 0; 2484 p->se.nr_wakeups_remote = 0;
2485 p->se.nr_wakeups_affine = 0; 2485 p->se.nr_wakeups_affine = 0;
2486 p->se.nr_wakeups_affine_attempts = 0; 2486 p->se.nr_wakeups_affine_attempts = 0;
2487 p->se.nr_wakeups_passive = 0; 2487 p->se.nr_wakeups_passive = 0;
2488 p->se.nr_wakeups_idle = 0; 2488 p->se.nr_wakeups_idle = 0;
2489 2489
2490 #endif 2490 #endif
2491 2491
2492 INIT_LIST_HEAD(&p->rt.run_list); 2492 INIT_LIST_HEAD(&p->rt.run_list);
2493 p->se.on_rq = 0; 2493 p->se.on_rq = 0;
2494 INIT_LIST_HEAD(&p->se.group_node); 2494 INIT_LIST_HEAD(&p->se.group_node);
2495 2495
2496 #ifdef CONFIG_PREEMPT_NOTIFIERS 2496 #ifdef CONFIG_PREEMPT_NOTIFIERS
2497 INIT_HLIST_HEAD(&p->preempt_notifiers); 2497 INIT_HLIST_HEAD(&p->preempt_notifiers);
2498 #endif 2498 #endif
2499 2499
2500 /* 2500 /*
2501 * We mark the process as running here, but have not actually 2501 * We mark the process as running here, but have not actually
2502 * inserted it onto the runqueue yet. This guarantees that 2502 * inserted it onto the runqueue yet. This guarantees that
2503 * nobody will actually run it, and a signal or other external 2503 * nobody will actually run it, and a signal or other external
2504 * event cannot wake it up and insert it on the runqueue either. 2504 * event cannot wake it up and insert it on the runqueue either.
2505 */ 2505 */
2506 p->state = TASK_RUNNING; 2506 p->state = TASK_RUNNING;
2507 } 2507 }
2508 2508
2509 /* 2509 /*
2510 * fork()/clone()-time setup: 2510 * fork()/clone()-time setup:
2511 */ 2511 */
2512 void sched_fork(struct task_struct *p, int clone_flags) 2512 void sched_fork(struct task_struct *p, int clone_flags)
2513 { 2513 {
2514 int cpu = get_cpu(); 2514 int cpu = get_cpu();
2515 2515
2516 __sched_fork(p); 2516 __sched_fork(p);
2517 2517
2518 /* 2518 /*
2519 * Revert to default priority/policy on fork if requested. 2519 * Revert to default priority/policy on fork if requested.
2520 */ 2520 */
2521 if (unlikely(p->sched_reset_on_fork)) { 2521 if (unlikely(p->sched_reset_on_fork)) {
2522 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { 2522 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
2523 p->policy = SCHED_NORMAL; 2523 p->policy = SCHED_NORMAL;
2524 p->normal_prio = p->static_prio; 2524 p->normal_prio = p->static_prio;
2525 } 2525 }
2526 2526
2527 if (PRIO_TO_NICE(p->static_prio) < 0) { 2527 if (PRIO_TO_NICE(p->static_prio) < 0) {
2528 p->static_prio = NICE_TO_PRIO(0); 2528 p->static_prio = NICE_TO_PRIO(0);
2529 p->normal_prio = p->static_prio; 2529 p->normal_prio = p->static_prio;
2530 set_load_weight(p); 2530 set_load_weight(p);
2531 } 2531 }
2532 2532
2533 /* 2533 /*
2534 * We don't need the reset flag anymore after the fork. It has 2534 * We don't need the reset flag anymore after the fork. It has
2535 * fulfilled its duty: 2535 * fulfilled its duty:
2536 */ 2536 */
2537 p->sched_reset_on_fork = 0; 2537 p->sched_reset_on_fork = 0;
2538 } 2538 }
2539 2539
2540 /* 2540 /*
2541 * Make sure we do not leak PI boosting priority to the child. 2541 * Make sure we do not leak PI boosting priority to the child.
2542 */ 2542 */
2543 p->prio = current->normal_prio; 2543 p->prio = current->normal_prio;
2544 2544
2545 if (!rt_prio(p->prio)) 2545 if (!rt_prio(p->prio))
2546 p->sched_class = &fair_sched_class; 2546 p->sched_class = &fair_sched_class;
2547 2547
2548 #ifdef CONFIG_SMP 2548 #ifdef CONFIG_SMP
2549 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); 2549 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
2550 #endif 2550 #endif
2551 set_task_cpu(p, cpu); 2551 set_task_cpu(p, cpu);
2552 2552
2553 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2553 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2554 if (likely(sched_info_on())) 2554 if (likely(sched_info_on()))
2555 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2555 memset(&p->sched_info, 0, sizeof(p->sched_info));
2556 #endif 2556 #endif
2557 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2557 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
2558 p->oncpu = 0; 2558 p->oncpu = 0;
2559 #endif 2559 #endif
2560 #ifdef CONFIG_PREEMPT 2560 #ifdef CONFIG_PREEMPT
2561 /* Want to start with kernel preemption disabled. */ 2561 /* Want to start with kernel preemption disabled. */
2562 task_thread_info(p)->preempt_count = 1; 2562 task_thread_info(p)->preempt_count = 1;
2563 #endif 2563 #endif
2564 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2564 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2565 2565
2566 put_cpu(); 2566 put_cpu();
2567 } 2567 }
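For reference, the priority arithmetic behind the reset-on-fork branch above, assuming the usual nice/priority mapping of this era (MAX_RT_PRIO = 100):

/*
 * Assumed mapping:
 *
 *   NICE_TO_PRIO(nice) = MAX_RT_PRIO + nice + 20   ->  nice   0 == prio 120
 *   PRIO_TO_NICE(prio) = prio - MAX_RT_PRIO - 20   ->  prio 110 == nice -10
 *
 * So PRIO_TO_NICE(p->static_prio) < 0 detects a boosted (negative-nice)
 * parent, and the reset path drops the child back to SCHED_NORMAL at
 * nice 0 (static_prio 120) without touching the parent.
 */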
2568 2568
2569 /* 2569 /*
2570 * wake_up_new_task - wake up a newly created task for the first time. 2570 * wake_up_new_task - wake up a newly created task for the first time.
2571 * 2571 *
2572 * This function will do some initial scheduler statistics housekeeping 2572 * This function will do some initial scheduler statistics housekeeping
2573 * that must be done for every newly created context, then puts the task 2573 * that must be done for every newly created context, then puts the task
2574 * on the runqueue and wakes it. 2574 * on the runqueue and wakes it.
2575 */ 2575 */
2576 void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2576 void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2577 { 2577 {
2578 unsigned long flags; 2578 unsigned long flags;
2579 struct rq *rq; 2579 struct rq *rq;
2580 2580
2581 rq = task_rq_lock(p, &flags); 2581 rq = task_rq_lock(p, &flags);
2582 BUG_ON(p->state != TASK_RUNNING); 2582 BUG_ON(p->state != TASK_RUNNING);
2583 update_rq_clock(rq); 2583 update_rq_clock(rq);
2584 2584
2585 if (!p->sched_class->task_new || !current->se.on_rq) { 2585 if (!p->sched_class->task_new || !current->se.on_rq) {
2586 activate_task(rq, p, 0); 2586 activate_task(rq, p, 0);
2587 } else { 2587 } else {
2588 /* 2588 /*
2589 * Let the scheduling class do new task startup 2589 * Let the scheduling class do new task startup
2590 * management (if any): 2590 * management (if any):
2591 */ 2591 */
2592 p->sched_class->task_new(rq, p); 2592 p->sched_class->task_new(rq, p);
2593 inc_nr_running(rq); 2593 inc_nr_running(rq);
2594 } 2594 }
2595 trace_sched_wakeup_new(rq, p, 1); 2595 trace_sched_wakeup_new(rq, p, 1);
2596 check_preempt_curr(rq, p, WF_FORK); 2596 check_preempt_curr(rq, p, WF_FORK);
2597 #ifdef CONFIG_SMP 2597 #ifdef CONFIG_SMP
2598 if (p->sched_class->task_wake_up) 2598 if (p->sched_class->task_wake_up)
2599 p->sched_class->task_wake_up(rq, p); 2599 p->sched_class->task_wake_up(rq, p);
2600 #endif 2600 #endif
2601 task_rq_unlock(rq, &flags); 2601 task_rq_unlock(rq, &flags);
2602 } 2602 }
2603 2603
2604 #ifdef CONFIG_PREEMPT_NOTIFIERS 2604 #ifdef CONFIG_PREEMPT_NOTIFIERS
2605 2605
2606 /** 2606 /**
2607 * preempt_notifier_register - tell me when current is being preempted & rescheduled 2607 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2608 * @notifier: notifier struct to register 2608 * @notifier: notifier struct to register
2609 */ 2609 */
2610 void preempt_notifier_register(struct preempt_notifier *notifier) 2610 void preempt_notifier_register(struct preempt_notifier *notifier)
2611 { 2611 {
2612 hlist_add_head(&notifier->link, &current->preempt_notifiers); 2612 hlist_add_head(&notifier->link, &current->preempt_notifiers);
2613 } 2613 }
2614 EXPORT_SYMBOL_GPL(preempt_notifier_register); 2614 EXPORT_SYMBOL_GPL(preempt_notifier_register);
2615 2615
2616 /** 2616 /**
2617 * preempt_notifier_unregister - no longer interested in preemption notifications 2617 * preempt_notifier_unregister - no longer interested in preemption notifications
2618 * @notifier: notifier struct to unregister 2618 * @notifier: notifier struct to unregister
2619 * 2619 *
2620 * This is safe to call from within a preemption notifier. 2620 * This is safe to call from within a preemption notifier.
2621 */ 2621 */
2622 void preempt_notifier_unregister(struct preempt_notifier *notifier) 2622 void preempt_notifier_unregister(struct preempt_notifier *notifier)
2623 { 2623 {
2624 hlist_del(&notifier->link); 2624 hlist_del(&notifier->link);
2625 } 2625 }
2626 EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 2626 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2627 2627
2628 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2628 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2629 { 2629 {
2630 struct preempt_notifier *notifier; 2630 struct preempt_notifier *notifier;
2631 struct hlist_node *node; 2631 struct hlist_node *node;
2632 2632
2633 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2633 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2634 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 2634 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2635 } 2635 }
2636 2636
2637 static void 2637 static void
2638 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2638 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2639 struct task_struct *next) 2639 struct task_struct *next)
2640 { 2640 {
2641 struct preempt_notifier *notifier; 2641 struct preempt_notifier *notifier;
2642 struct hlist_node *node; 2642 struct hlist_node *node;
2643 2643
2644 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 2644 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
2645 notifier->ops->sched_out(notifier, next); 2645 notifier->ops->sched_out(notifier, next);
2646 } 2646 }
2647 2647
2648 #else /* !CONFIG_PREEMPT_NOTIFIERS */ 2648 #else /* !CONFIG_PREEMPT_NOTIFIERS */
2649 2649
2650 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2650 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2651 { 2651 {
2652 } 2652 }
2653 2653
2654 static void 2654 static void
2655 fire_sched_out_preempt_notifiers(struct task_struct *curr, 2655 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2656 struct task_struct *next) 2656 struct task_struct *next)
2657 { 2657 {
2658 } 2658 }
2659 2659
2660 #endif /* CONFIG_PREEMPT_NOTIFIERS */ 2660 #endif /* CONFIG_PREEMPT_NOTIFIERS */
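A client of the notifier API above looks roughly like the sketch below; the my_* names are hypothetical, and preempt_notifier_init() is assumed to be the initializer provided by <linux/preempt.h> (KVM's vcpu load/put hooks are the in-tree user):

#include <linux/preempt.h>
#include <linux/sched.h>

static void my_sched_in(struct preempt_notifier *pn, int cpu)
{
	/* current was just (re)scheduled onto @cpu */
}

static void my_sched_out(struct preempt_notifier *pn,
			 struct task_struct *next)
{
	/* current is being switched out in favour of @next */
}

static struct preempt_ops my_preempt_ops = {
	.sched_in	= my_sched_in,
	.sched_out	= my_sched_out,
};

static struct preempt_notifier my_notifier;

/* Registers for notifications about the *current* task. */
static void my_watch_current(void)
{
	preempt_notifier_init(&my_notifier, &my_preempt_ops);
	preempt_notifier_register(&my_notifier);
}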
2661 2661
2662 /** 2662 /**
2663 * prepare_task_switch - prepare to switch tasks 2663 * prepare_task_switch - prepare to switch tasks
2664 * @rq: the runqueue preparing to switch 2664 * @rq: the runqueue preparing to switch
2665 * @prev: the current task that is being switched out 2665 * @prev: the current task that is being switched out
2666 * @next: the task we are going to switch to. 2666 * @next: the task we are going to switch to.
2667 * 2667 *
2668 * This is called with the rq lock held and interrupts off. It must 2668 * This is called with the rq lock held and interrupts off. It must
2669 * be paired with a subsequent finish_task_switch after the context 2669 * be paired with a subsequent finish_task_switch after the context
2670 * switch. 2670 * switch.
2671 * 2671 *
2672 * prepare_task_switch sets up locking and calls architecture specific 2672 * prepare_task_switch sets up locking and calls architecture specific
2673 * hooks. 2673 * hooks.
2674 */ 2674 */
2675 static inline void 2675 static inline void
2676 prepare_task_switch(struct rq *rq, struct task_struct *prev, 2676 prepare_task_switch(struct rq *rq, struct task_struct *prev,
2677 struct task_struct *next) 2677 struct task_struct *next)
2678 { 2678 {
2679 fire_sched_out_preempt_notifiers(prev, next); 2679 fire_sched_out_preempt_notifiers(prev, next);
2680 prepare_lock_switch(rq, next); 2680 prepare_lock_switch(rq, next);
2681 prepare_arch_switch(next); 2681 prepare_arch_switch(next);
2682 } 2682 }
2683 2683
2684 /** 2684 /**
2685 * finish_task_switch - clean up after a task-switch 2685 * finish_task_switch - clean up after a task-switch
2686 * @rq: runqueue associated with task-switch 2686 * @rq: runqueue associated with task-switch
2687 * @prev: the thread we just switched away from. 2687 * @prev: the thread we just switched away from.
2688 * 2688 *
2689 * finish_task_switch must be called after the context switch, paired 2689 * finish_task_switch must be called after the context switch, paired
2690 * with a prepare_task_switch call before the context switch. 2690 * with a prepare_task_switch call before the context switch.
2691 * finish_task_switch will reconcile locking set up by prepare_task_switch, 2691 * finish_task_switch will reconcile locking set up by prepare_task_switch,
2692 * and do any other architecture-specific cleanup actions. 2692 * and do any other architecture-specific cleanup actions.
2693 * 2693 *
2694 * Note that we may have delayed dropping an mm in context_switch(). If 2694 * Note that we may have delayed dropping an mm in context_switch(). If
2695 * so, we finish that here outside of the runqueue lock. (Doing it 2695 * so, we finish that here outside of the runqueue lock. (Doing it
2696 * with the lock held can cause deadlocks; see schedule() for 2696 * with the lock held can cause deadlocks; see schedule() for
2697 * details.) 2697 * details.)
2698 */ 2698 */
2699 static void finish_task_switch(struct rq *rq, struct task_struct *prev) 2699 static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2700 __releases(rq->lock) 2700 __releases(rq->lock)
2701 { 2701 {
2702 struct mm_struct *mm = rq->prev_mm; 2702 struct mm_struct *mm = rq->prev_mm;
2703 long prev_state; 2703 long prev_state;
2704 2704
2705 rq->prev_mm = NULL; 2705 rq->prev_mm = NULL;
2706 2706
2707 /* 2707 /*
2708 * A task struct has one reference for its use as "current". 2708 * A task struct has one reference for its use as "current".
2709 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 2709 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
2710 * schedule one last time. The schedule call will never return, and 2710 * schedule one last time. The schedule call will never return, and
2711 * the scheduled task must drop that reference. 2711 * the scheduled task must drop that reference.
2712 * The test for TASK_DEAD must occur while the runqueue locks are 2712 * The test for TASK_DEAD must occur while the runqueue locks are
2713 * still held, otherwise prev could be scheduled on another cpu, die 2713 * still held, otherwise prev could be scheduled on another cpu, die
2714 * there before we look at prev->state, and then the reference would 2714 * there before we look at prev->state, and then the reference would
2715 * be dropped twice. 2715 * be dropped twice.
2716 * Manfred Spraul <manfred@colorfullife.com> 2716 * Manfred Spraul <manfred@colorfullife.com>
2717 */ 2717 */
2718 prev_state = prev->state; 2718 prev_state = prev->state;
2719 finish_arch_switch(prev); 2719 finish_arch_switch(prev);
2720 perf_event_task_sched_in(current, cpu_of(rq)); 2720 perf_event_task_sched_in(current, cpu_of(rq));
2721 finish_lock_switch(rq, prev); 2721 finish_lock_switch(rq, prev);
2722 2722
2723 fire_sched_in_preempt_notifiers(current); 2723 fire_sched_in_preempt_notifiers(current);
2724 if (mm) 2724 if (mm)
2725 mmdrop(mm); 2725 mmdrop(mm);
2726 if (unlikely(prev_state == TASK_DEAD)) { 2726 if (unlikely(prev_state == TASK_DEAD)) {
2727 /* 2727 /*
2728 * Remove function-return probe instances associated with this 2728 * Remove function-return probe instances associated with this
2729 * task and put them back on the free list. 2729 * task and put them back on the free list.
2730 */ 2730 */
2731 kprobe_flush_task(prev); 2731 kprobe_flush_task(prev);
2732 put_task_struct(prev); 2732 put_task_struct(prev);
2733 } 2733 }
2734 } 2734 }
2735 2735
2736 #ifdef CONFIG_SMP 2736 #ifdef CONFIG_SMP
2737 2737
2738 /* assumes rq->lock is held */ 2738 /* assumes rq->lock is held */
2739 static inline void pre_schedule(struct rq *rq, struct task_struct *prev) 2739 static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2740 { 2740 {
2741 if (prev->sched_class->pre_schedule) 2741 if (prev->sched_class->pre_schedule)
2742 prev->sched_class->pre_schedule(rq, prev); 2742 prev->sched_class->pre_schedule(rq, prev);
2743 } 2743 }
2744 2744
2745 /* rq->lock is NOT held, but preemption is disabled */ 2745 /* rq->lock is NOT held, but preemption is disabled */
2746 static inline void post_schedule(struct rq *rq) 2746 static inline void post_schedule(struct rq *rq)
2747 { 2747 {
2748 if (rq->post_schedule) { 2748 if (rq->post_schedule) {
2749 unsigned long flags; 2749 unsigned long flags;
2750 2750
2751 spin_lock_irqsave(&rq->lock, flags); 2751 spin_lock_irqsave(&rq->lock, flags);
2752 if (rq->curr->sched_class->post_schedule) 2752 if (rq->curr->sched_class->post_schedule)
2753 rq->curr->sched_class->post_schedule(rq); 2753 rq->curr->sched_class->post_schedule(rq);
2754 spin_unlock_irqrestore(&rq->lock, flags); 2754 spin_unlock_irqrestore(&rq->lock, flags);
2755 2755
2756 rq->post_schedule = 0; 2756 rq->post_schedule = 0;
2757 } 2757 }
2758 } 2758 }
2759 2759
2760 #else 2760 #else
2761 2761
2762 static inline void pre_schedule(struct rq *rq, struct task_struct *p) 2762 static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2763 { 2763 {
2764 } 2764 }
2765 2765
2766 static inline void post_schedule(struct rq *rq) 2766 static inline void post_schedule(struct rq *rq)
2767 { 2767 {
2768 } 2768 }
2769 2769
2770 #endif 2770 #endif
2771 2771
2772 /** 2772 /**
2773 * schedule_tail - first thing a freshly forked thread must call. 2773 * schedule_tail - first thing a freshly forked thread must call.
2774 * @prev: the thread we just switched away from. 2774 * @prev: the thread we just switched away from.
2775 */ 2775 */
2776 asmlinkage void schedule_tail(struct task_struct *prev) 2776 asmlinkage void schedule_tail(struct task_struct *prev)
2777 __releases(rq->lock) 2777 __releases(rq->lock)
2778 { 2778 {
2779 struct rq *rq = this_rq(); 2779 struct rq *rq = this_rq();
2780 2780
2781 finish_task_switch(rq, prev); 2781 finish_task_switch(rq, prev);
2782 2782
2783 /* 2783 /*
2784 * FIXME: do we need to worry about rq being invalidated by the 2784 * FIXME: do we need to worry about rq being invalidated by the
2785 * task_switch? 2785 * task_switch?
2786 */ 2786 */
2787 post_schedule(rq); 2787 post_schedule(rq);
2788 2788
2789 #ifdef __ARCH_WANT_UNLOCKED_CTXSW 2789 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
2790 /* In this case, finish_task_switch does not reenable preemption */ 2790 /* In this case, finish_task_switch does not reenable preemption */
2791 preempt_enable(); 2791 preempt_enable();
2792 #endif 2792 #endif
2793 if (current->set_child_tid) 2793 if (current->set_child_tid)
2794 put_user(task_pid_vnr(current), current->set_child_tid); 2794 put_user(task_pid_vnr(current), current->set_child_tid);
2795 } 2795 }
2796 2796
2797 /* 2797 /*
2798 * context_switch - switch to the new MM and the new 2798 * context_switch - switch to the new MM and the new
2799 * thread's register state. 2799 * thread's register state.
2800 */ 2800 */
2801 static inline void 2801 static inline void
2802 context_switch(struct rq *rq, struct task_struct *prev, 2802 context_switch(struct rq *rq, struct task_struct *prev,
2803 struct task_struct *next) 2803 struct task_struct *next)
2804 { 2804 {
2805 struct mm_struct *mm, *oldmm; 2805 struct mm_struct *mm, *oldmm;
2806 2806
2807 prepare_task_switch(rq, prev, next); 2807 prepare_task_switch(rq, prev, next);
2808 trace_sched_switch(rq, prev, next); 2808 trace_sched_switch(rq, prev, next);
2809 mm = next->mm; 2809 mm = next->mm;
2810 oldmm = prev->active_mm; 2810 oldmm = prev->active_mm;
2811 /* 2811 /*
2812 * For paravirt, this is coupled with an exit in switch_to to 2812 * For paravirt, this is coupled with an exit in switch_to to
2813 * combine the page table reload and the switch backend into 2813 * combine the page table reload and the switch backend into
2814 * one hypercall. 2814 * one hypercall.
2815 */ 2815 */
2816 arch_start_context_switch(prev); 2816 arch_start_context_switch(prev);
2817 2817
2818 if (unlikely(!mm)) { 2818 if (unlikely(!mm)) {
2819 next->active_mm = oldmm; 2819 next->active_mm = oldmm;
2820 atomic_inc(&oldmm->mm_count); 2820 atomic_inc(&oldmm->mm_count);
2821 enter_lazy_tlb(oldmm, next); 2821 enter_lazy_tlb(oldmm, next);
2822 } else 2822 } else
2823 switch_mm(oldmm, mm, next); 2823 switch_mm(oldmm, mm, next);
2824 2824
2825 if (unlikely(!prev->mm)) { 2825 if (unlikely(!prev->mm)) {
2826 prev->active_mm = NULL; 2826 prev->active_mm = NULL;
2827 rq->prev_mm = oldmm; 2827 rq->prev_mm = oldmm;
2828 } 2828 }
2829 /* 2829 /*
2830 * Since the runqueue lock will be released by the next 2830 * Since the runqueue lock will be released by the next
2831 * task (which is an invalid locking op but in the case 2831 * task (which is an invalid locking op but in the case
2832 * of the scheduler it's an obvious special-case), we 2832 * of the scheduler it's an obvious special-case), we
2833 * do an early lockdep release here: 2833 * do an early lockdep release here:
2834 */ 2834 */
2835 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 2835 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
2836 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2836 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2837 #endif 2837 #endif
2838 2838
2839 /* Here we just switch the register state and the stack. */ 2839 /* Here we just switch the register state and the stack. */
2840 switch_to(prev, next, prev); 2840 switch_to(prev, next, prev);
2841 2841
2842 barrier(); 2842 barrier();
2843 /* 2843 /*
2844 * this_rq must be evaluated again because prev may have moved 2844 * this_rq must be evaluated again because prev may have moved
2845 * CPUs since it called schedule(), thus the 'rq' on its stack 2845 * CPUs since it called schedule(), thus the 'rq' on its stack
2846 * frame will be invalid. 2846 * frame will be invalid.
2847 */ 2847 */
2848 finish_task_switch(this_rq(), prev); 2848 finish_task_switch(this_rq(), prev);
2849 } 2849 }
2850 2850
2851 /* 2851 /*
2852 * nr_running, nr_uninterruptible and nr_context_switches: 2852 * nr_running, nr_uninterruptible and nr_context_switches:
2853 * 2853 *
2854 * externally visible scheduler statistics: current number of runnable 2854 * externally visible scheduler statistics: current number of runnable
2855 * threads, current number of uninterruptible-sleeping threads, total 2855 * threads, current number of uninterruptible-sleeping threads, total
2856 * number of context switches performed since bootup. 2856 * number of context switches performed since bootup.
2857 */ 2857 */
2858 unsigned long nr_running(void) 2858 unsigned long nr_running(void)
2859 { 2859 {
2860 unsigned long i, sum = 0; 2860 unsigned long i, sum = 0;
2861 2861
2862 for_each_online_cpu(i) 2862 for_each_online_cpu(i)
2863 sum += cpu_rq(i)->nr_running; 2863 sum += cpu_rq(i)->nr_running;
2864 2864
2865 return sum; 2865 return sum;
2866 } 2866 }
2867 2867
2868 unsigned long nr_uninterruptible(void) 2868 unsigned long nr_uninterruptible(void)
2869 { 2869 {
2870 unsigned long i, sum = 0; 2870 unsigned long i, sum = 0;
2871 2871
2872 for_each_possible_cpu(i) 2872 for_each_possible_cpu(i)
2873 sum += cpu_rq(i)->nr_uninterruptible; 2873 sum += cpu_rq(i)->nr_uninterruptible;
2874 2874
2875 /* 2875 /*
2876 * Since we read the counters lockless, it might be slightly 2876 * Since we read the counters lockless, it might be slightly
2877 * inaccurate. Do not allow it to go below zero though: 2877 * inaccurate. Do not allow it to go below zero though:
2878 */ 2878 */
2879 if (unlikely((long)sum < 0)) 2879 if (unlikely((long)sum < 0))
2880 sum = 0; 2880 sum = 0;
2881 2881
2882 return sum; 2882 return sum;
2883 } 2883 }
2884 2884
2885 unsigned long long nr_context_switches(void) 2885 unsigned long long nr_context_switches(void)
2886 { 2886 {
2887 int i; 2887 int i;
2888 unsigned long long sum = 0; 2888 unsigned long long sum = 0;
2889 2889
2890 for_each_possible_cpu(i) 2890 for_each_possible_cpu(i)
2891 sum += cpu_rq(i)->nr_switches; 2891 sum += cpu_rq(i)->nr_switches;
2892 2892
2893 return sum; 2893 return sum;
2894 } 2894 }
2895 2895
2896 unsigned long nr_iowait(void) 2896 unsigned long nr_iowait(void)
2897 { 2897 {
2898 unsigned long i, sum = 0; 2898 unsigned long i, sum = 0;
2899 2899
2900 for_each_possible_cpu(i) 2900 for_each_possible_cpu(i)
2901 sum += atomic_read(&cpu_rq(i)->nr_iowait); 2901 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2902 2902
2903 return sum; 2903 return sum;
2904 } 2904 }
2905 2905
2906 unsigned long nr_iowait_cpu(void) 2906 unsigned long nr_iowait_cpu(void)
2907 { 2907 {
2908 struct rq *this = this_rq(); 2908 struct rq *this = this_rq();
2909 return atomic_read(&this->nr_iowait); 2909 return atomic_read(&this->nr_iowait);
2910 } 2910 }
2911 2911
2912 unsigned long this_cpu_load(void) 2912 unsigned long this_cpu_load(void)
2913 { 2913 {
2914 struct rq *this = this_rq(); 2914 struct rq *this = this_rq();
2915 return this->cpu_load[0]; 2915 return this->cpu_load[0];
2916 } 2916 }
2917 2917
2918 2918
2919 /* Variables and functions for calc_load */ 2919 /* Variables and functions for calc_load */
2920 static atomic_long_t calc_load_tasks; 2920 static atomic_long_t calc_load_tasks;
2921 static unsigned long calc_load_update; 2921 static unsigned long calc_load_update;
2922 unsigned long avenrun[3]; 2922 unsigned long avenrun[3];
2923 EXPORT_SYMBOL(avenrun); 2923 EXPORT_SYMBOL(avenrun);
2924 2924
2925 /** 2925 /**
2926 * get_avenrun - get the load average array 2926 * get_avenrun - get the load average array
2927 * @loads: pointer to dest load array 2927 * @loads: pointer to dest load array
2928 * @offset: offset to add 2928 * @offset: offset to add
2929 * @shift: shift count to shift the result left 2929 * @shift: shift count to shift the result left
2930 * 2930 *
2931 * These values are estimates at best, so no need for locking. 2931 * These values are estimates at best, so no need for locking.
2932 */ 2932 */
2933 void get_avenrun(unsigned long *loads, unsigned long offset, int shift) 2933 void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2934 { 2934 {
2935 loads[0] = (avenrun[0] + offset) << shift; 2935 loads[0] = (avenrun[0] + offset) << shift;
2936 loads[1] = (avenrun[1] + offset) << shift; 2936 loads[1] = (avenrun[1] + offset) << shift;
2937 loads[2] = (avenrun[2] + offset) << shift; 2937 loads[2] = (avenrun[2] + offset) << shift;
2938 } 2938 }
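The returned values are in FSHIFT fixed-point form. A sketch of the usual /proc/loadavg-style conversion to a decimal reading (LOAD_INT/LOAD_FRAC are redefined locally here; the FIXED_1/200 offset rounds to the nearest hundredth):

#include <linux/kernel.h>
#include <linux/sched.h>	/* FSHIFT, FIXED_1, get_avenrun() */

#define LOAD_INT(x)  ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

static void print_loadavg_sketch(void)
{
	unsigned long avnrun[3];

	get_avenrun(avnrun, FIXED_1 / 200, 0);

	printk(KERN_INFO "load: %lu.%02lu %lu.%02lu %lu.%02lu\n",
	       LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
	       LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
	       LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]));
}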
2939 2939
2940 static unsigned long 2940 static unsigned long
2941 calc_load(unsigned long load, unsigned long exp, unsigned long active) 2941 calc_load(unsigned long load, unsigned long exp, unsigned long active)
2942 { 2942 {
2943 load *= exp; 2943 load *= exp;
2944 load += active * (FIXED_1 - exp); 2944 load += active * (FIXED_1 - exp);
2945 return load >> FSHIFT; 2945 return load >> FSHIFT;
2946 } 2946 }
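A worked step of this fixed-point exponential average, assuming the usual constants (FSHIFT = 11, FIXED_1 = 2048, EXP_1 = 1884, roughly 2048/e^(5 sec/1 min)):

/*
 * Worked example with the assumed constants:
 *
 *   previous 1-minute average of 1.00  ->  load   = 2048
 *   3 tasks runnable/uninterruptible   ->  active = 3 * 2048 = 6144
 *
 *   new = (2048 * 1884 + 6144 * (2048 - 1884)) >> 11
 *       = (3858432 + 1007616) >> 11
 *       = 4866048 >> 11
 *       = 2376                         ->  about 1.16
 *
 * Each LOAD_FREQ interval therefore moves the estimate ~8% of the way
 * (164/2048) towards the instantaneous task count.
 */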
2947 2947
2948 /* 2948 /*
2949 * calc_global_load - update the avenrun load estimates 10 ticks after the 2949 * calc_global_load - update the avenrun load estimates 10 ticks after the
2950 * CPUs have updated calc_load_tasks. 2950 * CPUs have updated calc_load_tasks.
2951 */ 2951 */
2952 void calc_global_load(void) 2952 void calc_global_load(void)
2953 { 2953 {
2954 unsigned long upd = calc_load_update + 10; 2954 unsigned long upd = calc_load_update + 10;
2955 long active; 2955 long active;
2956 2956
2957 if (time_before(jiffies, upd)) 2957 if (time_before(jiffies, upd))
2958 return; 2958 return;
2959 2959
2960 active = atomic_long_read(&calc_load_tasks); 2960 active = atomic_long_read(&calc_load_tasks);
2961 active = active > 0 ? active * FIXED_1 : 0; 2961 active = active > 0 ? active * FIXED_1 : 0;
2962 2962
2963 avenrun[0] = calc_load(avenrun[0], EXP_1, active); 2963 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
2964 avenrun[1] = calc_load(avenrun[1], EXP_5, active); 2964 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
2965 avenrun[2] = calc_load(avenrun[2], EXP_15, active); 2965 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2966 2966
2967 calc_load_update += LOAD_FREQ; 2967 calc_load_update += LOAD_FREQ;
2968 } 2968 }
2969 2969
2970 /* 2970 /*
2971 * Either called from update_cpu_load() or from a cpu going idle 2971 * Either called from update_cpu_load() or from a cpu going idle
2972 */ 2972 */
2973 static void calc_load_account_active(struct rq *this_rq) 2973 static void calc_load_account_active(struct rq *this_rq)
2974 { 2974 {
2975 long nr_active, delta; 2975 long nr_active, delta;
2976 2976
2977 nr_active = this_rq->nr_running; 2977 nr_active = this_rq->nr_running;
2978 nr_active += (long) this_rq->nr_uninterruptible; 2978 nr_active += (long) this_rq->nr_uninterruptible;
2979 2979
2980 if (nr_active != this_rq->calc_load_active) { 2980 if (nr_active != this_rq->calc_load_active) {
2981 delta = nr_active - this_rq->calc_load_active; 2981 delta = nr_active - this_rq->calc_load_active;
2982 this_rq->calc_load_active = nr_active; 2982 this_rq->calc_load_active = nr_active;
2983 atomic_long_add(delta, &calc_load_tasks); 2983 atomic_long_add(delta, &calc_load_tasks);
2984 } 2984 }
2985 } 2985 }
2986 2986
2987 /* 2987 /*
2988 * Externally visible per-cpu scheduler statistics: 2988 * Externally visible per-cpu scheduler statistics:
2989 * cpu_nr_migrations(cpu) - number of migrations into that cpu 2989 * cpu_nr_migrations(cpu) - number of migrations into that cpu
2990 */ 2990 */
2991 u64 cpu_nr_migrations(int cpu) 2991 u64 cpu_nr_migrations(int cpu)
2992 { 2992 {
2993 return cpu_rq(cpu)->nr_migrations_in; 2993 return cpu_rq(cpu)->nr_migrations_in;
2994 } 2994 }
2995 2995
2996 /* 2996 /*
2997 * Update rq->cpu_load[] statistics. This function is usually called every 2997 * Update rq->cpu_load[] statistics. This function is usually called every
2998 * scheduler tick (TICK_NSEC). 2998 * scheduler tick (TICK_NSEC).
2999 */ 2999 */
3000 static void update_cpu_load(struct rq *this_rq) 3000 static void update_cpu_load(struct rq *this_rq)
3001 { 3001 {
3002 unsigned long this_load = this_rq->load.weight; 3002 unsigned long this_load = this_rq->load.weight;
3003 int i, scale; 3003 int i, scale;
3004 3004
3005 this_rq->nr_load_updates++; 3005 this_rq->nr_load_updates++;
3006 3006
3007 /* Update our load: */ 3007 /* Update our load: */
3008 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 3008 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3009 unsigned long old_load, new_load; 3009 unsigned long old_load, new_load;
3010 3010
3011 /* scale is effectively 1 << i now, and >> i divides by scale */ 3011 /* scale is effectively 1 << i now, and >> i divides by scale */
3012 3012
3013 old_load = this_rq->cpu_load[i]; 3013 old_load = this_rq->cpu_load[i];
3014 new_load = this_load; 3014 new_load = this_load;
3015 /* 3015 /*
3016 * Round up the averaging division if load is increasing. This 3016 * Round up the averaging division if load is increasing. This
3017 * prevents us from getting stuck on 9 if the load is 10, for 3017 * prevents us from getting stuck on 9 if the load is 10, for
3018 * example. 3018 * example.
3019 */ 3019 */
3020 if (new_load > old_load) 3020 if (new_load > old_load)
3021 new_load += scale-1; 3021 new_load += scale-1;
3022 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3022 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3023 } 3023 }
3024 3024
3025 if (time_after_eq(jiffies, this_rq->calc_load_update)) { 3025 if (time_after_eq(jiffies, this_rq->calc_load_update)) {
3026 this_rq->calc_load_update += LOAD_FREQ; 3026 this_rq->calc_load_update += LOAD_FREQ;
3027 calc_load_account_active(this_rq); 3027 calc_load_account_active(this_rq);
3028 } 3028 }
3029 } 3029 }
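A worked step of the per-index decay above: index 0 tracks the instantaneous runqueue weight, while index i only moves 1/2^i of the way towards it each tick.

/*
 * Example for i = 2 (scale = 4), old_load = 1024, this_load = 2048:
 *
 *   new_load    = 2048 + (4 - 1)       = 2051   (round up, load rising)
 *   cpu_load[2] = (1024 * 3 + 2051) >> 2
 *               = 5123 >> 2
 *               = 1280                 = 1024 + (2048 - 1024) / 4
 *
 * i = 0 degenerates to cpu_load[0] = this_load, the raw runqueue weight.
 */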
3030 3030
3031 #ifdef CONFIG_SMP 3031 #ifdef CONFIG_SMP
3032 3032
3033 /* 3033 /*
3034 * double_rq_lock - safely lock two runqueues 3034 * double_rq_lock - safely lock two runqueues
3035 * 3035 *
3036 * Note this does not disable interrupts like task_rq_lock; 3036 * Note this does not disable interrupts like task_rq_lock;
3037 * you need to do so manually before calling. 3037 * you need to do so manually before calling.
3038 */ 3038 */
3039 static void double_rq_lock(struct rq *rq1, struct rq *rq2) 3039 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3040 __acquires(rq1->lock) 3040 __acquires(rq1->lock)
3041 __acquires(rq2->lock) 3041 __acquires(rq2->lock)
3042 { 3042 {
3043 BUG_ON(!irqs_disabled()); 3043 BUG_ON(!irqs_disabled());
3044 if (rq1 == rq2) { 3044 if (rq1 == rq2) {
3045 spin_lock(&rq1->lock); 3045 spin_lock(&rq1->lock);
3046 __acquire(rq2->lock); /* Fake it out ;) */ 3046 __acquire(rq2->lock); /* Fake it out ;) */
3047 } else { 3047 } else {
3048 if (rq1 < rq2) { 3048 if (rq1 < rq2) {
3049 spin_lock(&rq1->lock); 3049 spin_lock(&rq1->lock);
3050 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); 3050 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3051 } else { 3051 } else {
3052 spin_lock(&rq2->lock); 3052 spin_lock(&rq2->lock);
3053 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 3053 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3054 } 3054 }
3055 } 3055 }
3056 update_rq_clock(rq1); 3056 update_rq_clock(rq1);
3057 update_rq_clock(rq2); 3057 update_rq_clock(rq2);
3058 } 3058 }
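The rq1 < rq2 test above is the usual address-ordering trick: every path that locks a pair of runqueues takes them lowest-address first, so two CPUs locking the same pair can never deadlock against each other. A rough user-space analogue with POSIX mutexes (illustrative sketch only, not kernel code; build with -pthread):

#include <pthread.h>
#include <stdint.h>

/* Lock two mutexes in a globally consistent order (lowest address first),
 * mirroring the rq1 < rq2 ordering used by double_rq_lock(). */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a == b) {
                pthread_mutex_lock(a);
                return;
        }
        if ((uintptr_t)a < (uintptr_t)b) {
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        if (a != b)
                pthread_mutex_unlock(b);
}

int main(void)
{
        static pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
        static pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

        /* Both argument orders end up taking the locks in the same order. */
        lock_pair(&m1, &m2);
        unlock_pair(&m1, &m2);
        lock_pair(&m2, &m1);
        unlock_pair(&m2, &m1);
        return 0;
}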
3059 3059
3060 /* 3060 /*
3061 * double_rq_unlock - safely unlock two runqueues 3061 * double_rq_unlock - safely unlock two runqueues
3062 * 3062 *
3063 * Note this does not restore interrupts like task_rq_unlock; 3063 * Note this does not restore interrupts like task_rq_unlock;
3064 * you need to do so manually after calling. 3064 * you need to do so manually after calling.
3065 */ 3065 */
3066 static void double_rq_unlock(struct rq *rq1, struct rq *rq2) 3066 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3067 __releases(rq1->lock) 3067 __releases(rq1->lock)
3068 __releases(rq2->lock) 3068 __releases(rq2->lock)
3069 { 3069 {
3070 spin_unlock(&rq1->lock); 3070 spin_unlock(&rq1->lock);
3071 if (rq1 != rq2) 3071 if (rq1 != rq2)
3072 spin_unlock(&rq2->lock); 3072 spin_unlock(&rq2->lock);
3073 else 3073 else
3074 __release(rq2->lock); 3074 __release(rq2->lock);
3075 } 3075 }
3076 3076
3077 /* 3077 /*
3078 * If dest_cpu is allowed for this process, migrate the task to it. 3078 * If dest_cpu is allowed for this process, migrate the task to it.
3079 * This is accomplished by forcing the cpu_allowed mask to only 3079 * This is accomplished by forcing the cpu_allowed mask to only
3080 * allow dest_cpu, which will force the cpu onto dest_cpu. Then 3080 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
3081 * the cpu_allowed mask is restored. 3081 * the cpu_allowed mask is restored.
3082 */ 3082 */
3083 static void sched_migrate_task(struct task_struct *p, int dest_cpu) 3083 static void sched_migrate_task(struct task_struct *p, int dest_cpu)
3084 { 3084 {
3085 struct migration_req req; 3085 struct migration_req req;
3086 unsigned long flags; 3086 unsigned long flags;
3087 struct rq *rq; 3087 struct rq *rq;
3088 3088
3089 rq = task_rq_lock(p, &flags); 3089 rq = task_rq_lock(p, &flags);
3090 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3090 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
3091 || unlikely(!cpu_active(dest_cpu))) 3091 || unlikely(!cpu_active(dest_cpu)))
3092 goto out; 3092 goto out;
3093 3093
3094 /* force the process onto the specified CPU */ 3094 /* force the process onto the specified CPU */
3095 if (migrate_task(p, dest_cpu, &req)) { 3095 if (migrate_task(p, dest_cpu, &req)) {
3096 /* Need to wait for migration thread (might exit: take ref). */ 3096 /* Need to wait for migration thread (might exit: take ref). */
3097 struct task_struct *mt = rq->migration_thread; 3097 struct task_struct *mt = rq->migration_thread;
3098 3098
3099 get_task_struct(mt); 3099 get_task_struct(mt);
3100 task_rq_unlock(rq, &flags); 3100 task_rq_unlock(rq, &flags);
3101 wake_up_process(mt); 3101 wake_up_process(mt);
3102 put_task_struct(mt); 3102 put_task_struct(mt);
3103 wait_for_completion(&req.done); 3103 wait_for_completion(&req.done);
3104 3104
3105 return; 3105 return;
3106 } 3106 }
3107 out: 3107 out:
3108 task_rq_unlock(rq, &flags); 3108 task_rq_unlock(rq, &flags);
3109 } 3109 }
3110 3110
3111 /* 3111 /*
3112 * sched_exec - execve() is a valuable balancing opportunity, because at 3112 * sched_exec - execve() is a valuable balancing opportunity, because at
3113 * this point the task has the smallest effective memory and cache footprint. 3113 * this point the task has the smallest effective memory and cache footprint.
3114 */ 3114 */
3115 void sched_exec(void) 3115 void sched_exec(void)
3116 { 3116 {
3117 int new_cpu, this_cpu = get_cpu(); 3117 int new_cpu, this_cpu = get_cpu();
3118 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); 3118 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
3119 put_cpu(); 3119 put_cpu();
3120 if (new_cpu != this_cpu) 3120 if (new_cpu != this_cpu)
3121 sched_migrate_task(current, new_cpu); 3121 sched_migrate_task(current, new_cpu);
3122 } 3122 }
3123 3123
3124 /* 3124 /*
3125 * pull_task - move a task from a remote runqueue to the local runqueue. 3125 * pull_task - move a task from a remote runqueue to the local runqueue.
3126 * Both runqueues must be locked. 3126 * Both runqueues must be locked.
3127 */ 3127 */
3128 static void pull_task(struct rq *src_rq, struct task_struct *p, 3128 static void pull_task(struct rq *src_rq, struct task_struct *p,
3129 struct rq *this_rq, int this_cpu) 3129 struct rq *this_rq, int this_cpu)
3130 { 3130 {
3131 deactivate_task(src_rq, p, 0); 3131 deactivate_task(src_rq, p, 0);
3132 set_task_cpu(p, this_cpu); 3132 set_task_cpu(p, this_cpu);
3133 activate_task(this_rq, p, 0); 3133 activate_task(this_rq, p, 0);
3134 /* 3134 /*
3135 * Note that idle threads have a prio of MAX_PRIO, so this test 3135 * Note that idle threads have a prio of MAX_PRIO, so this test
3136 * is always true for them. 3136 * is always true for them.
3137 */ 3137 */
3138 check_preempt_curr(this_rq, p, 0); 3138 check_preempt_curr(this_rq, p, 0);
3139 } 3139 }
3140 3140
3141 /* 3141 /*
3142 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 3142 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3143 */ 3143 */
3144 static 3144 static
3145 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 3145 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3146 struct sched_domain *sd, enum cpu_idle_type idle, 3146 struct sched_domain *sd, enum cpu_idle_type idle,
3147 int *all_pinned) 3147 int *all_pinned)
3148 { 3148 {
3149 int tsk_cache_hot = 0; 3149 int tsk_cache_hot = 0;
3150 /* 3150 /*
3151 * We do not migrate tasks that are: 3151 * We do not migrate tasks that are:
3152 * 1) running (obviously), or 3152 * 1) running (obviously), or
3153 * 2) not allowed to migrate to this CPU due to cpus_allowed, or 3153 * 2) not allowed to migrate to this CPU due to cpus_allowed, or
3154 * 3) cache-hot on their current CPU. 3154 * 3) cache-hot on their current CPU.
3155 */ 3155 */
3156 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { 3156 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3157 schedstat_inc(p, se.nr_failed_migrations_affine); 3157 schedstat_inc(p, se.nr_failed_migrations_affine);
3158 return 0; 3158 return 0;
3159 } 3159 }
3160 *all_pinned = 0; 3160 *all_pinned = 0;
3161 3161
3162 if (task_running(rq, p)) { 3162 if (task_running(rq, p)) {
3163 schedstat_inc(p, se.nr_failed_migrations_running); 3163 schedstat_inc(p, se.nr_failed_migrations_running);
3164 return 0; 3164 return 0;
3165 } 3165 }
3166 3166
3167 /* 3167 /*
3168 * Aggressive migration if: 3168 * Aggressive migration if:
3169 * 1) task is cache cold, or 3169 * 1) task is cache cold, or
3170 * 2) too many balance attempts have failed. 3170 * 2) too many balance attempts have failed.
3171 */ 3171 */
3172 3172
3173 tsk_cache_hot = task_hot(p, rq->clock, sd); 3173 tsk_cache_hot = task_hot(p, rq->clock, sd);
3174 if (!tsk_cache_hot || 3174 if (!tsk_cache_hot ||
3175 sd->nr_balance_failed > sd->cache_nice_tries) { 3175 sd->nr_balance_failed > sd->cache_nice_tries) {
3176 #ifdef CONFIG_SCHEDSTATS 3176 #ifdef CONFIG_SCHEDSTATS
3177 if (tsk_cache_hot) { 3177 if (tsk_cache_hot) {
3178 schedstat_inc(sd, lb_hot_gained[idle]); 3178 schedstat_inc(sd, lb_hot_gained[idle]);
3179 schedstat_inc(p, se.nr_forced_migrations); 3179 schedstat_inc(p, se.nr_forced_migrations);
3180 } 3180 }
3181 #endif 3181 #endif
3182 return 1; 3182 return 1;
3183 } 3183 }
3184 3184
3185 if (tsk_cache_hot) { 3185 if (tsk_cache_hot) {
3186 schedstat_inc(p, se.nr_failed_migrations_hot); 3186 schedstat_inc(p, se.nr_failed_migrations_hot);
3187 return 0; 3187 return 0;
3188 } 3188 }
3189 return 1; 3189 return 1;
3190 } 3190 }
3191 3191
3192 static unsigned long 3192 static unsigned long
3193 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3193 balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3194 unsigned long max_load_move, struct sched_domain *sd, 3194 unsigned long max_load_move, struct sched_domain *sd,
3195 enum cpu_idle_type idle, int *all_pinned, 3195 enum cpu_idle_type idle, int *all_pinned,
3196 int *this_best_prio, struct rq_iterator *iterator) 3196 int *this_best_prio, struct rq_iterator *iterator)
3197 { 3197 {
3198 int loops = 0, pulled = 0, pinned = 0; 3198 int loops = 0, pulled = 0, pinned = 0;
3199 struct task_struct *p; 3199 struct task_struct *p;
3200 long rem_load_move = max_load_move; 3200 long rem_load_move = max_load_move;
3201 3201
3202 if (max_load_move == 0) 3202 if (max_load_move == 0)
3203 goto out; 3203 goto out;
3204 3204
3205 pinned = 1; 3205 pinned = 1;
3206 3206
3207 /* 3207 /*
3208 * Start the load-balancing iterator: 3208 * Start the load-balancing iterator:
3209 */ 3209 */
3210 p = iterator->start(iterator->arg); 3210 p = iterator->start(iterator->arg);
3211 next: 3211 next:
3212 if (!p || loops++ > sysctl_sched_nr_migrate) 3212 if (!p || loops++ > sysctl_sched_nr_migrate)
3213 goto out; 3213 goto out;
3214 3214
3215 if ((p->se.load.weight >> 1) > rem_load_move || 3215 if ((p->se.load.weight >> 1) > rem_load_move ||
3216 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { 3216 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3217 p = iterator->next(iterator->arg); 3217 p = iterator->next(iterator->arg);
3218 goto next; 3218 goto next;
3219 } 3219 }
3220 3220
3221 pull_task(busiest, p, this_rq, this_cpu); 3221 pull_task(busiest, p, this_rq, this_cpu);
3222 pulled++; 3222 pulled++;
3223 rem_load_move -= p->se.load.weight; 3223 rem_load_move -= p->se.load.weight;
3224 3224
3225 #ifdef CONFIG_PREEMPT 3225 #ifdef CONFIG_PREEMPT
3226 /* 3226 /*
3227 * NEWIDLE balancing is a source of latency, so preemptible kernels 3227 * NEWIDLE balancing is a source of latency, so preemptible kernels
3228 * will stop after the first task is pulled to minimize the critical 3228 * will stop after the first task is pulled to minimize the critical
3229 * section. 3229 * section.
3230 */ 3230 */
3231 if (idle == CPU_NEWLY_IDLE) 3231 if (idle == CPU_NEWLY_IDLE)
3232 goto out; 3232 goto out;
3233 #endif 3233 #endif
3234 3234
3235 /* 3235 /*
3236 * We only want to steal up to the prescribed amount of weighted load. 3236 * We only want to steal up to the prescribed amount of weighted load.
3237 */ 3237 */
3238 if (rem_load_move > 0) { 3238 if (rem_load_move > 0) {
3239 if (p->prio < *this_best_prio) 3239 if (p->prio < *this_best_prio)
3240 *this_best_prio = p->prio; 3240 *this_best_prio = p->prio;
3241 p = iterator->next(iterator->arg); 3241 p = iterator->next(iterator->arg);
3242 goto next; 3242 goto next;
3243 } 3243 }
3244 out: 3244 out:
3245 /* 3245 /*
3246 * Right now, this is one of only two places pull_task() is called, 3246 * Right now, this is one of only two places pull_task() is called,
3247 * so we can safely collect pull_task() stats here rather than 3247 * so we can safely collect pull_task() stats here rather than
3248 * inside pull_task(). 3248 * inside pull_task().
3249 */ 3249 */
3250 schedstat_add(sd, lb_gained[idle], pulled); 3250 schedstat_add(sd, lb_gained[idle], pulled);
3251 3251
3252 if (all_pinned) 3252 if (all_pinned)
3253 *all_pinned = pinned; 3253 *all_pinned = pinned;
3254 3254
3255 return max_load_move - rem_load_move; 3255 return max_load_move - rem_load_move;
3256 } 3256 }
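A toy model of the rem_load_move accounting in the loop above (user-space C with made-up task weights; the sysctl_sched_nr_migrate cap and the preemption early-out are left out of the sketch): tasks whose weight exceeds twice the remaining budget are skipped, and pulling stops once the budget is spent.

#include <stdio.h>

struct toy_task {
        const char *name;
        unsigned long weight;
};

/* Pull tasks until the weighted-load budget runs out, skipping any task
 * whose weight is more than twice the remaining budget. */
static unsigned long pull_up_to(const struct toy_task *tasks, int nr,
                                unsigned long max_load_move)
{
        long rem = max_load_move;
        unsigned long moved = 0;
        int i;

        for (i = 0; i < nr && rem > 0; i++) {
                if ((tasks[i].weight >> 1) > (unsigned long)rem)
                        continue;
                rem -= tasks[i].weight;
                moved += tasks[i].weight;
                printf("pulled %s (weight %lu), budget left %ld\n",
                       tasks[i].name, tasks[i].weight, rem);
        }
        return moved;
}

int main(void)
{
        const struct toy_task busiest[] = {
                { "A", 1024 }, { "B", 3072 }, { "C", 1024 }, { "D", 335 },
        };

        pull_up_to(busiest, 4, 2048);   /* pulls A and C, skips B, never reaches D */
        return 0;
}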
3257 3257
3258 /* 3258 /*
3259 * move_tasks tries to move up to max_load_move weighted load from busiest to 3259 * move_tasks tries to move up to max_load_move weighted load from busiest to
3260 * this_rq, as part of a balancing operation within domain "sd". 3260 * this_rq, as part of a balancing operation within domain "sd".
3261 * Returns 1 if successful and 0 otherwise. 3261 * Returns 1 if successful and 0 otherwise.
3262 * 3262 *
3263 * Called with both runqueues locked. 3263 * Called with both runqueues locked.
3264 */ 3264 */
3265 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3265 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3266 unsigned long max_load_move, 3266 unsigned long max_load_move,
3267 struct sched_domain *sd, enum cpu_idle_type idle, 3267 struct sched_domain *sd, enum cpu_idle_type idle,
3268 int *all_pinned) 3268 int *all_pinned)
3269 { 3269 {
3270 const struct sched_class *class = sched_class_highest; 3270 const struct sched_class *class = sched_class_highest;
3271 unsigned long total_load_moved = 0; 3271 unsigned long total_load_moved = 0;
3272 int this_best_prio = this_rq->curr->prio; 3272 int this_best_prio = this_rq->curr->prio;
3273 3273
3274 do { 3274 do {
3275 total_load_moved += 3275 total_load_moved +=
3276 class->load_balance(this_rq, this_cpu, busiest, 3276 class->load_balance(this_rq, this_cpu, busiest,
3277 max_load_move - total_load_moved, 3277 max_load_move - total_load_moved,
3278 sd, idle, all_pinned, &this_best_prio); 3278 sd, idle, all_pinned, &this_best_prio);
3279 class = class->next; 3279 class = class->next;
3280 3280
3281 #ifdef CONFIG_PREEMPT 3281 #ifdef CONFIG_PREEMPT
3282 /* 3282 /*
3283 * NEWIDLE balancing is a source of latency, so preemptible 3283 * NEWIDLE balancing is a source of latency, so preemptible
3284 * kernels will stop after the first task is pulled to minimize 3284 * kernels will stop after the first task is pulled to minimize
3285 * the critical section. 3285 * the critical section.
3286 */ 3286 */
3287 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) 3287 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3288 break; 3288 break;
3289 #endif 3289 #endif
3290 } while (class && max_load_move > total_load_moved); 3290 } while (class && max_load_move > total_load_moved);
3291 3291
3292 return total_load_moved > 0; 3292 return total_load_moved > 0;
3293 } 3293 }
3294 3294
3295 static int 3295 static int
3296 iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, 3296 iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3297 struct sched_domain *sd, enum cpu_idle_type idle, 3297 struct sched_domain *sd, enum cpu_idle_type idle,
3298 struct rq_iterator *iterator) 3298 struct rq_iterator *iterator)
3299 { 3299 {
3300 struct task_struct *p = iterator->start(iterator->arg); 3300 struct task_struct *p = iterator->start(iterator->arg);
3301 int pinned = 0; 3301 int pinned = 0;
3302 3302
3303 while (p) { 3303 while (p) {
3304 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { 3304 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3305 pull_task(busiest, p, this_rq, this_cpu); 3305 pull_task(busiest, p, this_rq, this_cpu);
3306 /* 3306 /*
3307 * Right now, this is only the second place pull_task() 3307 * Right now, this is only the second place pull_task()
3308 * is called, so we can safely collect pull_task() 3308 * is called, so we can safely collect pull_task()
3309 * stats here rather than inside pull_task(). 3309 * stats here rather than inside pull_task().
3310 */ 3310 */
3311 schedstat_inc(sd, lb_gained[idle]); 3311 schedstat_inc(sd, lb_gained[idle]);
3312 3312
3313 return 1; 3313 return 1;
3314 } 3314 }
3315 p = iterator->next(iterator->arg); 3315 p = iterator->next(iterator->arg);
3316 } 3316 }
3317 3317
3318 return 0; 3318 return 0;
3319 } 3319 }
3320 3320
3321 /* 3321 /*
3322 * move_one_task tries to move exactly one task from busiest to this_rq, as 3322 * move_one_task tries to move exactly one task from busiest to this_rq, as
3323 * part of active balancing operations within "domain". 3323 * part of active balancing operations within "domain".
3324 * Returns 1 if successful and 0 otherwise. 3324 * Returns 1 if successful and 0 otherwise.
3325 * 3325 *
3326 * Called with both runqueues locked. 3326 * Called with both runqueues locked.
3327 */ 3327 */
3328 static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, 3328 static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3329 struct sched_domain *sd, enum cpu_idle_type idle) 3329 struct sched_domain *sd, enum cpu_idle_type idle)
3330 { 3330 {
3331 const struct sched_class *class; 3331 const struct sched_class *class;
3332 3332
3333 for_each_class(class) { 3333 for_each_class(class) {
3334 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) 3334 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3335 return 1; 3335 return 1;
3336 } 3336 }
3337 3337
3338 return 0; 3338 return 0;
3339 } 3339 }
3340 /********** Helpers for find_busiest_group ************************/ 3340 /********** Helpers for find_busiest_group ************************/
3341 /* 3341 /*
3342 * sd_lb_stats - Structure to store the statistics of a sched_domain 3342 * sd_lb_stats - Structure to store the statistics of a sched_domain
3343 * during load balancing. 3343 * during load balancing.
3344 */ 3344 */
3345 struct sd_lb_stats { 3345 struct sd_lb_stats {
3346 struct sched_group *busiest; /* Busiest group in this sd */ 3346 struct sched_group *busiest; /* Busiest group in this sd */
3347 struct sched_group *this; /* Local group in this sd */ 3347 struct sched_group *this; /* Local group in this sd */
3348 unsigned long total_load; /* Total load of all groups in sd */ 3348 unsigned long total_load; /* Total load of all groups in sd */
3349 unsigned long total_pwr; /* Total power of all groups in sd */ 3349 unsigned long total_pwr; /* Total power of all groups in sd */
3350 unsigned long avg_load; /* Average load across all groups in sd */ 3350 unsigned long avg_load; /* Average load across all groups in sd */
3351 3351
3352 /** Statistics of this group */ 3352 /** Statistics of this group */
3353 unsigned long this_load; 3353 unsigned long this_load;
3354 unsigned long this_load_per_task; 3354 unsigned long this_load_per_task;
3355 unsigned long this_nr_running; 3355 unsigned long this_nr_running;
3356 3356
3357 /* Statistics of the busiest group */ 3357 /* Statistics of the busiest group */
3358 unsigned long max_load; 3358 unsigned long max_load;
3359 unsigned long busiest_load_per_task; 3359 unsigned long busiest_load_per_task;
3360 unsigned long busiest_nr_running; 3360 unsigned long busiest_nr_running;
3361 3361
3362 int group_imb; /* Is there imbalance in this sd */ 3362 int group_imb; /* Is there imbalance in this sd */
3363 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3363 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3364 int power_savings_balance; /* Is powersave balance needed for this sd */ 3364 int power_savings_balance; /* Is powersave balance needed for this sd */
3365 struct sched_group *group_min; /* Least loaded group in sd */ 3365 struct sched_group *group_min; /* Least loaded group in sd */
3366 struct sched_group *group_leader; /* Group which relieves group_min */ 3366 struct sched_group *group_leader; /* Group which relieves group_min */
3367 unsigned long min_load_per_task; /* load_per_task in group_min */ 3367 unsigned long min_load_per_task; /* load_per_task in group_min */
3368 unsigned long leader_nr_running; /* Nr running of group_leader */ 3368 unsigned long leader_nr_running; /* Nr running of group_leader */
3369 unsigned long min_nr_running; /* Nr running of group_min */ 3369 unsigned long min_nr_running; /* Nr running of group_min */
3370 #endif 3370 #endif
3371 }; 3371 };
3372 3372
3373 /* 3373 /*
3374 * sg_lb_stats - stats of a sched_group required for load_balancing 3374 * sg_lb_stats - stats of a sched_group required for load_balancing
3375 */ 3375 */
3376 struct sg_lb_stats { 3376 struct sg_lb_stats {
3377 unsigned long avg_load; /*Avg load across the CPUs of the group */ 3377 unsigned long avg_load; /*Avg load across the CPUs of the group */
3378 unsigned long group_load; /* Total load over the CPUs of the group */ 3378 unsigned long group_load; /* Total load over the CPUs of the group */
3379 unsigned long sum_nr_running; /* Nr tasks running in the group */ 3379 unsigned long sum_nr_running; /* Nr tasks running in the group */
3380 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 3380 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3381 unsigned long group_capacity; 3381 unsigned long group_capacity;
3382 int group_imb; /* Is there an imbalance in the group ? */ 3382 int group_imb; /* Is there an imbalance in the group ? */
3383 }; 3383 };
3384 3384
3385 /** 3385 /**
3386 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. 3386 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3387 * @group: The group whose first cpu is to be returned. 3387 * @group: The group whose first cpu is to be returned.
3388 */ 3388 */
3389 static inline unsigned int group_first_cpu(struct sched_group *group) 3389 static inline unsigned int group_first_cpu(struct sched_group *group)
3390 { 3390 {
3391 return cpumask_first(sched_group_cpus(group)); 3391 return cpumask_first(sched_group_cpus(group));
3392 } 3392 }
3393 3393
3394 /** 3394 /**
3395 * get_sd_load_idx - Obtain the load index for a given sched domain. 3395 * get_sd_load_idx - Obtain the load index for a given sched domain.
3396 * @sd: The sched_domain whose load_idx is to be obtained. 3396 * @sd: The sched_domain whose load_idx is to be obtained.
3397 * @idle: The idle status of the CPU whose sd's load_idx is being obtained. 3397 * @idle: The idle status of the CPU whose sd's load_idx is being obtained.
3398 */ 3398 */
3399 static inline int get_sd_load_idx(struct sched_domain *sd, 3399 static inline int get_sd_load_idx(struct sched_domain *sd,
3400 enum cpu_idle_type idle) 3400 enum cpu_idle_type idle)
3401 { 3401 {
3402 int load_idx; 3402 int load_idx;
3403 3403
3404 switch (idle) { 3404 switch (idle) {
3405 case CPU_NOT_IDLE: 3405 case CPU_NOT_IDLE:
3406 load_idx = sd->busy_idx; 3406 load_idx = sd->busy_idx;
3407 break; 3407 break;
3408 3408
3409 case CPU_NEWLY_IDLE: 3409 case CPU_NEWLY_IDLE:
3410 load_idx = sd->newidle_idx; 3410 load_idx = sd->newidle_idx;
3411 break; 3411 break;
3412 default: 3412 default:
3413 load_idx = sd->idle_idx; 3413 load_idx = sd->idle_idx;
3414 break; 3414 break;
3415 } 3415 }
3416 3416
3417 return load_idx; 3417 return load_idx;
3418 } 3418 }
3419 3419
3420 3420
3421 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3421 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3422 /** 3422 /**
3423 * init_sd_power_savings_stats - Initialize power savings statistics for 3423 * init_sd_power_savings_stats - Initialize power savings statistics for
3424 * the given sched_domain, during load balancing. 3424 * the given sched_domain, during load balancing.
3425 * 3425 *
3426 * @sd: Sched domain whose power-savings statistics are to be initialized. 3426 * @sd: Sched domain whose power-savings statistics are to be initialized.
3427 * @sds: Variable containing the statistics for sd. 3427 * @sds: Variable containing the statistics for sd.
3428 * @idle: Idle status of the CPU at which we're performing load-balancing. 3428 * @idle: Idle status of the CPU at which we're performing load-balancing.
3429 */ 3429 */
3430 static inline void init_sd_power_savings_stats(struct sched_domain *sd, 3430 static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3431 struct sd_lb_stats *sds, enum cpu_idle_type idle) 3431 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3432 { 3432 {
3433 /* 3433 /*
3434 * Busy processors will not participate in power savings 3434 * Busy processors will not participate in power savings
3435 * balance. 3435 * balance.
3436 */ 3436 */
3437 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 3437 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3438 sds->power_savings_balance = 0; 3438 sds->power_savings_balance = 0;
3439 else { 3439 else {
3440 sds->power_savings_balance = 1; 3440 sds->power_savings_balance = 1;
3441 sds->min_nr_running = ULONG_MAX; 3441 sds->min_nr_running = ULONG_MAX;
3442 sds->leader_nr_running = 0; 3442 sds->leader_nr_running = 0;
3443 } 3443 }
3444 } 3444 }
3445 3445
3446 /** 3446 /**
3447 * update_sd_power_savings_stats - Update the power saving stats for a 3447 * update_sd_power_savings_stats - Update the power saving stats for a
3448 * sched_domain while performing load balancing. 3448 * sched_domain while performing load balancing.
3449 * 3449 *
3450 * @group: sched_group belonging to the sched_domain under consideration. 3450 * @group: sched_group belonging to the sched_domain under consideration.
3451 * @sds: Variable containing the statistics of the sched_domain 3451 * @sds: Variable containing the statistics of the sched_domain
3452 * @local_group: Does group contain the CPU for which we're performing 3452 * @local_group: Does group contain the CPU for which we're performing
3453 * load balancing ? 3453 * load balancing ?
3454 * @sgs: Variable containing the statistics of the group. 3454 * @sgs: Variable containing the statistics of the group.
3455 */ 3455 */
3456 static inline void update_sd_power_savings_stats(struct sched_group *group, 3456 static inline void update_sd_power_savings_stats(struct sched_group *group,
3457 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) 3457 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3458 { 3458 {
3459 3459
3460 if (!sds->power_savings_balance) 3460 if (!sds->power_savings_balance)
3461 return; 3461 return;
3462 3462
3463 /* 3463 /*
3464 * If the local group is idle or completely loaded 3464 * If the local group is idle or completely loaded
3465 * no need to do power savings balance at this domain 3465 * no need to do power savings balance at this domain
3466 */ 3466 */
3467 if (local_group && (sds->this_nr_running >= sgs->group_capacity || 3467 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3468 !sds->this_nr_running)) 3468 !sds->this_nr_running))
3469 sds->power_savings_balance = 0; 3469 sds->power_savings_balance = 0;
3470 3470
3471 /* 3471 /*
3472 * If a group is already running at full capacity or idle, 3472 * If a group is already running at full capacity or idle,
3473 * don't include that group in power savings calculations 3473 * don't include that group in power savings calculations
3474 */ 3474 */
3475 if (!sds->power_savings_balance || 3475 if (!sds->power_savings_balance ||
3476 sgs->sum_nr_running >= sgs->group_capacity || 3476 sgs->sum_nr_running >= sgs->group_capacity ||
3477 !sgs->sum_nr_running) 3477 !sgs->sum_nr_running)
3478 return; 3478 return;
3479 3479
3480 /* 3480 /*
3481 * Calculate the group which has the least non-idle load. 3481 * Calculate the group which has the least non-idle load.
3482 * This is the group from which we need to pick up the load 3482 * This is the group from which we need to pick up the load
3483 * for saving power 3483 * for saving power
3484 */ 3484 */
3485 if ((sgs->sum_nr_running < sds->min_nr_running) || 3485 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3486 (sgs->sum_nr_running == sds->min_nr_running && 3486 (sgs->sum_nr_running == sds->min_nr_running &&
3487 group_first_cpu(group) > group_first_cpu(sds->group_min))) { 3487 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3488 sds->group_min = group; 3488 sds->group_min = group;
3489 sds->min_nr_running = sgs->sum_nr_running; 3489 sds->min_nr_running = sgs->sum_nr_running;
3490 sds->min_load_per_task = sgs->sum_weighted_load / 3490 sds->min_load_per_task = sgs->sum_weighted_load /
3491 sgs->sum_nr_running; 3491 sgs->sum_nr_running;
3492 } 3492 }
3493 3493
3494 /* 3494 /*
3495 * Calculate the group which is nearly at its 3495 * Calculate the group which is nearly at its
3496 * capacity but still has some space to pick up load 3496 * capacity but still has some space to pick up load
3497 * from another group and save more power 3497 * from another group and save more power
3498 */ 3498 */
3499 if (sgs->sum_nr_running + 1 > sgs->group_capacity) 3499 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3500 return; 3500 return;
3501 3501
3502 if (sgs->sum_nr_running > sds->leader_nr_running || 3502 if (sgs->sum_nr_running > sds->leader_nr_running ||
3503 (sgs->sum_nr_running == sds->leader_nr_running && 3503 (sgs->sum_nr_running == sds->leader_nr_running &&
3504 group_first_cpu(group) < group_first_cpu(sds->group_leader))) { 3504 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3505 sds->group_leader = group; 3505 sds->group_leader = group;
3506 sds->leader_nr_running = sgs->sum_nr_running; 3506 sds->leader_nr_running = sgs->sum_nr_running;
3507 } 3507 }
3508 } 3508 }
3509 3509
3510 /** 3510 /**
3511 * check_power_save_busiest_group - see if there is potential for some power-savings balance 3511 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3512 * @sds: Variable containing the statistics of the sched_domain 3512 * @sds: Variable containing the statistics of the sched_domain
3513 * under consideration. 3513 * under consideration.
3514 * @this_cpu: Cpu at which we're currently performing load-balancing. 3514 * @this_cpu: Cpu at which we're currently performing load-balancing.
3515 * @imbalance: Variable to store the imbalance. 3515 * @imbalance: Variable to store the imbalance.
3516 * 3516 *
3517 * Description: 3517 * Description:
3518 * Check if we have potential to perform some power-savings balance. 3518 * Check if we have potential to perform some power-savings balance.
3519 * If yes, set the busiest group to be the least loaded group in the 3519 * If yes, set the busiest group to be the least loaded group in the
3520 * sched_domain, so that its CPUs can be put to idle. 3520 * sched_domain, so that its CPUs can be put to idle.
3521 * 3521 *
3522 * Returns 1 if there is potential to perform power-savings balance. 3522 * Returns 1 if there is potential to perform power-savings balance.
3523 * Else returns 0. 3523 * Else returns 0.
3524 */ 3524 */
3525 static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, 3525 static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3526 int this_cpu, unsigned long *imbalance) 3526 int this_cpu, unsigned long *imbalance)
3527 { 3527 {
3528 if (!sds->power_savings_balance) 3528 if (!sds->power_savings_balance)
3529 return 0; 3529 return 0;
3530 3530
3531 if (sds->this != sds->group_leader || 3531 if (sds->this != sds->group_leader ||
3532 sds->group_leader == sds->group_min) 3532 sds->group_leader == sds->group_min)
3533 return 0; 3533 return 0;
3534 3534
3535 *imbalance = sds->min_load_per_task; 3535 *imbalance = sds->min_load_per_task;
3536 sds->busiest = sds->group_min; 3536 sds->busiest = sds->group_min;
3537 3537
3538 return 1; 3538 return 1;
3539 3539
3540 } 3540 }
3541 #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 3541 #else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3542 static inline void init_sd_power_savings_stats(struct sched_domain *sd, 3542 static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3543 struct sd_lb_stats *sds, enum cpu_idle_type idle) 3543 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3544 { 3544 {
3545 return; 3545 return;
3546 } 3546 }
3547 3547
3548 static inline void update_sd_power_savings_stats(struct sched_group *group, 3548 static inline void update_sd_power_savings_stats(struct sched_group *group,
3549 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) 3549 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3550 { 3550 {
3551 return; 3551 return;
3552 } 3552 }
3553 3553
3554 static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, 3554 static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3555 int this_cpu, unsigned long *imbalance) 3555 int this_cpu, unsigned long *imbalance)
3556 { 3556 {
3557 return 0; 3557 return 0;
3558 } 3558 }
3559 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 3559 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3560 3560
3561 3561
3562 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 3562 unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3563 { 3563 {
3564 return SCHED_LOAD_SCALE; 3564 return SCHED_LOAD_SCALE;
3565 } 3565 }
3566 3566
3567 unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) 3567 unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3568 { 3568 {
3569 return default_scale_freq_power(sd, cpu); 3569 return default_scale_freq_power(sd, cpu);
3570 } 3570 }
3571 3571
3572 unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) 3572 unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3573 { 3573 {
3574 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 3574 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3575 unsigned long smt_gain = sd->smt_gain; 3575 unsigned long smt_gain = sd->smt_gain;
3576 3576
3577 smt_gain /= weight; 3577 smt_gain /= weight;
3578 3578
3579 return smt_gain; 3579 return smt_gain;
3580 } 3580 }
3581 3581
3582 unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) 3582 unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3583 { 3583 {
3584 return default_scale_smt_power(sd, cpu); 3584 return default_scale_smt_power(sd, cpu);
3585 } 3585 }
3586 3586
3587 unsigned long scale_rt_power(int cpu) 3587 unsigned long scale_rt_power(int cpu)
3588 { 3588 {
3589 struct rq *rq = cpu_rq(cpu); 3589 struct rq *rq = cpu_rq(cpu);
3590 u64 total, available; 3590 u64 total, available;
3591 3591
3592 sched_avg_update(rq); 3592 sched_avg_update(rq);
3593 3593
3594 total = sched_avg_period() + (rq->clock - rq->age_stamp); 3594 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3595 available = total - rq->rt_avg; 3595 available = total - rq->rt_avg;
3596 3596
3597 if (unlikely((s64)total < SCHED_LOAD_SCALE)) 3597 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3598 total = SCHED_LOAD_SCALE; 3598 total = SCHED_LOAD_SCALE;
3599 3599
3600 total >>= SCHED_LOAD_SHIFT; 3600 total >>= SCHED_LOAD_SHIFT;
3601 3601
3602 return div_u64(available, total); 3602 return div_u64(available, total);
3603 } 3603 }
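In effect scale_rt_power() returns the fraction of the averaging window left over after real-time activity, expressed in SCHED_LOAD_SCALE units. A stand-alone sketch with invented figures (a window where 25% of the time went to RT work leaves roughly 768/1024 of the CPU for fair tasks):

#include <stdio.h>
#include <stdint.h>

#define SCHED_LOAD_SHIFT        10
#define SCHED_LOAD_SCALE        (1ULL << SCHED_LOAD_SHIFT)

/* Stand-alone model of scale_rt_power(): share of the window not used
 * by RT tasks, scaled to SCHED_LOAD_SCALE. */
static uint64_t model_scale_rt_power(uint64_t total, uint64_t rt_avg)
{
        uint64_t available = total - rt_avg;

        if ((int64_t)total < (int64_t)SCHED_LOAD_SCALE)
                total = SCHED_LOAD_SCALE;
        total >>= SCHED_LOAD_SHIFT;

        return available / total;
}

int main(void)
{
        /* A 1ms window with 0.25ms of RT execution: ~0.75 * 1024 = 768. */
        printf("%llu\n",
               (unsigned long long)model_scale_rt_power(1000000, 250000));
        return 0;
}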
3604 3604
3605 static void update_cpu_power(struct sched_domain *sd, int cpu) 3605 static void update_cpu_power(struct sched_domain *sd, int cpu)
3606 { 3606 {
3607 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 3607 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3608 unsigned long power = SCHED_LOAD_SCALE; 3608 unsigned long power = SCHED_LOAD_SCALE;
3609 struct sched_group *sdg = sd->groups; 3609 struct sched_group *sdg = sd->groups;
3610 3610
3611 if (sched_feat(ARCH_POWER)) 3611 if (sched_feat(ARCH_POWER))
3612 power *= arch_scale_freq_power(sd, cpu); 3612 power *= arch_scale_freq_power(sd, cpu);
3613 else 3613 else
3614 power *= default_scale_freq_power(sd, cpu); 3614 power *= default_scale_freq_power(sd, cpu);
3615 3615
3616 power >>= SCHED_LOAD_SHIFT; 3616 power >>= SCHED_LOAD_SHIFT;
3617 3617
3618 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 3618 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3619 if (sched_feat(ARCH_POWER)) 3619 if (sched_feat(ARCH_POWER))
3620 power *= arch_scale_smt_power(sd, cpu); 3620 power *= arch_scale_smt_power(sd, cpu);
3621 else 3621 else
3622 power *= default_scale_smt_power(sd, cpu); 3622 power *= default_scale_smt_power(sd, cpu);
3623 3623
3624 power >>= SCHED_LOAD_SHIFT; 3624 power >>= SCHED_LOAD_SHIFT;
3625 } 3625 }
3626 3626
3627 power *= scale_rt_power(cpu); 3627 power *= scale_rt_power(cpu);
3628 power >>= SCHED_LOAD_SHIFT; 3628 power >>= SCHED_LOAD_SHIFT;
3629 3629
3630 if (!power) 3630 if (!power)
3631 power = 1; 3631 power = 1;
3632 3632
3633 sdg->cpu_power = power; 3633 sdg->cpu_power = power;
3634 } 3634 }
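The three scalings above are chained fixed-point multiplications, each factor expressed in SCHED_LOAD_SCALE units. A worked example with assumed figures (full frequency, an SMT sibling sharing an assumed smt_gain of 1178 with one other thread, and the 768 RT leftover from the previous sketch):

#include <stdio.h>

#define SCHED_LOAD_SHIFT        10
#define SCHED_LOAD_SCALE        (1UL << SCHED_LOAD_SHIFT)

int main(void)
{
        unsigned long power = SCHED_LOAD_SCALE;
        unsigned long freq_scale = 1024;        /* running at full frequency    */
        unsigned long smt_scale = 1178 / 2;     /* assumed smt_gain, 2 siblings */
        unsigned long rt_scale = 768;           /* ~25% of time spent in RT     */

        /* Same shape as update_cpu_power(): multiply, then renormalise. */
        power = (power * freq_scale) >> SCHED_LOAD_SHIFT;
        power = (power * smt_scale) >> SCHED_LOAD_SHIFT;
        power = (power * rt_scale) >> SCHED_LOAD_SHIFT;
        if (!power)
                power = 1;

        printf("cpu_power = %lu\n", power);     /* 441 on these figures */
        return 0;
}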
3635 3635
3636 static void update_group_power(struct sched_domain *sd, int cpu) 3636 static void update_group_power(struct sched_domain *sd, int cpu)
3637 { 3637 {
3638 struct sched_domain *child = sd->child; 3638 struct sched_domain *child = sd->child;
3639 struct sched_group *group, *sdg = sd->groups; 3639 struct sched_group *group, *sdg = sd->groups;
3640 unsigned long power; 3640 unsigned long power;
3641 3641
3642 if (!child) { 3642 if (!child) {
3643 update_cpu_power(sd, cpu); 3643 update_cpu_power(sd, cpu);
3644 return; 3644 return;
3645 } 3645 }
3646 3646
3647 power = 0; 3647 power = 0;
3648 3648
3649 group = child->groups; 3649 group = child->groups;
3650 do { 3650 do {
3651 power += group->cpu_power; 3651 power += group->cpu_power;
3652 group = group->next; 3652 group = group->next;
3653 } while (group != child->groups); 3653 } while (group != child->groups);
3654 3654
3655 sdg->cpu_power = power; 3655 sdg->cpu_power = power;
3656 } 3656 }
3657 3657
3658 /** 3658 /**
3659 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3659 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3660 * @sd: The sched_domain whose statistics are to be updated. 3660 * @sd: The sched_domain whose statistics are to be updated.
3661 * @group: sched_group whose statistics are to be updated. 3661 * @group: sched_group whose statistics are to be updated.
3662 * @this_cpu: Cpu for which load balance is currently performed. 3662 * @this_cpu: Cpu for which load balance is currently performed.
3663 * @idle: Idle status of this_cpu 3663 * @idle: Idle status of this_cpu
3664 * @load_idx: Load index of sched_domain of this_cpu for load calc. 3664 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3665 * @sd_idle: Idle status of the sched_domain containing group. 3665 * @sd_idle: Idle status of the sched_domain containing group.
3666 * @local_group: Does group contain this_cpu. 3666 * @local_group: Does group contain this_cpu.
3667 * @cpus: Set of cpus considered for load balancing. 3667 * @cpus: Set of cpus considered for load balancing.
3668 * @balance: Should we balance. 3668 * @balance: Should we balance.
3669 * @sgs: variable to hold the statistics for this group. 3669 * @sgs: variable to hold the statistics for this group.
3670 */ 3670 */
3671 static inline void update_sg_lb_stats(struct sched_domain *sd, 3671 static inline void update_sg_lb_stats(struct sched_domain *sd,
3672 struct sched_group *group, int this_cpu, 3672 struct sched_group *group, int this_cpu,
3673 enum cpu_idle_type idle, int load_idx, int *sd_idle, 3673 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3674 int local_group, const struct cpumask *cpus, 3674 int local_group, const struct cpumask *cpus,
3675 int *balance, struct sg_lb_stats *sgs) 3675 int *balance, struct sg_lb_stats *sgs)
3676 { 3676 {
3677 unsigned long load, max_cpu_load, min_cpu_load; 3677 unsigned long load, max_cpu_load, min_cpu_load;
3678 int i; 3678 int i;
3679 unsigned int balance_cpu = -1, first_idle_cpu = 0; 3679 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3680 unsigned long sum_avg_load_per_task; 3680 unsigned long sum_avg_load_per_task;
3681 unsigned long avg_load_per_task; 3681 unsigned long avg_load_per_task;
3682 3682
3683 if (local_group) { 3683 if (local_group) {
3684 balance_cpu = group_first_cpu(group); 3684 balance_cpu = group_first_cpu(group);
3685 if (balance_cpu == this_cpu) 3685 if (balance_cpu == this_cpu)
3686 update_group_power(sd, this_cpu); 3686 update_group_power(sd, this_cpu);
3687 } 3687 }
3688 3688
3689 /* Tally up the load of all CPUs in the group */ 3689 /* Tally up the load of all CPUs in the group */
3690 sum_avg_load_per_task = avg_load_per_task = 0; 3690 sum_avg_load_per_task = avg_load_per_task = 0;
3691 max_cpu_load = 0; 3691 max_cpu_load = 0;
3692 min_cpu_load = ~0UL; 3692 min_cpu_load = ~0UL;
3693 3693
3694 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 3694 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3695 struct rq *rq = cpu_rq(i); 3695 struct rq *rq = cpu_rq(i);
3696 3696
3697 if (*sd_idle && rq->nr_running) 3697 if (*sd_idle && rq->nr_running)
3698 *sd_idle = 0; 3698 *sd_idle = 0;
3699 3699
3700 /* Bias balancing toward cpus of our domain */ 3700 /* Bias balancing toward cpus of our domain */
3701 if (local_group) { 3701 if (local_group) {
3702 if (idle_cpu(i) && !first_idle_cpu) { 3702 if (idle_cpu(i) && !first_idle_cpu) {
3703 first_idle_cpu = 1; 3703 first_idle_cpu = 1;
3704 balance_cpu = i; 3704 balance_cpu = i;
3705 } 3705 }
3706 3706
3707 load = target_load(i, load_idx); 3707 load = target_load(i, load_idx);
3708 } else { 3708 } else {
3709 load = source_load(i, load_idx); 3709 load = source_load(i, load_idx);
3710 if (load > max_cpu_load) 3710 if (load > max_cpu_load)
3711 max_cpu_load = load; 3711 max_cpu_load = load;
3712 if (min_cpu_load > load) 3712 if (min_cpu_load > load)
3713 min_cpu_load = load; 3713 min_cpu_load = load;
3714 } 3714 }
3715 3715
3716 sgs->group_load += load; 3716 sgs->group_load += load;
3717 sgs->sum_nr_running += rq->nr_running; 3717 sgs->sum_nr_running += rq->nr_running;
3718 sgs->sum_weighted_load += weighted_cpuload(i); 3718 sgs->sum_weighted_load += weighted_cpuload(i);
3719 3719
3720 sum_avg_load_per_task += cpu_avg_load_per_task(i); 3720 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3721 } 3721 }
3722 3722
3723 /* 3723 /*
3724 * The first idle cpu or the first cpu (busiest) in this sched group 3724 * The first idle cpu or the first cpu (busiest) in this sched group
3725 * is eligible for doing load balancing at this and above 3725 * is eligible for doing load balancing at this and above
3726 * domains. In the newly idle case, we will allow all the cpus 3726 * domains. In the newly idle case, we will allow all the cpus
3727 * to do the newly idle load balance. 3727 * to do the newly idle load balance.
3728 */ 3728 */
3729 if (idle != CPU_NEWLY_IDLE && local_group && 3729 if (idle != CPU_NEWLY_IDLE && local_group &&
3730 balance_cpu != this_cpu && balance) { 3730 balance_cpu != this_cpu && balance) {
3731 *balance = 0; 3731 *balance = 0;
3732 return; 3732 return;
3733 } 3733 }
3734 3734
3735 /* Adjust by relative CPU power of the group */ 3735 /* Adjust by relative CPU power of the group */
3736 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; 3736 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3737 3737
3738 3738
3739 /* 3739 /*
3740 * Consider the group unbalanced when the imbalance is larger 3740 * Consider the group unbalanced when the imbalance is larger
3741 * than the average weight of two tasks. 3741 * than the average weight of two tasks.
3742 * 3742 *
3743 * APZ: with cgroup the avg task weight can vary wildly and 3743 * APZ: with cgroup the avg task weight can vary wildly and
3744 * might not be a suitable number - should we keep a 3744 * might not be a suitable number - should we keep a
3745 * normalized nr_running number somewhere that negates 3745 * normalized nr_running number somewhere that negates
3746 * the hierarchy? 3746 * the hierarchy?
3747 */ 3747 */
3748 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) / 3748 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3749 group->cpu_power; 3749 group->cpu_power;
3750 3750
3751 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 3751 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3752 sgs->group_imb = 1; 3752 sgs->group_imb = 1;
3753 3753
3754 sgs->group_capacity = 3754 sgs->group_capacity =
3755 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 3755 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3756 } 3756 }
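A worked example of the per-group figures derived above, using made-up numbers for a two-CPU group (user-space sketch; DIV_ROUND_CLOSEST is open-coded):

#include <stdio.h>

#define SCHED_LOAD_SCALE        1024UL

int main(void)
{
        /* Invented figures for a two-CPU group. */
        unsigned long cpu_power = 2048;         /* sum of both CPUs' power    */
        unsigned long group_load = 3072;        /* sum of both CPUs' loads    */
        unsigned long max_cpu_load = 2560;      /* heaviest CPU in the group  */
        unsigned long min_cpu_load = 512;       /* lightest CPU in the group  */
        unsigned long avg_load_per_task = 512;

        /* Group load normalised by the group's relative CPU power. */
        unsigned long avg_load = group_load * SCHED_LOAD_SCALE / cpu_power;

        /* Internally imbalanced: the spread between the group's CPUs is
         * larger than twice the average task weight. */
        int group_imb = (max_cpu_load - min_cpu_load) > 2 * avg_load_per_task;

        /* DIV_ROUND_CLOSEST(cpu_power, SCHED_LOAD_SCALE), open-coded. */
        unsigned long capacity = (cpu_power + SCHED_LOAD_SCALE / 2)
                                        / SCHED_LOAD_SCALE;

        printf("avg_load=%lu group_imb=%d capacity=%lu\n",
               avg_load, group_imb, capacity);  /* 1536, 1, 2 */
        return 0;
}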
3757 3757
3758 /** 3758 /**
3759 * update_sd_lb_stats - Update sched_group's statistics for load balancing. 3759 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
3760 * @sd: sched_domain whose statistics are to be updated. 3760 * @sd: sched_domain whose statistics are to be updated.
3761 * @this_cpu: Cpu for which load balance is currently performed. 3761 * @this_cpu: Cpu for which load balance is currently performed.
3762 * @idle: Idle status of this_cpu 3762 * @idle: Idle status of this_cpu
3763 * @sd_idle: Idle status of the sched_domain containing group. 3763 * @sd_idle: Idle status of the sched_domain containing group.
3764 * @cpus: Set of cpus considered for load balancing. 3764 * @cpus: Set of cpus considered for load balancing.
3765 * @balance: Should we balance. 3765 * @balance: Should we balance.
3766 * @sds: variable to hold the statistics for this sched_domain. 3766 * @sds: variable to hold the statistics for this sched_domain.
3767 */ 3767 */
3768 static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, 3768 static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3769 enum cpu_idle_type idle, int *sd_idle, 3769 enum cpu_idle_type idle, int *sd_idle,
3770 const struct cpumask *cpus, int *balance, 3770 const struct cpumask *cpus, int *balance,
3771 struct sd_lb_stats *sds) 3771 struct sd_lb_stats *sds)
3772 { 3772 {
3773 struct sched_domain *child = sd->child; 3773 struct sched_domain *child = sd->child;
3774 struct sched_group *group = sd->groups; 3774 struct sched_group *group = sd->groups;
3775 struct sg_lb_stats sgs; 3775 struct sg_lb_stats sgs;
3776 int load_idx, prefer_sibling = 0; 3776 int load_idx, prefer_sibling = 0;
3777 3777
3778 if (child && child->flags & SD_PREFER_SIBLING) 3778 if (child && child->flags & SD_PREFER_SIBLING)
3779 prefer_sibling = 1; 3779 prefer_sibling = 1;
3780 3780
3781 init_sd_power_savings_stats(sd, sds, idle); 3781 init_sd_power_savings_stats(sd, sds, idle);
3782 load_idx = get_sd_load_idx(sd, idle); 3782 load_idx = get_sd_load_idx(sd, idle);
3783 3783
3784 do { 3784 do {
3785 int local_group; 3785 int local_group;
3786 3786
3787 local_group = cpumask_test_cpu(this_cpu, 3787 local_group = cpumask_test_cpu(this_cpu,
3788 sched_group_cpus(group)); 3788 sched_group_cpus(group));
3789 memset(&sgs, 0, sizeof(sgs)); 3789 memset(&sgs, 0, sizeof(sgs));
3790 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, 3790 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3791 local_group, cpus, balance, &sgs); 3791 local_group, cpus, balance, &sgs);
3792 3792
3793 if (local_group && balance && !(*balance)) 3793 if (local_group && balance && !(*balance))
3794 return; 3794 return;
3795 3795
3796 sds->total_load += sgs.group_load; 3796 sds->total_load += sgs.group_load;
3797 sds->total_pwr += group->cpu_power; 3797 sds->total_pwr += group->cpu_power;
3798 3798
3799 /* 3799 /*
3800 * In case the child domain prefers tasks go to siblings 3800 * In case the child domain prefers tasks go to siblings
3801 * first, lower the group capacity to one so that we'll try 3801 * first, lower the group capacity to one so that we'll try
3802 * and move all the excess tasks away. 3802 * and move all the excess tasks away.
3803 */ 3803 */
3804 if (prefer_sibling) 3804 if (prefer_sibling)
3805 sgs.group_capacity = min(sgs.group_capacity, 1UL); 3805 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3806 3806
3807 if (local_group) { 3807 if (local_group) {
3808 sds->this_load = sgs.avg_load; 3808 sds->this_load = sgs.avg_load;
3809 sds->this = group; 3809 sds->this = group;
3810 sds->this_nr_running = sgs.sum_nr_running; 3810 sds->this_nr_running = sgs.sum_nr_running;
3811 sds->this_load_per_task = sgs.sum_weighted_load; 3811 sds->this_load_per_task = sgs.sum_weighted_load;
3812 } else if (sgs.avg_load > sds->max_load && 3812 } else if (sgs.avg_load > sds->max_load &&
3813 (sgs.sum_nr_running > sgs.group_capacity || 3813 (sgs.sum_nr_running > sgs.group_capacity ||
3814 sgs.group_imb)) { 3814 sgs.group_imb)) {
3815 sds->max_load = sgs.avg_load; 3815 sds->max_load = sgs.avg_load;
3816 sds->busiest = group; 3816 sds->busiest = group;
3817 sds->busiest_nr_running = sgs.sum_nr_running; 3817 sds->busiest_nr_running = sgs.sum_nr_running;
3818 sds->busiest_load_per_task = sgs.sum_weighted_load; 3818 sds->busiest_load_per_task = sgs.sum_weighted_load;
3819 sds->group_imb = sgs.group_imb; 3819 sds->group_imb = sgs.group_imb;
3820 } 3820 }
3821 3821
3822 update_sd_power_savings_stats(group, sds, local_group, &sgs); 3822 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3823 group = group->next; 3823 group = group->next;
3824 } while (group != sd->groups); 3824 } while (group != sd->groups);
3825 } 3825 }
3826 3826
3827 /** 3827 /**
3828 * fix_small_imbalance - Calculate the minor imbalance that exists 3828 * fix_small_imbalance - Calculate the minor imbalance that exists
3829 * amongst the groups of a sched_domain, during 3829 * amongst the groups of a sched_domain, during
3830 * load balancing. 3830 * load balancing.
3831 * @sds: Statistics of the sched_domain whose imbalance is to be calculated. 3831 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3832 * @this_cpu: The cpu at whose sched_domain we're performing load-balance. 3832 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3833 * @imbalance: Variable to store the imbalance. 3833 * @imbalance: Variable to store the imbalance.
3834 */ 3834 */
3835 static inline void fix_small_imbalance(struct sd_lb_stats *sds, 3835 static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3836 int this_cpu, unsigned long *imbalance) 3836 int this_cpu, unsigned long *imbalance)
3837 { 3837 {
3838 unsigned long tmp, pwr_now = 0, pwr_move = 0; 3838 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3839 unsigned int imbn = 2; 3839 unsigned int imbn = 2;
3840 3840
3841 if (sds->this_nr_running) { 3841 if (sds->this_nr_running) {
3842 sds->this_load_per_task /= sds->this_nr_running; 3842 sds->this_load_per_task /= sds->this_nr_running;
3843 if (sds->busiest_load_per_task > 3843 if (sds->busiest_load_per_task >
3844 sds->this_load_per_task) 3844 sds->this_load_per_task)
3845 imbn = 1; 3845 imbn = 1;
3846 } else 3846 } else
3847 sds->this_load_per_task = 3847 sds->this_load_per_task =
3848 cpu_avg_load_per_task(this_cpu); 3848 cpu_avg_load_per_task(this_cpu);
3849 3849
3850 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >= 3850 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3851 sds->busiest_load_per_task * imbn) { 3851 sds->busiest_load_per_task * imbn) {
3852 *imbalance = sds->busiest_load_per_task; 3852 *imbalance = sds->busiest_load_per_task;
3853 return; 3853 return;
3854 } 3854 }
3855 3855
3856 /* 3856 /*
3857 * OK, we don't have enough imbalance to justify moving tasks; 3857 * OK, we don't have enough imbalance to justify moving tasks;
3858 * however, we may be able to increase the total CPU power used by 3858 * however, we may be able to increase the total CPU power used by
3859 * moving them. 3859 * moving them.
3860 */ 3860 */
3861 3861
3862 pwr_now += sds->busiest->cpu_power * 3862 pwr_now += sds->busiest->cpu_power *
3863 min(sds->busiest_load_per_task, sds->max_load); 3863 min(sds->busiest_load_per_task, sds->max_load);
3864 pwr_now += sds->this->cpu_power * 3864 pwr_now += sds->this->cpu_power *
3865 min(sds->this_load_per_task, sds->this_load); 3865 min(sds->this_load_per_task, sds->this_load);
3866 pwr_now /= SCHED_LOAD_SCALE; 3866 pwr_now /= SCHED_LOAD_SCALE;
3867 3867
3868 /* Amount of load we'd subtract */ 3868 /* Amount of load we'd subtract */
3869 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / 3869 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3870 sds->busiest->cpu_power; 3870 sds->busiest->cpu_power;
3871 if (sds->max_load > tmp) 3871 if (sds->max_load > tmp)
3872 pwr_move += sds->busiest->cpu_power * 3872 pwr_move += sds->busiest->cpu_power *
3873 min(sds->busiest_load_per_task, sds->max_load - tmp); 3873 min(sds->busiest_load_per_task, sds->max_load - tmp);
3874 3874
3875 /* Amount of load we'd add */ 3875 /* Amount of load we'd add */
3876 if (sds->max_load * sds->busiest->cpu_power < 3876 if (sds->max_load * sds->busiest->cpu_power <
3877 sds->busiest_load_per_task * SCHED_LOAD_SCALE) 3877 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3878 tmp = (sds->max_load * sds->busiest->cpu_power) / 3878 tmp = (sds->max_load * sds->busiest->cpu_power) /
3879 sds->this->cpu_power; 3879 sds->this->cpu_power;
3880 else 3880 else
3881 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / 3881 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3882 sds->this->cpu_power; 3882 sds->this->cpu_power;
3883 pwr_move += sds->this->cpu_power * 3883 pwr_move += sds->this->cpu_power *
3884 min(sds->this_load_per_task, sds->this_load + tmp); 3884 min(sds->this_load_per_task, sds->this_load + tmp);
3885 pwr_move /= SCHED_LOAD_SCALE; 3885 pwr_move /= SCHED_LOAD_SCALE;
3886 3886
3887 /* Move if we gain throughput */ 3887 /* Move if we gain throughput */
3888 if (pwr_move > pwr_now) 3888 if (pwr_move > pwr_now)
3889 *imbalance = sds->busiest_load_per_task; 3889 *imbalance = sds->busiest_load_per_task;
3890 } 3890 }
3891 3891
3892 /** 3892 /**
3893 * calculate_imbalance - Calculate the amount of imbalance present within the 3893 * calculate_imbalance - Calculate the amount of imbalance present within the
3894 * groups of a given sched_domain during load balance. 3894 * groups of a given sched_domain during load balance.
3895 * @sds: statistics of the sched_domain whose imbalance is to be calculated. 3895 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3896 * @this_cpu: Cpu for which currently load balance is being performed. 3896 * @this_cpu: Cpu for which currently load balance is being performed.
3897 * @imbalance: The variable to store the imbalance. 3897 * @imbalance: The variable to store the imbalance.
3898 */ 3898 */
3899 static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, 3899 static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3900 unsigned long *imbalance) 3900 unsigned long *imbalance)
3901 { 3901 {
3902 unsigned long max_pull; 3902 unsigned long max_pull;
3903 /* 3903 /*
3904 * In the presence of smp nice balancing, certain scenarios can have 3904 * In the presence of smp nice balancing, certain scenarios can have
3905 * max load less than avg load (as we skip the groups at or below 3905 * max load less than avg load (as we skip the groups at or below
3906 * their cpu_power while calculating max_load). 3906 * their cpu_power while calculating max_load).
3907 */ 3907 */
3908 if (sds->max_load < sds->avg_load) { 3908 if (sds->max_load < sds->avg_load) {
3909 *imbalance = 0; 3909 *imbalance = 0;
3910 return fix_small_imbalance(sds, this_cpu, imbalance); 3910 return fix_small_imbalance(sds, this_cpu, imbalance);
3911 } 3911 }
3912 3912
3913 /* Don't want to pull so many tasks that a group would go idle */ 3913 /* Don't want to pull so many tasks that a group would go idle */
3914 max_pull = min(sds->max_load - sds->avg_load, 3914 max_pull = min(sds->max_load - sds->avg_load,
3915 sds->max_load - sds->busiest_load_per_task); 3915 sds->max_load - sds->busiest_load_per_task);
3916 3916
3917 /* How much load to actually move to equalise the imbalance */ 3917 /* How much load to actually move to equalise the imbalance */
3918 *imbalance = min(max_pull * sds->busiest->cpu_power, 3918 *imbalance = min(max_pull * sds->busiest->cpu_power,
3919 (sds->avg_load - sds->this_load) * sds->this->cpu_power) 3919 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3920 / SCHED_LOAD_SCALE; 3920 / SCHED_LOAD_SCALE;
3921 3921
3922 /* 3922 /*
3923 * if *imbalance is less than the average load per runnable task, 3923 * if *imbalance is less than the average load per runnable task,
3924 * there is no guarantee that any tasks will be moved, so we'll have 3924 * there is no guarantee that any tasks will be moved, so we'll have
3925 * a think about bumping its value to force at least one task to be 3925 * a think about bumping its value to force at least one task to be
3926 * moved 3926 * moved
3927 */ 3927 */
3928 if (*imbalance < sds->busiest_load_per_task) 3928 if (*imbalance < sds->busiest_load_per_task)
3929 return fix_small_imbalance(sds, this_cpu, imbalance); 3929 return fix_small_imbalance(sds, this_cpu, imbalance);
3930 3930
3931 } 3931 }
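With assumed numbers, the two min() terms above work out as follows (stand-alone sketch, invented figures); since the result here, 512, is below busiest_load_per_task, the real code would then fall through to fix_small_imbalance():

#include <stdio.h>

#define SCHED_LOAD_SCALE        1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        /* Invented per-group figures, all in weighted-load units. */
        unsigned long max_load = 2048;          /* busiest group's avg load */
        unsigned long avg_load = 1536;          /* domain-wide average      */
        unsigned long this_load = 1024;         /* local group's avg load   */
        unsigned long busiest_per_task = 1024;  /* avg task weight, busiest */
        unsigned long busiest_power = 1024;
        unsigned long this_power = 1024;
        unsigned long max_pull, imbalance;

        /* Don't pull so much that the busiest group would go idle. */
        max_pull = min_ul(max_load - avg_load, max_load - busiest_per_task);

        /* Weighted load to move to equalise the imbalance. */
        imbalance = min_ul(max_pull * busiest_power,
                           (avg_load - this_load) * this_power)
                        / SCHED_LOAD_SCALE;

        printf("max_pull=%lu imbalance=%lu\n", max_pull, imbalance);    /* 512, 512 */
        return 0;
}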
3932 /******* find_busiest_group() helpers end here *********************/ 3932 /******* find_busiest_group() helpers end here *********************/
3933 3933
3934 /** 3934 /**
3935 * find_busiest_group - Returns the busiest group within the sched_domain 3935 * find_busiest_group - Returns the busiest group within the sched_domain
3936 * if there is an imbalance. If there isn't an imbalance, and 3936 * if there is an imbalance. If there isn't an imbalance, and
3937 * the user has opted for power-savings, it returns a group whose 3937 * the user has opted for power-savings, it returns a group whose
3938 * CPUs can be put to idle by rebalancing those tasks elsewhere, if 3938 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
3939 * such a group exists. 3939 * such a group exists.
3940 * 3940 *
3941 * Also calculates the amount of weighted load which should be moved 3941 * Also calculates the amount of weighted load which should be moved
3942 * to restore balance. 3942 * to restore balance.
3943 * 3943 *
3944 * @sd: The sched_domain whose busiest group is to be returned. 3944 * @sd: The sched_domain whose busiest group is to be returned.
3945 * @this_cpu: The cpu for which load balancing is currently being performed. 3945 * @this_cpu: The cpu for which load balancing is currently being performed.
3946 * @imbalance: Variable which stores amount of weighted load which should 3946 * @imbalance: Variable which stores amount of weighted load which should
3947 * be moved to restore balance/put a group to idle. 3947 * be moved to restore balance/put a group to idle.
3948 * @idle: The idle status of this_cpu. 3948 * @idle: The idle status of this_cpu.
3949 * @sd_idle: The idleness of sd 3949 * @sd_idle: The idleness of sd
3950 * @cpus: The set of CPUs under consideration for load-balancing. 3950 * @cpus: The set of CPUs under consideration for load-balancing.
3951 * @balance: Pointer to a variable indicating if this_cpu 3951 * @balance: Pointer to a variable indicating if this_cpu
3952 * is the appropriate cpu to perform load balancing at this_level. 3952 * is the appropriate cpu to perform load balancing at this_level.
3953 * 3953 *
3954 * Returns: - the busiest group if imbalance exists. 3954 * Returns: - the busiest group if imbalance exists.
3955 * - If no imbalance and user has opted for power-savings balance, 3955 * - If no imbalance and user has opted for power-savings balance,
3956 * return the least loaded group whose CPUs can be 3956 * return the least loaded group whose CPUs can be
3957 * put to idle by rebalancing its tasks onto our group. 3957 * put to idle by rebalancing its tasks onto our group.
3958 */ 3958 */
3959 static struct sched_group * 3959 static struct sched_group *
3960 find_busiest_group(struct sched_domain *sd, int this_cpu, 3960 find_busiest_group(struct sched_domain *sd, int this_cpu,
3961 unsigned long *imbalance, enum cpu_idle_type idle, 3961 unsigned long *imbalance, enum cpu_idle_type idle,
3962 int *sd_idle, const struct cpumask *cpus, int *balance) 3962 int *sd_idle, const struct cpumask *cpus, int *balance)
3963 { 3963 {
3964 struct sd_lb_stats sds; 3964 struct sd_lb_stats sds;
3965 3965
3966 memset(&sds, 0, sizeof(sds)); 3966 memset(&sds, 0, sizeof(sds));
3967 3967
3968 /* 3968 /*
3969 * Compute the various statistics relevant for load balancing at 3969 * Compute the various statistics relevant for load balancing at
3970 * this level. 3970 * this level.
3971 */ 3971 */
3972 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, 3972 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
3973 balance, &sds); 3973 balance, &sds);
3974 3974
3975 /* Cases where imbalance does not exist from POV of this_cpu */ 3975 /* Cases where imbalance does not exist from POV of this_cpu */
3976 /* 1) this_cpu is not the appropriate cpu to perform load balancing 3976 /* 1) this_cpu is not the appropriate cpu to perform load balancing
3977 * at this level. 3977 * at this level.
3978 * 2) There is no busy sibling group to pull from. 3978 * 2) There is no busy sibling group to pull from.
3979 * 3) This group is the busiest group. 3979 * 3) This group is the busiest group.
3980 * 4) This group is busier than the avg busyness at this 3980 * 4) This group is busier than the avg busyness at this
3981 * sched_domain. 3981 * sched_domain.
3982 * 5) The imbalance is within the specified limit. 3982 * 5) The imbalance is within the specified limit.
3983 * 6) Any rebalance would lead to ping-pong 3983 * 6) Any rebalance would lead to ping-pong
3984 */ 3984 */
3985 if (balance && !(*balance)) 3985 if (balance && !(*balance))
3986 goto ret; 3986 goto ret;
3987 3987
3988 if (!sds.busiest || sds.busiest_nr_running == 0) 3988 if (!sds.busiest || sds.busiest_nr_running == 0)
3989 goto out_balanced; 3989 goto out_balanced;
3990 3990
3991 if (sds.this_load >= sds.max_load) 3991 if (sds.this_load >= sds.max_load)
3992 goto out_balanced; 3992 goto out_balanced;
3993 3993
3994 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; 3994 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
3995 3995
3996 if (sds.this_load >= sds.avg_load) 3996 if (sds.this_load >= sds.avg_load)
3997 goto out_balanced; 3997 goto out_balanced;
3998 3998
3999 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 3999 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
4000 goto out_balanced; 4000 goto out_balanced;
4001 4001
4002 sds.busiest_load_per_task /= sds.busiest_nr_running; 4002 sds.busiest_load_per_task /= sds.busiest_nr_running;
4003 if (sds.group_imb) 4003 if (sds.group_imb)
4004 sds.busiest_load_per_task = 4004 sds.busiest_load_per_task =
4005 min(sds.busiest_load_per_task, sds.avg_load); 4005 min(sds.busiest_load_per_task, sds.avg_load);
4006 4006
4007 /* 4007 /*
4008 * We're trying to get all the cpus to the average_load, so we don't 4008 * We're trying to get all the cpus to the average_load, so we don't
4009 * want to push ourselves above the average load, nor do we wish to 4009 * want to push ourselves above the average load, nor do we wish to
4010 * reduce the max loaded cpu below the average load, as either of these 4010 * reduce the max loaded cpu below the average load, as either of these
4011 * actions would just result in more rebalancing later, and ping-pong 4011 * actions would just result in more rebalancing later, and ping-pong
4012 * tasks around. Thus we look for the minimum possible imbalance. 4012 * tasks around. Thus we look for the minimum possible imbalance.
4013 * Negative imbalances (*we* are more loaded than anyone else) will 4013 * Negative imbalances (*we* are more loaded than anyone else) will
4014 * be counted as no imbalance for these purposes -- we can't fix that 4014 * be counted as no imbalance for these purposes -- we can't fix that
4015 * by pulling tasks to us. Be careful of negative numbers as they'll 4015 * by pulling tasks to us. Be careful of negative numbers as they'll
4016 * appear as very large values with unsigned longs. 4016 * appear as very large values with unsigned longs.
4017 */ 4017 */
4018 if (sds.max_load <= sds.busiest_load_per_task) 4018 if (sds.max_load <= sds.busiest_load_per_task)
4019 goto out_balanced; 4019 goto out_balanced;
4020 4020
4021 /* Looks like there is an imbalance. Compute it */ 4021 /* Looks like there is an imbalance. Compute it */
4022 calculate_imbalance(&sds, this_cpu, imbalance); 4022 calculate_imbalance(&sds, this_cpu, imbalance);
4023 return sds.busiest; 4023 return sds.busiest;
4024 4024
4025 out_balanced: 4025 out_balanced:
4026 /* 4026 /*
4027 * There is no obvious imbalance. But check if we can do some balancing 4027 * There is no obvious imbalance. But check if we can do some balancing
4028 * to save power. 4028 * to save power.
4029 */ 4029 */
4030 if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) 4030 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4031 return sds.busiest; 4031 return sds.busiest;
4032 ret: 4032 ret:
4033 *imbalance = 0; 4033 *imbalance = 0;
4034 return NULL; 4034 return NULL;
4035 } 4035 }
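To give the imbalance_pct test above a concrete reading: with an illustrative imbalance_pct of 125 (the real value is configured per sched_domain elsewhere and does not appear in this hunk), the check 100 * max_load <= imbalance_pct * this_load jumps to out_balanced unless the busiest group is more than 25% busier than this_cpu's group, so small load differences never trigger task pulls.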
4036 4036
4037 /* 4037 /*
4038 * find_busiest_queue - find the busiest runqueue among the cpus in group. 4038 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4039 */ 4039 */
4040 static struct rq * 4040 static struct rq *
4041 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 4041 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4042 unsigned long imbalance, const struct cpumask *cpus) 4042 unsigned long imbalance, const struct cpumask *cpus)
4043 { 4043 {
4044 struct rq *busiest = NULL, *rq; 4044 struct rq *busiest = NULL, *rq;
4045 unsigned long max_load = 0; 4045 unsigned long max_load = 0;
4046 int i; 4046 int i;
4047 4047
4048 for_each_cpu(i, sched_group_cpus(group)) { 4048 for_each_cpu(i, sched_group_cpus(group)) {
4049 unsigned long power = power_of(i); 4049 unsigned long power = power_of(i);
4050 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 4050 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4051 unsigned long wl; 4051 unsigned long wl;
4052 4052
4053 if (!cpumask_test_cpu(i, cpus)) 4053 if (!cpumask_test_cpu(i, cpus))
4054 continue; 4054 continue;
4055 4055
4056 rq = cpu_rq(i); 4056 rq = cpu_rq(i);
4057 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE; 4057 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4058 wl /= power; 4058 wl /= power;
4059 4059
4060 if (capacity && rq->nr_running == 1 && wl > imbalance) 4060 if (capacity && rq->nr_running == 1 && wl > imbalance)
4061 continue; 4061 continue;
4062 4062
4063 if (wl > max_load) { 4063 if (wl > max_load) {
4064 max_load = wl; 4064 max_load = wl;
4065 busiest = rq; 4065 busiest = rq;
4066 } 4066 }
4067 } 4067 }
4068 4068
4069 return busiest; 4069 return busiest;
4070 } 4070 }
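find_busiest_queue() compares runqueues on a capacity-relative scale, wl = weighted_cpuload(i) * SCHED_LOAD_SCALE / power. Taking SCHED_LOAD_SCALE as 1024 and purely illustrative loads, a weighted load of 2048 on a cpu with power 1024 scores 2048, while the same load on a cpu with power 512 scores 4096, so the lower-capacity cpu is treated as the busier one, as intended.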
4071 4071
4072 /* 4072 /*
4073 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but 4073 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
4074 * any value works so long as it is large enough. 4074 * any value works so long as it is large enough.
4075 */ 4075 */
4076 #define MAX_PINNED_INTERVAL 512 4076 #define MAX_PINNED_INTERVAL 512
4077 4077
4078 /* Working cpumask for load_balance and load_balance_newidle. */ 4078 /* Working cpumask for load_balance and load_balance_newidle. */
4079 static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4079 static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4080 4080
4081 /* 4081 /*
4082 * Check this_cpu to ensure it is balanced within domain. Attempt to move 4082 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4083 * tasks if there is an imbalance. 4083 * tasks if there is an imbalance.
4084 */ 4084 */
4085 static int load_balance(int this_cpu, struct rq *this_rq, 4085 static int load_balance(int this_cpu, struct rq *this_rq,
4086 struct sched_domain *sd, enum cpu_idle_type idle, 4086 struct sched_domain *sd, enum cpu_idle_type idle,
4087 int *balance) 4087 int *balance)
4088 { 4088 {
4089 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 4089 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4090 struct sched_group *group; 4090 struct sched_group *group;
4091 unsigned long imbalance; 4091 unsigned long imbalance;
4092 struct rq *busiest; 4092 struct rq *busiest;
4093 unsigned long flags; 4093 unsigned long flags;
4094 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4094 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4095 4095
4096 cpumask_setall(cpus); 4096 cpumask_setall(cpus);
4097 4097
4098 /* 4098 /*
4099 * When power savings policy is enabled for the parent domain, idle 4099 * When power savings policy is enabled for the parent domain, idle
4100 * sibling can pick up load irrespective of busy siblings. In this case, 4100 * sibling can pick up load irrespective of busy siblings. In this case,
4101 * let the state of idle sibling percolate up as CPU_IDLE, instead of 4101 * let the state of idle sibling percolate up as CPU_IDLE, instead of
4102 * portraying it as CPU_NOT_IDLE. 4102 * portraying it as CPU_NOT_IDLE.
4103 */ 4103 */
4104 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && 4104 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4105 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 4105 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4106 sd_idle = 1; 4106 sd_idle = 1;
4107 4107
4108 schedstat_inc(sd, lb_count[idle]); 4108 schedstat_inc(sd, lb_count[idle]);
4109 4109
4110 redo: 4110 redo:
4111 update_shares(sd); 4111 update_shares(sd);
4112 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 4112 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4113 cpus, balance); 4113 cpus, balance);
4114 4114
4115 if (*balance == 0) 4115 if (*balance == 0)
4116 goto out_balanced; 4116 goto out_balanced;
4117 4117
4118 if (!group) { 4118 if (!group) {
4119 schedstat_inc(sd, lb_nobusyg[idle]); 4119 schedstat_inc(sd, lb_nobusyg[idle]);
4120 goto out_balanced; 4120 goto out_balanced;
4121 } 4121 }
4122 4122
4123 busiest = find_busiest_queue(group, idle, imbalance, cpus); 4123 busiest = find_busiest_queue(group, idle, imbalance, cpus);
4124 if (!busiest) { 4124 if (!busiest) {
4125 schedstat_inc(sd, lb_nobusyq[idle]); 4125 schedstat_inc(sd, lb_nobusyq[idle]);
4126 goto out_balanced; 4126 goto out_balanced;
4127 } 4127 }
4128 4128
4129 BUG_ON(busiest == this_rq); 4129 BUG_ON(busiest == this_rq);
4130 4130
4131 schedstat_add(sd, lb_imbalance[idle], imbalance); 4131 schedstat_add(sd, lb_imbalance[idle], imbalance);
4132 4132
4133 ld_moved = 0; 4133 ld_moved = 0;
4134 if (busiest->nr_running > 1) { 4134 if (busiest->nr_running > 1) {
4135 /* 4135 /*
4136 * Attempt to move tasks. If find_busiest_group has found 4136 * Attempt to move tasks. If find_busiest_group has found
4137 * an imbalance but busiest->nr_running <= 1, the group is 4137 * an imbalance but busiest->nr_running <= 1, the group is
4138 * still unbalanced. ld_moved simply stays zero, so it is 4138 * still unbalanced. ld_moved simply stays zero, so it is
4139 * correctly treated as an imbalance. 4139 * correctly treated as an imbalance.
4140 */ 4140 */
4141 local_irq_save(flags); 4141 local_irq_save(flags);
4142 double_rq_lock(this_rq, busiest); 4142 double_rq_lock(this_rq, busiest);
4143 ld_moved = move_tasks(this_rq, this_cpu, busiest, 4143 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4144 imbalance, sd, idle, &all_pinned); 4144 imbalance, sd, idle, &all_pinned);
4145 double_rq_unlock(this_rq, busiest); 4145 double_rq_unlock(this_rq, busiest);
4146 local_irq_restore(flags); 4146 local_irq_restore(flags);
4147 4147
4148 /* 4148 /*
4149 * some other cpu did the load balance for us. 4149 * some other cpu did the load balance for us.
4150 */ 4150 */
4151 if (ld_moved && this_cpu != smp_processor_id()) 4151 if (ld_moved && this_cpu != smp_processor_id())
4152 resched_cpu(this_cpu); 4152 resched_cpu(this_cpu);
4153 4153
4154 /* All tasks on this runqueue were pinned by CPU affinity */ 4154 /* All tasks on this runqueue were pinned by CPU affinity */
4155 if (unlikely(all_pinned)) { 4155 if (unlikely(all_pinned)) {
4156 cpumask_clear_cpu(cpu_of(busiest), cpus); 4156 cpumask_clear_cpu(cpu_of(busiest), cpus);
4157 if (!cpumask_empty(cpus)) 4157 if (!cpumask_empty(cpus))
4158 goto redo; 4158 goto redo;
4159 goto out_balanced; 4159 goto out_balanced;
4160 } 4160 }
4161 } 4161 }
4162 4162
4163 if (!ld_moved) { 4163 if (!ld_moved) {
4164 schedstat_inc(sd, lb_failed[idle]); 4164 schedstat_inc(sd, lb_failed[idle]);
4165 sd->nr_balance_failed++; 4165 sd->nr_balance_failed++;
4166 4166
4167 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 4167 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4168 4168
4169 spin_lock_irqsave(&busiest->lock, flags); 4169 spin_lock_irqsave(&busiest->lock, flags);
4170 4170
4171 /* don't kick the migration_thread if the curr 4171 /* don't kick the migration_thread if the curr
4172 * task on busiest cpu can't be moved to this_cpu 4172 * task on busiest cpu can't be moved to this_cpu
4173 */ 4173 */
4174 if (!cpumask_test_cpu(this_cpu, 4174 if (!cpumask_test_cpu(this_cpu,
4175 &busiest->curr->cpus_allowed)) { 4175 &busiest->curr->cpus_allowed)) {
4176 spin_unlock_irqrestore(&busiest->lock, flags); 4176 spin_unlock_irqrestore(&busiest->lock, flags);
4177 all_pinned = 1; 4177 all_pinned = 1;
4178 goto out_one_pinned; 4178 goto out_one_pinned;
4179 } 4179 }
4180 4180
4181 if (!busiest->active_balance) { 4181 if (!busiest->active_balance) {
4182 busiest->active_balance = 1; 4182 busiest->active_balance = 1;
4183 busiest->push_cpu = this_cpu; 4183 busiest->push_cpu = this_cpu;
4184 active_balance = 1; 4184 active_balance = 1;
4185 } 4185 }
4186 spin_unlock_irqrestore(&busiest->lock, flags); 4186 spin_unlock_irqrestore(&busiest->lock, flags);
4187 if (active_balance) 4187 if (active_balance)
4188 wake_up_process(busiest->migration_thread); 4188 wake_up_process(busiest->migration_thread);
4189 4189
4190 /* 4190 /*
4191 * We've kicked active balancing, reset the failure 4191 * We've kicked active balancing, reset the failure
4192 * counter. 4192 * counter.
4193 */ 4193 */
4194 sd->nr_balance_failed = sd->cache_nice_tries+1; 4194 sd->nr_balance_failed = sd->cache_nice_tries+1;
4195 } 4195 }
4196 } else 4196 } else
4197 sd->nr_balance_failed = 0; 4197 sd->nr_balance_failed = 0;
4198 4198
4199 if (likely(!active_balance)) { 4199 if (likely(!active_balance)) {
4200 /* We were unbalanced, so reset the balancing interval */ 4200 /* We were unbalanced, so reset the balancing interval */
4201 sd->balance_interval = sd->min_interval; 4201 sd->balance_interval = sd->min_interval;
4202 } else { 4202 } else {
4203 /* 4203 /*
4204 * If we've begun active balancing, start to back off. This 4204 * If we've begun active balancing, start to back off. This
4205 * case may not be covered by the all_pinned logic if there 4205 * case may not be covered by the all_pinned logic if there
4206 * is only 1 task on the busy runqueue (because we don't call 4206 * is only 1 task on the busy runqueue (because we don't call
4207 * move_tasks). 4207 * move_tasks).
4208 */ 4208 */
4209 if (sd->balance_interval < sd->max_interval) 4209 if (sd->balance_interval < sd->max_interval)
4210 sd->balance_interval *= 2; 4210 sd->balance_interval *= 2;
4211 } 4211 }
4212 4212
4213 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && 4213 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4214 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 4214 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4215 ld_moved = -1; 4215 ld_moved = -1;
4216 4216
4217 goto out; 4217 goto out;
4218 4218
4219 out_balanced: 4219 out_balanced:
4220 schedstat_inc(sd, lb_balanced[idle]); 4220 schedstat_inc(sd, lb_balanced[idle]);
4221 4221
4222 sd->nr_balance_failed = 0; 4222 sd->nr_balance_failed = 0;
4223 4223
4224 out_one_pinned: 4224 out_one_pinned:
4225 /* tune up the balancing interval */ 4225 /* tune up the balancing interval */
4226 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || 4226 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4227 (sd->balance_interval < sd->max_interval)) 4227 (sd->balance_interval < sd->max_interval))
4228 sd->balance_interval *= 2; 4228 sd->balance_interval *= 2;
4229 4229
4230 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 4230 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4231 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 4231 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4232 ld_moved = -1; 4232 ld_moved = -1;
4233 else 4233 else
4234 ld_moved = 0; 4234 ld_moved = 0;
4235 out: 4235 out:
4236 if (ld_moved) 4236 if (ld_moved)
4237 update_shares(sd); 4237 update_shares(sd);
4238 return ld_moved; 4238 return ld_moved;
4239 } 4239 }
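Worth noting about the backoff above: out_one_pinned doubles sd->balance_interval (in milliseconds) while it is still below sd->max_interval, or below MAX_PINNED_INTERVAL (512) when every candidate task was pinned. With an illustrative starting interval of 8 ms, repeated pinned failures step through 16, 32, 64, ... up to the cap, so a runqueue full of affinity-pinned tasks is reprobed progressively less often.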
4240 4240
4241 /* 4241 /*
4242 * Check this_cpu to ensure it is balanced within domain. Attempt to move 4242 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4243 * tasks if there is an imbalance. 4243 * tasks if there is an imbalance.
4244 * 4244 *
4245 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). 4245 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4246 * this_rq is locked. 4246 * this_rq is locked.
4247 */ 4247 */
4248 static int 4248 static int
4249 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) 4249 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4250 { 4250 {
4251 struct sched_group *group; 4251 struct sched_group *group;
4252 struct rq *busiest = NULL; 4252 struct rq *busiest = NULL;
4253 unsigned long imbalance; 4253 unsigned long imbalance;
4254 int ld_moved = 0; 4254 int ld_moved = 0;
4255 int sd_idle = 0; 4255 int sd_idle = 0;
4256 int all_pinned = 0; 4256 int all_pinned = 0;
4257 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4257 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4258 4258
4259 cpumask_setall(cpus); 4259 cpumask_setall(cpus);
4260 4260
4261 /* 4261 /*
4262 * When power savings policy is enabled for the parent domain, idle 4262 * When power savings policy is enabled for the parent domain, idle
4263 * sibling can pick up load irrespective of busy siblings. In this case, 4263 * sibling can pick up load irrespective of busy siblings. In this case,
4264 * let the state of idle sibling percolate up as IDLE, instead of 4264 * let the state of idle sibling percolate up as IDLE, instead of
4265 * portraying it as CPU_NOT_IDLE. 4265 * portraying it as CPU_NOT_IDLE.
4266 */ 4266 */
4267 if (sd->flags & SD_SHARE_CPUPOWER && 4267 if (sd->flags & SD_SHARE_CPUPOWER &&
4268 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 4268 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4269 sd_idle = 1; 4269 sd_idle = 1;
4270 4270
4271 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); 4271 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4272 redo: 4272 redo:
4273 update_shares_locked(this_rq, sd); 4273 update_shares_locked(this_rq, sd);
4274 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, 4274 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4275 &sd_idle, cpus, NULL); 4275 &sd_idle, cpus, NULL);
4276 if (!group) { 4276 if (!group) {
4277 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); 4277 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4278 goto out_balanced; 4278 goto out_balanced;
4279 } 4279 }
4280 4280
4281 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus); 4281 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4282 if (!busiest) { 4282 if (!busiest) {
4283 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); 4283 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4284 goto out_balanced; 4284 goto out_balanced;
4285 } 4285 }
4286 4286
4287 BUG_ON(busiest == this_rq); 4287 BUG_ON(busiest == this_rq);
4288 4288
4289 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); 4289 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4290 4290
4291 ld_moved = 0; 4291 ld_moved = 0;
4292 if (busiest->nr_running > 1) { 4292 if (busiest->nr_running > 1) {
4293 /* Attempt to move tasks */ 4293 /* Attempt to move tasks */
4294 double_lock_balance(this_rq, busiest); 4294 double_lock_balance(this_rq, busiest);
4295 /* this_rq->clock is already updated */ 4295 /* this_rq->clock is already updated */
4296 update_rq_clock(busiest); 4296 update_rq_clock(busiest);
4297 ld_moved = move_tasks(this_rq, this_cpu, busiest, 4297 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4298 imbalance, sd, CPU_NEWLY_IDLE, 4298 imbalance, sd, CPU_NEWLY_IDLE,
4299 &all_pinned); 4299 &all_pinned);
4300 double_unlock_balance(this_rq, busiest); 4300 double_unlock_balance(this_rq, busiest);
4301 4301
4302 if (unlikely(all_pinned)) { 4302 if (unlikely(all_pinned)) {
4303 cpumask_clear_cpu(cpu_of(busiest), cpus); 4303 cpumask_clear_cpu(cpu_of(busiest), cpus);
4304 if (!cpumask_empty(cpus)) 4304 if (!cpumask_empty(cpus))
4305 goto redo; 4305 goto redo;
4306 } 4306 }
4307 } 4307 }
4308 4308
4309 if (!ld_moved) { 4309 if (!ld_moved) {
4310 int active_balance = 0; 4310 int active_balance = 0;
4311 4311
4312 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); 4312 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4313 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 4313 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4314 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 4314 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4315 return -1; 4315 return -1;
4316 4316
4317 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) 4317 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4318 return -1; 4318 return -1;
4319 4319
4320 if (sd->nr_balance_failed++ < 2) 4320 if (sd->nr_balance_failed++ < 2)
4321 return -1; 4321 return -1;
4322 4322
4323 /* 4323 /*
4324 * The only task running on a non-idle cpu can be moved to this 4324 * The only task running on a non-idle cpu can be moved to this
4325 * cpu in an attempt to completely free up the other CPU 4325 * cpu in an attempt to completely free up the other CPU
4326 * package. The same method used to move tasks in load_balance() 4326 * package. The same method used to move tasks in load_balance()
4327 * has been extended for load_balance_newidle() to speed up 4327 * has been extended for load_balance_newidle() to speed up
4328 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2) 4328 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
4329 * 4329 *
4330 * The package power saving logic comes from 4330 * The package power saving logic comes from
4331 * find_busiest_group(). If there is no imbalance, then 4331 * find_busiest_group(). If there is no imbalance, then
4332 * f_b_g() will return NULL. However when sched_mc={1,2} then 4332 * f_b_g() will return NULL. However when sched_mc={1,2} then
4333 * f_b_g() will select a group from which a running task may be 4333 * f_b_g() will select a group from which a running task may be
4334 * pulled to this cpu in order to make the other package idle. 4334 * pulled to this cpu in order to make the other package idle.
4335 * If there is no opportunity to make a package idle and if 4335 * If there is no opportunity to make a package idle and if
4336 * there is no imbalance, then f_b_g() will return NULL and no 4336 * there is no imbalance, then f_b_g() will return NULL and no
4337 * action will be taken in load_balance_newidle(). 4337 * action will be taken in load_balance_newidle().
4338 * 4338 *
4339 * Under normal task pull operation due to imbalance, there 4339 * Under normal task pull operation due to imbalance, there
4340 * will be more than one task in the source run queue and 4340 * will be more than one task in the source run queue and
4341 * move_tasks() will succeed. ld_moved will be true and this 4341 * move_tasks() will succeed. ld_moved will be true and this
4342 * active balance code will not be triggered. 4342 * active balance code will not be triggered.
4343 */ 4343 */
4344 4344
4345 /* Lock busiest in correct order while this_rq is held */ 4345 /* Lock busiest in correct order while this_rq is held */
4346 double_lock_balance(this_rq, busiest); 4346 double_lock_balance(this_rq, busiest);
4347 4347
4348 /* 4348 /*
4349 * don't kick the migration_thread if the curr 4349 * don't kick the migration_thread if the curr
4350 * task on busiest cpu can't be moved to this_cpu 4350 * task on busiest cpu can't be moved to this_cpu
4351 */ 4351 */
4352 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { 4352 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4353 double_unlock_balance(this_rq, busiest); 4353 double_unlock_balance(this_rq, busiest);
4354 all_pinned = 1; 4354 all_pinned = 1;
4355 return ld_moved; 4355 return ld_moved;
4356 } 4356 }
4357 4357
4358 if (!busiest->active_balance) { 4358 if (!busiest->active_balance) {
4359 busiest->active_balance = 1; 4359 busiest->active_balance = 1;
4360 busiest->push_cpu = this_cpu; 4360 busiest->push_cpu = this_cpu;
4361 active_balance = 1; 4361 active_balance = 1;
4362 } 4362 }
4363 4363
4364 double_unlock_balance(this_rq, busiest); 4364 double_unlock_balance(this_rq, busiest);
4365 /* 4365 /*
4366 * Should not call ttwu while holding a rq->lock 4366 * Should not call ttwu while holding a rq->lock
4367 */ 4367 */
4368 spin_unlock(&this_rq->lock); 4368 spin_unlock(&this_rq->lock);
4369 if (active_balance) 4369 if (active_balance)
4370 wake_up_process(busiest->migration_thread); 4370 wake_up_process(busiest->migration_thread);
4371 spin_lock(&this_rq->lock); 4371 spin_lock(&this_rq->lock);
4372 4372
4373 } else 4373 } else
4374 sd->nr_balance_failed = 0; 4374 sd->nr_balance_failed = 0;
4375 4375
4376 update_shares_locked(this_rq, sd); 4376 update_shares_locked(this_rq, sd);
4377 return ld_moved; 4377 return ld_moved;
4378 4378
4379 out_balanced: 4379 out_balanced:
4380 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); 4380 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4381 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 4381 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4382 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 4382 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4383 return -1; 4383 return -1;
4384 sd->nr_balance_failed = 0; 4384 sd->nr_balance_failed = 0;
4385 4385
4386 return 0; 4386 return 0;
4387 } 4387 }
4388 4388
4389 /* 4389 /*
4390 * idle_balance is called by schedule() if this_cpu is about to become 4390 * idle_balance is called by schedule() if this_cpu is about to become
4391 * idle. Attempts to pull tasks from other CPUs. 4391 * idle. Attempts to pull tasks from other CPUs.
4392 */ 4392 */
4393 static void idle_balance(int this_cpu, struct rq *this_rq) 4393 static void idle_balance(int this_cpu, struct rq *this_rq)
4394 { 4394 {
4395 struct sched_domain *sd; 4395 struct sched_domain *sd;
4396 int pulled_task = 0; 4396 int pulled_task = 0;
4397 unsigned long next_balance = jiffies + HZ; 4397 unsigned long next_balance = jiffies + HZ;
4398 4398
4399 for_each_domain(this_cpu, sd) { 4399 for_each_domain(this_cpu, sd) {
4400 unsigned long interval; 4400 unsigned long interval;
4401 4401
4402 if (!(sd->flags & SD_LOAD_BALANCE)) 4402 if (!(sd->flags & SD_LOAD_BALANCE))
4403 continue; 4403 continue;
4404 4404
4405 if (sd->flags & SD_BALANCE_NEWIDLE) 4405 if (sd->flags & SD_BALANCE_NEWIDLE)
4406 /* If we've pulled tasks over, stop searching: */ 4406 /* If we've pulled tasks over, stop searching: */
4407 pulled_task = load_balance_newidle(this_cpu, this_rq, 4407 pulled_task = load_balance_newidle(this_cpu, this_rq,
4408 sd); 4408 sd);
4409 4409
4410 interval = msecs_to_jiffies(sd->balance_interval); 4410 interval = msecs_to_jiffies(sd->balance_interval);
4411 if (time_after(next_balance, sd->last_balance + interval)) 4411 if (time_after(next_balance, sd->last_balance + interval))
4412 next_balance = sd->last_balance + interval; 4412 next_balance = sd->last_balance + interval;
4413 if (pulled_task) 4413 if (pulled_task)
4414 break; 4414 break;
4415 } 4415 }
4416 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 4416 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4417 /* 4417 /*
4418 * We are going idle. next_balance may be set based on 4418 * We are going idle. next_balance may be set based on
4419 * a busy processor. So reset next_balance. 4419 * a busy processor. So reset next_balance.
4420 */ 4420 */
4421 this_rq->next_balance = next_balance; 4421 this_rq->next_balance = next_balance;
4422 } 4422 }
4423 } 4423 }
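idle_balance() keeps next_balance as the minimum of sd->last_balance + interval over the domains it walks, starting from a default of jiffies + HZ. With illustrative deadlines of jiffies + 4 and jiffies + 16 for two domains, next_balance ends up at jiffies + 4, and this_rq->next_balance is reset to it when a task was pulled or when the old value (possibly set while busy) has already expired.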
4424 4424
4425 /* 4425 /*
4426 * active_load_balance is run by migration threads. It pushes running tasks 4426 * active_load_balance is run by migration threads. It pushes running tasks
4427 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be 4427 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4428 * running on each physical CPU where possible, and avoids physical / 4428 * running on each physical CPU where possible, and avoids physical /
4429 * logical imbalances. 4429 * logical imbalances.
4430 * 4430 *
4431 * Called with busiest_rq locked. 4431 * Called with busiest_rq locked.
4432 */ 4432 */
4433 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) 4433 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4434 { 4434 {
4435 int target_cpu = busiest_rq->push_cpu; 4435 int target_cpu = busiest_rq->push_cpu;
4436 struct sched_domain *sd; 4436 struct sched_domain *sd;
4437 struct rq *target_rq; 4437 struct rq *target_rq;
4438 4438
4439 /* Is there any task to move? */ 4439 /* Is there any task to move? */
4440 if (busiest_rq->nr_running <= 1) 4440 if (busiest_rq->nr_running <= 1)
4441 return; 4441 return;
4442 4442
4443 target_rq = cpu_rq(target_cpu); 4443 target_rq = cpu_rq(target_cpu);
4444 4444
4445 /* 4445 /*
4446 * This condition is "impossible"; if it occurs 4446 * This condition is "impossible"; if it occurs
4447 * we need to fix it. Originally reported by 4447 * we need to fix it. Originally reported by
4448 * Bjorn Helgaas on a 128-cpu setup. 4448 * Bjorn Helgaas on a 128-cpu setup.
4449 */ 4449 */
4450 BUG_ON(busiest_rq == target_rq); 4450 BUG_ON(busiest_rq == target_rq);
4451 4451
4452 /* move a task from busiest_rq to target_rq */ 4452 /* move a task from busiest_rq to target_rq */
4453 double_lock_balance(busiest_rq, target_rq); 4453 double_lock_balance(busiest_rq, target_rq);
4454 update_rq_clock(busiest_rq); 4454 update_rq_clock(busiest_rq);
4455 update_rq_clock(target_rq); 4455 update_rq_clock(target_rq);
4456 4456
4457 /* Search for an sd spanning us and the target CPU. */ 4457 /* Search for an sd spanning us and the target CPU. */
4458 for_each_domain(target_cpu, sd) { 4458 for_each_domain(target_cpu, sd) {
4459 if ((sd->flags & SD_LOAD_BALANCE) && 4459 if ((sd->flags & SD_LOAD_BALANCE) &&
4460 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) 4460 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4461 break; 4461 break;
4462 } 4462 }
4463 4463
4464 if (likely(sd)) { 4464 if (likely(sd)) {
4465 schedstat_inc(sd, alb_count); 4465 schedstat_inc(sd, alb_count);
4466 4466
4467 if (move_one_task(target_rq, target_cpu, busiest_rq, 4467 if (move_one_task(target_rq, target_cpu, busiest_rq,
4468 sd, CPU_IDLE)) 4468 sd, CPU_IDLE))
4469 schedstat_inc(sd, alb_pushed); 4469 schedstat_inc(sd, alb_pushed);
4470 else 4470 else
4471 schedstat_inc(sd, alb_failed); 4471 schedstat_inc(sd, alb_failed);
4472 } 4472 }
4473 double_unlock_balance(busiest_rq, target_rq); 4473 double_unlock_balance(busiest_rq, target_rq);
4474 } 4474 }
4475 4475
4476 #ifdef CONFIG_NO_HZ 4476 #ifdef CONFIG_NO_HZ
4477 static struct { 4477 static struct {
4478 atomic_t load_balancer; 4478 atomic_t load_balancer;
4479 cpumask_var_t cpu_mask; 4479 cpumask_var_t cpu_mask;
4480 cpumask_var_t ilb_grp_nohz_mask; 4480 cpumask_var_t ilb_grp_nohz_mask;
4481 } nohz ____cacheline_aligned = { 4481 } nohz ____cacheline_aligned = {
4482 .load_balancer = ATOMIC_INIT(-1), 4482 .load_balancer = ATOMIC_INIT(-1),
4483 }; 4483 };
4484 4484
4485 int get_nohz_load_balancer(void) 4485 int get_nohz_load_balancer(void)
4486 { 4486 {
4487 return atomic_read(&nohz.load_balancer); 4487 return atomic_read(&nohz.load_balancer);
4488 } 4488 }
4489 4489
4490 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 4490 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4491 /** 4491 /**
4492 * lowest_flag_domain - Return lowest sched_domain containing flag. 4492 * lowest_flag_domain - Return lowest sched_domain containing flag.
4493 * @cpu: The cpu whose lowest level of sched domain is to 4493 * @cpu: The cpu whose lowest level of sched domain is to
4494 * be returned. 4494 * be returned.
4495 * @flag: The flag to check for the lowest sched_domain 4495 * @flag: The flag to check for the lowest sched_domain
4496 * for the given cpu. 4496 * for the given cpu.
4497 * 4497 *
4498 * Returns the lowest sched_domain of a cpu which contains the given flag. 4498 * Returns the lowest sched_domain of a cpu which contains the given flag.
4499 */ 4499 */
4500 static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) 4500 static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4501 { 4501 {
4502 struct sched_domain *sd; 4502 struct sched_domain *sd;
4503 4503
4504 for_each_domain(cpu, sd) 4504 for_each_domain(cpu, sd)
4505 if (sd && (sd->flags & flag)) 4505 if (sd && (sd->flags & flag))
4506 break; 4506 break;
4507 4507
4508 return sd; 4508 return sd;
4509 } 4509 }
4510 4510
4511 /** 4511 /**
4512 * for_each_flag_domain - Iterates over sched_domains containing the flag. 4512 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4513 * @cpu: The cpu whose domains we're iterating over. 4513 * @cpu: The cpu whose domains we're iterating over.
4514 * @sd: variable holding the value of the power_savings_sd 4514 * @sd: variable holding the value of the power_savings_sd
4515 * for cpu. 4515 * for cpu.
4516 * @flag: The flag to filter the sched_domains to be iterated. 4516 * @flag: The flag to filter the sched_domains to be iterated.
4517 * 4517 *
4518 * Iterates over all the scheduler domains of a given cpu that have the 'flag' 4518 * Iterates over all the scheduler domains of a given cpu that have the 'flag'
4519 * set, starting from the lowest sched_domain to the highest. 4519 * set, starting from the lowest sched_domain to the highest.
4520 */ 4520 */
4521 #define for_each_flag_domain(cpu, sd, flag) \ 4521 #define for_each_flag_domain(cpu, sd, flag) \
4522 for (sd = lowest_flag_domain(cpu, flag); \ 4522 for (sd = lowest_flag_domain(cpu, flag); \
4523 (sd && (sd->flags & flag)); sd = sd->parent) 4523 (sd && (sd->flags & flag)); sd = sd->parent)
4524 4524
4525 /** 4525 /**
4526 * is_semi_idle_group - Checks if the given sched_group is semi-idle. 4526 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4527 * @ilb_group: group to be checked for semi-idleness 4527 * @ilb_group: group to be checked for semi-idleness
4528 * 4528 *
4529 * Returns: 1 if the group is semi-idle. 0 otherwise. 4529 * Returns: 1 if the group is semi-idle. 0 otherwise.
4530 * 4530 *
4531 * We define a sched_group to be semi-idle if it has at least one idle CPU 4531 * We define a sched_group to be semi-idle if it has at least one idle CPU
4532 * and at least one non-idle CPU. This helper function checks if the given 4532 * and at least one non-idle CPU. This helper function checks if the given
4533 * sched_group is semi-idle or not. 4533 * sched_group is semi-idle or not.
4534 */ 4534 */
4535 static inline int is_semi_idle_group(struct sched_group *ilb_group) 4535 static inline int is_semi_idle_group(struct sched_group *ilb_group)
4536 { 4536 {
4537 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, 4537 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4538 sched_group_cpus(ilb_group)); 4538 sched_group_cpus(ilb_group));
4539 4539
4540 /* 4540 /*
4541 * A sched_group is semi-idle when it has at least one busy cpu 4541 * A sched_group is semi-idle when it has at least one busy cpu
4542 * and at least one idle cpu. 4542 * and at least one idle cpu.
4543 */ 4543 */
4544 if (cpumask_empty(nohz.ilb_grp_nohz_mask)) 4544 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4545 return 0; 4545 return 0;
4546 4546
4547 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) 4547 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4548 return 0; 4548 return 0;
4549 4549
4550 return 1; 4550 return 1;
4551 } 4551 }
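A concrete, hypothetical reading of the two checks above: if ilb_group spans cpus {4,5,6,7} and nohz.cpu_mask currently holds {5,7}, the intersection {5,7} is neither empty nor equal to the whole group, so the group has both busy and tickless-idle cpus and is_semi_idle_group() returns 1. An all-busy group yields an empty intersection and an all-idle group yields an intersection equal to the group; both return 0.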
4552 /** 4552 /**
4553 * find_new_ilb - Finds the optimum idle load balancer for nomination. 4553 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4554 * @cpu: The cpu which is nominating a new idle_load_balancer. 4554 * @cpu: The cpu which is nominating a new idle_load_balancer.
4555 * 4555 *
4556 * Returns: The id of the idle load balancer if it exists; 4556 * Returns: The id of the idle load balancer if it exists;
4557 * else, a value >= nr_cpu_ids. 4557 * else, a value >= nr_cpu_ids.
4558 * 4558 *
4559 * This algorithm picks the idle load balancer such that it belongs to a 4559 * This algorithm picks the idle load balancer such that it belongs to a
4560 * semi-idle powersavings sched_domain. The idea is to avoid waking up 4560 * semi-idle powersavings sched_domain. The idea is to avoid waking up
4561 * completely idle packages/cores just for the purpose of idle load balancing 4561 * completely idle packages/cores just for the purpose of idle load balancing
4562 * when there are other idle cpus which are better suited for that job. 4562 * when there are other idle cpus which are better suited for that job.
4563 */ 4563 */
4564 static int find_new_ilb(int cpu) 4564 static int find_new_ilb(int cpu)
4565 { 4565 {
4566 struct sched_domain *sd; 4566 struct sched_domain *sd;
4567 struct sched_group *ilb_group; 4567 struct sched_group *ilb_group;
4568 4568
4569 /* 4569 /*
4570 * Have idle load balancer selection from semi-idle packages only 4570 * Have idle load balancer selection from semi-idle packages only
4571 * when power-aware load balancing is enabled 4571 * when power-aware load balancing is enabled
4572 */ 4572 */
4573 if (!(sched_smt_power_savings || sched_mc_power_savings)) 4573 if (!(sched_smt_power_savings || sched_mc_power_savings))
4574 goto out_done; 4574 goto out_done;
4575 4575
4576 /* 4576 /*
4577 * Optimize for the case when we have no idle CPUs or only one 4577 * Optimize for the case when we have no idle CPUs or only one
4578 * idle CPU. Don't walk the sched_domain hierarchy in such cases 4578 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4579 */ 4579 */
4580 if (cpumask_weight(nohz.cpu_mask) < 2) 4580 if (cpumask_weight(nohz.cpu_mask) < 2)
4581 goto out_done; 4581 goto out_done;
4582 4582
4583 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 4583 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4584 ilb_group = sd->groups; 4584 ilb_group = sd->groups;
4585 4585
4586 do { 4586 do {
4587 if (is_semi_idle_group(ilb_group)) 4587 if (is_semi_idle_group(ilb_group))
4588 return cpumask_first(nohz.ilb_grp_nohz_mask); 4588 return cpumask_first(nohz.ilb_grp_nohz_mask);
4589 4589
4590 ilb_group = ilb_group->next; 4590 ilb_group = ilb_group->next;
4591 4591
4592 } while (ilb_group != sd->groups); 4592 } while (ilb_group != sd->groups);
4593 } 4593 }
4594 4594
4595 out_done: 4595 out_done:
4596 return cpumask_first(nohz.cpu_mask); 4596 return cpumask_first(nohz.cpu_mask);
4597 } 4597 }
4598 #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 4598 #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4599 static inline int find_new_ilb(int call_cpu) 4599 static inline int find_new_ilb(int call_cpu)
4600 { 4600 {
4601 return cpumask_first(nohz.cpu_mask); 4601 return cpumask_first(nohz.cpu_mask);
4602 } 4602 }
4603 #endif 4603 #endif
4604 4604
4605 /* 4605 /*
4606 * This routine will try to nominate the ilb (idle load balancing) 4606 * This routine will try to nominate the ilb (idle load balancing)
4607 * owner among the cpus whose ticks are stopped. The ilb owner will do the idle 4607 * owner among the cpus whose ticks are stopped. The ilb owner will do the idle
4608 * load balancing on behalf of all those cpus. If all the cpus in the system 4608 * load balancing on behalf of all those cpus. If all the cpus in the system
4609 * go into this tickless mode, then there will be no ilb owner (as there is 4609 * go into this tickless mode, then there will be no ilb owner (as there is
4610 * no need for one) and all the cpus will sleep till the next wakeup event 4610 * no need for one) and all the cpus will sleep till the next wakeup event
4611 * arrives... 4611 * arrives...
4612 * 4612 *
4613 * For the ilb owner, the tick is not stopped, and this tick will be used 4613 * For the ilb owner, the tick is not stopped, and this tick will be used
4614 * for idle load balancing. The ilb owner will still be part of 4614 * for idle load balancing. The ilb owner will still be part of
4615 * nohz.cpu_mask. 4615 * nohz.cpu_mask.
4616 * 4616 *
4617 * While stopping the tick, this cpu will become the ilb owner if there 4617 * While stopping the tick, this cpu will become the ilb owner if there
4618 * is no other owner. It will remain the owner till that cpu becomes busy 4618 * is no other owner. It will remain the owner till that cpu becomes busy
4619 * or till all cpus in the system stop their ticks, at which point 4619 * or till all cpus in the system stop their ticks, at which point
4620 * there is no need for an ilb owner. 4620 * there is no need for an ilb owner.
4621 * 4621 *
4622 * When the ilb owner becomes busy, it nominates another owner, during the 4622 * When the ilb owner becomes busy, it nominates another owner, during the
4623 * next busy scheduler_tick() 4623 * next busy scheduler_tick()
4624 */ 4624 */
4625 int select_nohz_load_balancer(int stop_tick) 4625 int select_nohz_load_balancer(int stop_tick)
4626 { 4626 {
4627 int cpu = smp_processor_id(); 4627 int cpu = smp_processor_id();
4628 4628
4629 if (stop_tick) { 4629 if (stop_tick) {
4630 cpu_rq(cpu)->in_nohz_recently = 1; 4630 cpu_rq(cpu)->in_nohz_recently = 1;
4631 4631
4632 if (!cpu_active(cpu)) { 4632 if (!cpu_active(cpu)) {
4633 if (atomic_read(&nohz.load_balancer) != cpu) 4633 if (atomic_read(&nohz.load_balancer) != cpu)
4634 return 0; 4634 return 0;
4635 4635
4636 /* 4636 /*
4637 * If we are going offline and still the leader, 4637 * If we are going offline and still the leader,
4638 * give up! 4638 * give up!
4639 */ 4639 */
4640 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 4640 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4641 BUG(); 4641 BUG();
4642 4642
4643 return 0; 4643 return 0;
4644 } 4644 }
4645 4645
4646 cpumask_set_cpu(cpu, nohz.cpu_mask); 4646 cpumask_set_cpu(cpu, nohz.cpu_mask);
4647 4647
4648 /* time for ilb owner also to sleep */ 4648 /* time for ilb owner also to sleep */
4649 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { 4649 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4650 if (atomic_read(&nohz.load_balancer) == cpu) 4650 if (atomic_read(&nohz.load_balancer) == cpu)
4651 atomic_set(&nohz.load_balancer, -1); 4651 atomic_set(&nohz.load_balancer, -1);
4652 return 0; 4652 return 0;
4653 } 4653 }
4654 4654
4655 if (atomic_read(&nohz.load_balancer) == -1) { 4655 if (atomic_read(&nohz.load_balancer) == -1) {
4656 /* make me the ilb owner */ 4656 /* make me the ilb owner */
4657 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) 4657 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4658 return 1; 4658 return 1;
4659 } else if (atomic_read(&nohz.load_balancer) == cpu) { 4659 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4660 int new_ilb; 4660 int new_ilb;
4661 4661
4662 if (!(sched_smt_power_savings || 4662 if (!(sched_smt_power_savings ||
4663 sched_mc_power_savings)) 4663 sched_mc_power_savings))
4664 return 1; 4664 return 1;
4665 /* 4665 /*
4666 * Check to see if there is a more power-efficient 4666 * Check to see if there is a more power-efficient
4667 * ilb. 4667 * ilb.
4668 */ 4668 */
4669 new_ilb = find_new_ilb(cpu); 4669 new_ilb = find_new_ilb(cpu);
4670 if (new_ilb < nr_cpu_ids && new_ilb != cpu) { 4670 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4671 atomic_set(&nohz.load_balancer, -1); 4671 atomic_set(&nohz.load_balancer, -1);
4672 resched_cpu(new_ilb); 4672 resched_cpu(new_ilb);
4673 return 0; 4673 return 0;
4674 } 4674 }
4675 return 1; 4675 return 1;
4676 } 4676 }
4677 } else { 4677 } else {
4678 if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) 4678 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4679 return 0; 4679 return 0;
4680 4680
4681 cpumask_clear_cpu(cpu, nohz.cpu_mask); 4681 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4682 4682
4683 if (atomic_read(&nohz.load_balancer) == cpu) 4683 if (atomic_read(&nohz.load_balancer) == cpu)
4684 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 4684 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4685 BUG(); 4685 BUG();
4686 } 4686 }
4687 return 0; 4687 return 0;
4688 } 4688 }
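The ownership hand-off above is a compare-and-swap election on nohz.load_balancer, with -1 meaning "no owner". The following user-space analogue, written with C11 atomics rather than the kernel's atomic_t API, is only meant to illustrate why at most one tickless cpu can win the ilb role:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int load_balancer = ATOMIC_VAR_INIT(-1);	/* -1: no ilb owner */

/* Returns 1 if @cpu became the idle load balancer, 0 otherwise. */
static int try_become_ilb(int cpu)
{
	int expected = -1;

	/* Mirrors atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1. */
	return atomic_compare_exchange_strong(&load_balancer, &expected, cpu);
}

int main(void)
{
	printf("cpu1 wins: %d\n", try_become_ilb(1));	/* 1: first claimant wins */
	printf("cpu2 wins: %d\n", try_become_ilb(2));	/* 0: owner already set */
	printf("owner: %d\n", atomic_load(&load_balancer));	/* 1 */
	return 0;
}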
4689 #endif 4689 #endif
4690 4690
4691 static DEFINE_SPINLOCK(balancing); 4691 static DEFINE_SPINLOCK(balancing);
4692 4692
4693 /* 4693 /*
4694 * It checks each scheduling domain to see if it is due to be balanced, 4694 * It checks each scheduling domain to see if it is due to be balanced,
4695 * and initiates a balancing operation if so. 4695 * and initiates a balancing operation if so.
4696 * 4696 *
4697 * Balancing parameters are set up in arch_init_sched_domains. 4697 * Balancing parameters are set up in arch_init_sched_domains.
4698 */ 4698 */
4699 static void rebalance_domains(int cpu, enum cpu_idle_type idle) 4699 static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4700 { 4700 {
4701 int balance = 1; 4701 int balance = 1;
4702 struct rq *rq = cpu_rq(cpu); 4702 struct rq *rq = cpu_rq(cpu);
4703 unsigned long interval; 4703 unsigned long interval;
4704 struct sched_domain *sd; 4704 struct sched_domain *sd;
4705 /* Earliest time when we have to do rebalance again */ 4705 /* Earliest time when we have to do rebalance again */
4706 unsigned long next_balance = jiffies + 60*HZ; 4706 unsigned long next_balance = jiffies + 60*HZ;
4707 int update_next_balance = 0; 4707 int update_next_balance = 0;
4708 int need_serialize; 4708 int need_serialize;
4709 4709
4710 for_each_domain(cpu, sd) { 4710 for_each_domain(cpu, sd) {
4711 if (!(sd->flags & SD_LOAD_BALANCE)) 4711 if (!(sd->flags & SD_LOAD_BALANCE))
4712 continue; 4712 continue;
4713 4713
4714 interval = sd->balance_interval; 4714 interval = sd->balance_interval;
4715 if (idle != CPU_IDLE) 4715 if (idle != CPU_IDLE)
4716 interval *= sd->busy_factor; 4716 interval *= sd->busy_factor;
4717 4717
4718 /* scale ms to jiffies */ 4718 /* scale ms to jiffies */
4719 interval = msecs_to_jiffies(interval); 4719 interval = msecs_to_jiffies(interval);
4720 if (unlikely(!interval)) 4720 if (unlikely(!interval))
4721 interval = 1; 4721 interval = 1;
4722 if (interval > HZ*NR_CPUS/10) 4722 if (interval > HZ*NR_CPUS/10)
4723 interval = HZ*NR_CPUS/10; 4723 interval = HZ*NR_CPUS/10;
4724 4724
4725 need_serialize = sd->flags & SD_SERIALIZE; 4725 need_serialize = sd->flags & SD_SERIALIZE;
4726 4726
4727 if (need_serialize) { 4727 if (need_serialize) {
4728 if (!spin_trylock(&balancing)) 4728 if (!spin_trylock(&balancing))
4729 goto out; 4729 goto out;
4730 } 4730 }
4731 4731
4732 if (time_after_eq(jiffies, sd->last_balance + interval)) { 4732 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4733 if (load_balance(cpu, rq, sd, idle, &balance)) { 4733 if (load_balance(cpu, rq, sd, idle, &balance)) {
4734 /* 4734 /*
4735 * We've pulled tasks over so either we're no 4735 * We've pulled tasks over so either we're no
4736 * longer idle, or one of our SMT siblings is 4736 * longer idle, or one of our SMT siblings is
4737 * not idle. 4737 * not idle.
4738 */ 4738 */
4739 idle = CPU_NOT_IDLE; 4739 idle = CPU_NOT_IDLE;
4740 } 4740 }
4741 sd->last_balance = jiffies; 4741 sd->last_balance = jiffies;
4742 } 4742 }
4743 if (need_serialize) 4743 if (need_serialize)
4744 spin_unlock(&balancing); 4744 spin_unlock(&balancing);
4745 out: 4745 out:
4746 if (time_after(next_balance, sd->last_balance + interval)) { 4746 if (time_after(next_balance, sd->last_balance + interval)) {
4747 next_balance = sd->last_balance + interval; 4747 next_balance = sd->last_balance + interval;
4748 update_next_balance = 1; 4748 update_next_balance = 1;
4749 } 4749 }
4750 4750
4751 /* 4751 /*
4752 * Stop the load balance at this level. There is another 4752 * Stop the load balance at this level. There is another
4753 * CPU in our sched group which is doing load balancing more 4753 * CPU in our sched group which is doing load balancing more
4754 * actively. 4754 * actively.
4755 */ 4755 */
4756 if (!balance) 4756 if (!balance)
4757 break; 4757 break;
4758 } 4758 }
4759 4759
4760 /* 4760 /*
4761 * next_balance will be updated only when there is a need. 4761 * next_balance will be updated only when there is a need.
4762 * When the cpu is attached to null domain for ex, it will not be 4762 * When the cpu is attached to null domain for ex, it will not be
4763 * updated. 4763 * updated.
4764 */ 4764 */
4765 if (likely(update_next_balance)) 4765 if (likely(update_next_balance))
4766 rq->next_balance = next_balance; 4766 rq->next_balance = next_balance;
4767 } 4767 }
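The interval arithmetic in rebalance_domains() is done in milliseconds until the msecs_to_jiffies() call. With purely illustrative values of balance_interval = 64 and busy_factor = 32, a non-idle cpu considers this domain only about every 64 * 32 = 2048 ms, while an idle cpu keeps the raw 64 ms; the result is then converted to jiffies, forced to at least 1, and clamped to HZ*NR_CPUS/10.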
4768 4768
4769 /* 4769 /*
4770 * run_rebalance_domains is triggered when needed from the scheduler tick. 4770 * run_rebalance_domains is triggered when needed from the scheduler tick.
4771 * In CONFIG_NO_HZ case, the idle load balance owner will do the 4771 * In CONFIG_NO_HZ case, the idle load balance owner will do the
4772 * rebalancing for all the cpus for whom scheduler ticks are stopped. 4772 * rebalancing for all the cpus for whom scheduler ticks are stopped.
4773 */ 4773 */
4774 static void run_rebalance_domains(struct softirq_action *h) 4774 static void run_rebalance_domains(struct softirq_action *h)
4775 { 4775 {
4776 int this_cpu = smp_processor_id(); 4776 int this_cpu = smp_processor_id();
4777 struct rq *this_rq = cpu_rq(this_cpu); 4777 struct rq *this_rq = cpu_rq(this_cpu);
4778 enum cpu_idle_type idle = this_rq->idle_at_tick ? 4778 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4779 CPU_IDLE : CPU_NOT_IDLE; 4779 CPU_IDLE : CPU_NOT_IDLE;
4780 4780
4781 rebalance_domains(this_cpu, idle); 4781 rebalance_domains(this_cpu, idle);
4782 4782
4783 #ifdef CONFIG_NO_HZ 4783 #ifdef CONFIG_NO_HZ
4784 /* 4784 /*
4785 * If this cpu is the owner for idle load balancing, then do the 4785 * If this cpu is the owner for idle load balancing, then do the
4786 * balancing on behalf of the other idle cpus whose ticks are 4786 * balancing on behalf of the other idle cpus whose ticks are
4787 * stopped. 4787 * stopped.
4788 */ 4788 */
4789 if (this_rq->idle_at_tick && 4789 if (this_rq->idle_at_tick &&
4790 atomic_read(&nohz.load_balancer) == this_cpu) { 4790 atomic_read(&nohz.load_balancer) == this_cpu) {
4791 struct rq *rq; 4791 struct rq *rq;
4792 int balance_cpu; 4792 int balance_cpu;
4793 4793
4794 for_each_cpu(balance_cpu, nohz.cpu_mask) { 4794 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4795 if (balance_cpu == this_cpu) 4795 if (balance_cpu == this_cpu)
4796 continue; 4796 continue;
4797 4797
4798 /* 4798 /*
4799 * If this cpu gets work to do, stop the load balancing 4799 * If this cpu gets work to do, stop the load balancing
4800 * work being done for other cpus. The next load 4800 * work being done for other cpus. The next load
4801 * balancing owner will pick it up. 4801 * balancing owner will pick it up.
4802 */ 4802 */
4803 if (need_resched()) 4803 if (need_resched())
4804 break; 4804 break;
4805 4805
4806 rebalance_domains(balance_cpu, CPU_IDLE); 4806 rebalance_domains(balance_cpu, CPU_IDLE);
4807 4807
4808 rq = cpu_rq(balance_cpu); 4808 rq = cpu_rq(balance_cpu);
4809 if (time_after(this_rq->next_balance, rq->next_balance)) 4809 if (time_after(this_rq->next_balance, rq->next_balance))
4810 this_rq->next_balance = rq->next_balance; 4810 this_rq->next_balance = rq->next_balance;
4811 } 4811 }
4812 } 4812 }
4813 #endif 4813 #endif
4814 } 4814 }
4815 4815
4816 static inline int on_null_domain(int cpu) 4816 static inline int on_null_domain(int cpu)
4817 { 4817 {
4818 return !rcu_dereference(cpu_rq(cpu)->sd); 4818 return !rcu_dereference(cpu_rq(cpu)->sd);
4819 } 4819 }
4820 4820
4821 /* 4821 /*
4822 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 4822 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4823 * 4823 *
4824 * In case of CONFIG_NO_HZ, this is the place where we nominate a new 4824 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4825 * idle load balancing owner or decide to stop the periodic load balancing, 4825 * idle load balancing owner or decide to stop the periodic load balancing,
4826 * if the whole system is idle. 4826 * if the whole system is idle.
4827 */ 4827 */
4828 static inline void trigger_load_balance(struct rq *rq, int cpu) 4828 static inline void trigger_load_balance(struct rq *rq, int cpu)
4829 { 4829 {
4830 #ifdef CONFIG_NO_HZ 4830 #ifdef CONFIG_NO_HZ
4831 /* 4831 /*
4832 * If we were in the nohz mode recently and busy at the current 4832 * If we were in the nohz mode recently and busy at the current
4833 * scheduler tick, then check if we need to nominate a new idle 4833 * scheduler tick, then check if we need to nominate a new idle
4834 * load balancer. 4834 * load balancer.
4835 */ 4835 */
4836 if (rq->in_nohz_recently && !rq->idle_at_tick) { 4836 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4837 rq->in_nohz_recently = 0; 4837 rq->in_nohz_recently = 0;
4838 4838
4839 if (atomic_read(&nohz.load_balancer) == cpu) { 4839 if (atomic_read(&nohz.load_balancer) == cpu) {
4840 cpumask_clear_cpu(cpu, nohz.cpu_mask); 4840 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4841 atomic_set(&nohz.load_balancer, -1); 4841 atomic_set(&nohz.load_balancer, -1);
4842 } 4842 }
4843 4843
4844 if (atomic_read(&nohz.load_balancer) == -1) { 4844 if (atomic_read(&nohz.load_balancer) == -1) {
4845 int ilb = find_new_ilb(cpu); 4845 int ilb = find_new_ilb(cpu);
4846 4846
4847 if (ilb < nr_cpu_ids) 4847 if (ilb < nr_cpu_ids)
4848 resched_cpu(ilb); 4848 resched_cpu(ilb);
4849 } 4849 }
4850 } 4850 }
4851 4851
4852 /* 4852 /*
4853 * If this cpu is idle and doing idle load balancing for all the 4853 * If this cpu is idle and doing idle load balancing for all the
4854 * cpus with ticks stopped, is it time for that to stop? 4854 * cpus with ticks stopped, is it time for that to stop?
4855 */ 4855 */
4856 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && 4856 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4857 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { 4857 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4858 resched_cpu(cpu); 4858 resched_cpu(cpu);
4859 return; 4859 return;
4860 } 4860 }
4861 4861
4862 /* 4862 /*
4863 * If this cpu is idle and the idle load balancing is done by 4863 * If this cpu is idle and the idle load balancing is done by
4864 * someone else, then there is no need to raise the SCHED_SOFTIRQ 4864 * someone else, then there is no need to raise the SCHED_SOFTIRQ
4865 */ 4865 */
4866 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && 4866 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4867 cpumask_test_cpu(cpu, nohz.cpu_mask)) 4867 cpumask_test_cpu(cpu, nohz.cpu_mask))
4868 return; 4868 return;
4869 #endif 4869 #endif
4870 /* Don't need to rebalance while attached to NULL domain */ 4870 /* Don't need to rebalance while attached to NULL domain */
4871 if (time_after_eq(jiffies, rq->next_balance) && 4871 if (time_after_eq(jiffies, rq->next_balance) &&
4872 likely(!on_null_domain(cpu))) 4872 likely(!on_null_domain(cpu)))
4873 raise_softirq(SCHED_SOFTIRQ); 4873 raise_softirq(SCHED_SOFTIRQ);
4874 } 4874 }
4875 4875
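Aside: the SCHED_SOFTIRQ raised by trigger_load_balance() only does anything because its handler is registered at scheduler-init time elsewhere in this file (not part of this hunk). Paraphrased, the wiring looks roughly like this, with run_rebalance_domains() being the handler that ends up calling rebalance_domains() and the nohz path above:

/* In sched_init(), under CONFIG_SMP (paraphrased, not shown in this hunk): */
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);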
4876 #else /* CONFIG_SMP */ 4876 #else /* CONFIG_SMP */
4877 4877
4878 /* 4878 /*
4879 * on UP we do not need to balance between CPUs: 4879 * on UP we do not need to balance between CPUs:
4880 */ 4880 */
4881 static inline void idle_balance(int cpu, struct rq *rq) 4881 static inline void idle_balance(int cpu, struct rq *rq)
4882 { 4882 {
4883 } 4883 }
4884 4884
4885 #endif 4885 #endif
4886 4886
4887 DEFINE_PER_CPU(struct kernel_stat, kstat); 4887 DEFINE_PER_CPU(struct kernel_stat, kstat);
4888 4888
4889 EXPORT_PER_CPU_SYMBOL(kstat); 4889 EXPORT_PER_CPU_SYMBOL(kstat);
4890 4890
4891 /* 4891 /*
4892 * Return any ns on the sched_clock that have not yet been accounted in 4892 * Return any ns on the sched_clock that have not yet been accounted in
4893 * @p in case that task is currently running. 4893 * @p in case that task is currently running.
4894 * 4894 *
4895 * Called with task_rq_lock() held on @rq. 4895 * Called with task_rq_lock() held on @rq.
4896 */ 4896 */
4897 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) 4897 static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
4898 { 4898 {
4899 u64 ns = 0; 4899 u64 ns = 0;
4900 4900
4901 if (task_current(rq, p)) { 4901 if (task_current(rq, p)) {
4902 update_rq_clock(rq); 4902 update_rq_clock(rq);
4903 ns = rq->clock - p->se.exec_start; 4903 ns = rq->clock - p->se.exec_start;
4904 if ((s64)ns < 0) 4904 if ((s64)ns < 0)
4905 ns = 0; 4905 ns = 0;
4906 } 4906 }
4907 4907
4908 return ns; 4908 return ns;
4909 } 4909 }
4910 4910
4911 unsigned long long task_delta_exec(struct task_struct *p) 4911 unsigned long long task_delta_exec(struct task_struct *p)
4912 { 4912 {
4913 unsigned long flags; 4913 unsigned long flags;
4914 struct rq *rq; 4914 struct rq *rq;
4915 u64 ns = 0; 4915 u64 ns = 0;
4916 4916
4917 rq = task_rq_lock(p, &flags); 4917 rq = task_rq_lock(p, &flags);
4918 ns = do_task_delta_exec(p, rq); 4918 ns = do_task_delta_exec(p, rq);
4919 task_rq_unlock(rq, &flags); 4919 task_rq_unlock(rq, &flags);
4920 4920
4921 return ns; 4921 return ns;
4922 } 4922 }
4923 4923
4924 /* 4924 /*
4925 * Return accounted runtime for the task. 4925 * Return accounted runtime for the task.
4926 * In case the task is currently running, return the runtime plus current's 4926 * In case the task is currently running, return the runtime plus current's
4927 * pending runtime that has not been accounted yet. 4927 * pending runtime that has not been accounted yet.
4928 */ 4928 */
4929 unsigned long long task_sched_runtime(struct task_struct *p) 4929 unsigned long long task_sched_runtime(struct task_struct *p)
4930 { 4930 {
4931 unsigned long flags; 4931 unsigned long flags;
4932 struct rq *rq; 4932 struct rq *rq;
4933 u64 ns = 0; 4933 u64 ns = 0;
4934 4934
4935 rq = task_rq_lock(p, &flags); 4935 rq = task_rq_lock(p, &flags);
4936 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 4936 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
4937 task_rq_unlock(rq, &flags); 4937 task_rq_unlock(rq, &flags);
4938 4938
4939 return ns; 4939 return ns;
4940 } 4940 }
4941 4941
4942 /* 4942 /*
4943 * Return sum_exec_runtime for the thread group. 4943 * Return sum_exec_runtime for the thread group.
4944 * In case the task is currently running, return the sum plus current's 4944 * In case the task is currently running, return the sum plus current's
4945 * pending runtime that has not been accounted yet. 4945 * pending runtime that has not been accounted yet.
4946 * 4946 *
4947 * Note that the thread group might have other running tasks as well, 4947 * Note that the thread group might have other running tasks as well,
4948 * so the return value does not include other pending runtime that other 4948 * so the return value does not include other pending runtime that other
4949 * running tasks might have. 4949 * running tasks might have.
4950 */ 4950 */
4951 unsigned long long thread_group_sched_runtime(struct task_struct *p) 4951 unsigned long long thread_group_sched_runtime(struct task_struct *p)
4952 { 4952 {
4953 struct task_cputime totals; 4953 struct task_cputime totals;
4954 unsigned long flags; 4954 unsigned long flags;
4955 struct rq *rq; 4955 struct rq *rq;
4956 u64 ns; 4956 u64 ns;
4957 4957
4958 rq = task_rq_lock(p, &flags); 4958 rq = task_rq_lock(p, &flags);
4959 thread_group_cputime(p, &totals); 4959 thread_group_cputime(p, &totals);
4960 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 4960 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
4961 task_rq_unlock(rq, &flags); 4961 task_rq_unlock(rq, &flags);
4962 4962
4963 return ns; 4963 return ns;
4964 } 4964 }
4965 4965
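task_sched_runtime() and thread_group_sched_runtime() sit underneath the POSIX per-thread and per-process CPU clocks (via kernel/posix-cpu-timers.c), so their effect is easiest to see from userspace. A minimal probe, assuming nothing beyond a standard libc (link with -lrt on older toolchains):

#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec ts;

        /* Per-thread CPU time; backed by task_sched_runtime(). */
        clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
        printf("thread  cputime: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);

        /* Whole thread group; backed by thread_group_sched_runtime(). */
        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &ts);
        printf("process cputime: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
        return 0;
}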
4966 /* 4966 /*
4967 * Account user cpu time to a process. 4967 * Account user cpu time to a process.
4968 * @p: the process that the cpu time gets accounted to 4968 * @p: the process that the cpu time gets accounted to
4969 * @cputime: the cpu time spent in user space since the last update 4969 * @cputime: the cpu time spent in user space since the last update
4970 * @cputime_scaled: cputime scaled by cpu frequency 4970 * @cputime_scaled: cputime scaled by cpu frequency
4971 */ 4971 */
4972 void account_user_time(struct task_struct *p, cputime_t cputime, 4972 void account_user_time(struct task_struct *p, cputime_t cputime,
4973 cputime_t cputime_scaled) 4973 cputime_t cputime_scaled)
4974 { 4974 {
4975 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 4975 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4976 cputime64_t tmp; 4976 cputime64_t tmp;
4977 4977
4978 /* Add user time to process. */ 4978 /* Add user time to process. */
4979 p->utime = cputime_add(p->utime, cputime); 4979 p->utime = cputime_add(p->utime, cputime);
4980 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 4980 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
4981 account_group_user_time(p, cputime); 4981 account_group_user_time(p, cputime);
4982 4982
4983 /* Add user time to cpustat. */ 4983 /* Add user time to cpustat. */
4984 tmp = cputime_to_cputime64(cputime); 4984 tmp = cputime_to_cputime64(cputime);
4985 if (TASK_NICE(p) > 0) 4985 if (TASK_NICE(p) > 0)
4986 cpustat->nice = cputime64_add(cpustat->nice, tmp); 4986 cpustat->nice = cputime64_add(cpustat->nice, tmp);
4987 else 4987 else
4988 cpustat->user = cputime64_add(cpustat->user, tmp); 4988 cpustat->user = cputime64_add(cpustat->user, tmp);
4989 4989
4990 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); 4990 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
4991 /* Account for user time used */ 4991 /* Account for user time used */
4992 acct_update_integrals(p); 4992 acct_update_integrals(p);
4993 } 4993 }
4994 4994
4995 /* 4995 /*
4996 * Account guest cpu time to a process. 4996 * Account guest cpu time to a process.
4997 * @p: the process that the cpu time gets accounted to 4997 * @p: the process that the cpu time gets accounted to
4998 * @cputime: the cpu time spent in virtual machine since the last update 4998 * @cputime: the cpu time spent in virtual machine since the last update
4999 * @cputime_scaled: cputime scaled by cpu frequency 4999 * @cputime_scaled: cputime scaled by cpu frequency
5000 */ 5000 */
5001 static void account_guest_time(struct task_struct *p, cputime_t cputime, 5001 static void account_guest_time(struct task_struct *p, cputime_t cputime,
5002 cputime_t cputime_scaled) 5002 cputime_t cputime_scaled)
5003 { 5003 {
5004 cputime64_t tmp; 5004 cputime64_t tmp;
5005 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 5005 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5006 5006
5007 tmp = cputime_to_cputime64(cputime); 5007 tmp = cputime_to_cputime64(cputime);
5008 5008
5009 /* Add guest time to process. */ 5009 /* Add guest time to process. */
5010 p->utime = cputime_add(p->utime, cputime); 5010 p->utime = cputime_add(p->utime, cputime);
5011 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 5011 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
5012 account_group_user_time(p, cputime); 5012 account_group_user_time(p, cputime);
5013 p->gtime = cputime_add(p->gtime, cputime); 5013 p->gtime = cputime_add(p->gtime, cputime);
5014 5014
5015 /* Add guest time to cpustat. */ 5015 /* Add guest time to cpustat. */
5016 cpustat->user = cputime64_add(cpustat->user, tmp); 5016 cpustat->user = cputime64_add(cpustat->user, tmp);
5017 cpustat->guest = cputime64_add(cpustat->guest, tmp); 5017 cpustat->guest = cputime64_add(cpustat->guest, tmp);
5018 } 5018 }
5019 5019
5020 /* 5020 /*
5021 * Account system cpu time to a process. 5021 * Account system cpu time to a process.
5022 * @p: the process that the cpu time gets accounted to 5022 * @p: the process that the cpu time gets accounted to
5023 * @hardirq_offset: the offset to subtract from hardirq_count() 5023 * @hardirq_offset: the offset to subtract from hardirq_count()
5024 * @cputime: the cpu time spent in kernel space since the last update 5024 * @cputime: the cpu time spent in kernel space since the last update
5025 * @cputime_scaled: cputime scaled by cpu frequency 5025 * @cputime_scaled: cputime scaled by cpu frequency
5026 */ 5026 */
5027 void account_system_time(struct task_struct *p, int hardirq_offset, 5027 void account_system_time(struct task_struct *p, int hardirq_offset,
5028 cputime_t cputime, cputime_t cputime_scaled) 5028 cputime_t cputime, cputime_t cputime_scaled)
5029 { 5029 {
5030 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 5030 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5031 cputime64_t tmp; 5031 cputime64_t tmp;
5032 5032
5033 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 5033 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
5034 account_guest_time(p, cputime, cputime_scaled); 5034 account_guest_time(p, cputime, cputime_scaled);
5035 return; 5035 return;
5036 } 5036 }
5037 5037
5038 /* Add system time to process. */ 5038 /* Add system time to process. */
5039 p->stime = cputime_add(p->stime, cputime); 5039 p->stime = cputime_add(p->stime, cputime);
5040 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); 5040 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
5041 account_group_system_time(p, cputime); 5041 account_group_system_time(p, cputime);
5042 5042
5043 /* Add system time to cpustat. */ 5043 /* Add system time to cpustat. */
5044 tmp = cputime_to_cputime64(cputime); 5044 tmp = cputime_to_cputime64(cputime);
5045 if (hardirq_count() - hardirq_offset) 5045 if (hardirq_count() - hardirq_offset)
5046 cpustat->irq = cputime64_add(cpustat->irq, tmp); 5046 cpustat->irq = cputime64_add(cpustat->irq, tmp);
5047 else if (softirq_count()) 5047 else if (softirq_count())
5048 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 5048 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
5049 else 5049 else
5050 cpustat->system = cputime64_add(cpustat->system, tmp); 5050 cpustat->system = cputime64_add(cpustat->system, tmp);
5051 5051
5052 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); 5052 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
5053 5053
5054 /* Account for system time used */ 5054 /* Account for system time used */
5055 acct_update_integrals(p); 5055 acct_update_integrals(p);
5056 } 5056 }
5057 5057
5058 /* 5058 /*
5059 * Account for involuntary wait time. 5059 * Account for involuntary wait time.
5060 * @cputime: the cpu time spent in involuntary wait 5060 * @cputime: the cpu time spent in involuntary wait
5061 */ 5061 */
5062 void account_steal_time(cputime_t cputime) 5062 void account_steal_time(cputime_t cputime)
5063 { 5063 {
5064 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 5064 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5065 cputime64_t cputime64 = cputime_to_cputime64(cputime); 5065 cputime64_t cputime64 = cputime_to_cputime64(cputime);
5066 5066
5067 cpustat->steal = cputime64_add(cpustat->steal, cputime64); 5067 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
5068 } 5068 }
5069 5069
5070 /* 5070 /*
5071 * Account for idle time. 5071 * Account for idle time.
5072 * @cputime: the cpu time spent in idle wait 5072 * @cputime: the cpu time spent in idle wait
5073 */ 5073 */
5074 void account_idle_time(cputime_t cputime) 5074 void account_idle_time(cputime_t cputime)
5075 { 5075 {
5076 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 5076 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
5077 cputime64_t cputime64 = cputime_to_cputime64(cputime); 5077 cputime64_t cputime64 = cputime_to_cputime64(cputime);
5078 struct rq *rq = this_rq(); 5078 struct rq *rq = this_rq();
5079 5079
5080 if (atomic_read(&rq->nr_iowait) > 0) 5080 if (atomic_read(&rq->nr_iowait) > 0)
5081 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); 5081 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
5082 else 5082 else
5083 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 5083 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
5084 } 5084 }
5085 5085
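The cpustat buckets filled in above (user, nice, system, idle, iowait, irq, softirq, steal, guest) are what /proc/stat reports on its aggregate "cpu" line, in that order. A small userspace reader, shown only to illustrate the mapping; nothing in it is specific to this commit:

#include <stdio.h>

int main(void)
{
        unsigned long long user = 0, nice = 0, system = 0, idle = 0, iowait = 0;
        unsigned long long irq = 0, softirq = 0, steal = 0, guest = 0;
        FILE *f = fopen("/proc/stat", "r");

        if (!f)
                return 1;
        /* Field order mirrors the cpustat members updated above. */
        if (fscanf(f, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
                   &user, &nice, &system, &idle, &iowait, &irq,
                   &softirq, &steal, &guest) < 4) {
                fclose(f);
                return 1;
        }
        fclose(f);
        printf("user=%llu nice=%llu system=%llu idle=%llu iowait=%llu "
               "irq=%llu softirq=%llu steal=%llu guest=%llu\n",
               user, nice, system, idle, iowait, irq, softirq, steal, guest);
        return 0;
}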
5086 #ifndef CONFIG_VIRT_CPU_ACCOUNTING 5086 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
5087 5087
5088 /* 5088 /*
5089 * Account a single tick of cpu time. 5089 * Account a single tick of cpu time.
5090 * @p: the process that the cpu time gets accounted to 5090 * @p: the process that the cpu time gets accounted to
5091 * @user_tick: indicates if the tick is a user or a system tick 5091 * @user_tick: indicates if the tick is a user or a system tick
5092 */ 5092 */
5093 void account_process_tick(struct task_struct *p, int user_tick) 5093 void account_process_tick(struct task_struct *p, int user_tick)
5094 { 5094 {
5095 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 5095 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
5096 struct rq *rq = this_rq(); 5096 struct rq *rq = this_rq();
5097 5097
5098 if (user_tick) 5098 if (user_tick)
5099 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 5099 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
5100 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 5100 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
5101 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, 5101 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
5102 one_jiffy_scaled); 5102 one_jiffy_scaled);
5103 else 5103 else
5104 account_idle_time(cputime_one_jiffy); 5104 account_idle_time(cputime_one_jiffy);
5105 } 5105 }
5106 5106
5107 /* 5107 /*
5108 * Account multiple ticks of steal time. 5108 * Account multiple ticks of steal time.
5109 * @p: the process from which the cpu time has been stolen 5109 * @p: the process from which the cpu time has been stolen
5110 * @ticks: number of stolen ticks 5110 * @ticks: number of stolen ticks
5111 */ 5111 */
5112 void account_steal_ticks(unsigned long ticks) 5112 void account_steal_ticks(unsigned long ticks)
5113 { 5113 {
5114 account_steal_time(jiffies_to_cputime(ticks)); 5114 account_steal_time(jiffies_to_cputime(ticks));
5115 } 5115 }
5116 5116
5117 /* 5117 /*
5118 * Account multiple ticks of idle time. 5118 * Account multiple ticks of idle time.
5119 * @ticks: number of idle ticks 5119 * @ticks: number of idle ticks
5120 */ 5120 */
5121 void account_idle_ticks(unsigned long ticks) 5121 void account_idle_ticks(unsigned long ticks)
5122 { 5122 {
5123 account_idle_time(jiffies_to_cputime(ticks)); 5123 account_idle_time(jiffies_to_cputime(ticks));
5124 } 5124 }
5125 5125
5126 #endif 5126 #endif
5127 5127
5128 /* 5128 /*
5129 * Use precise platform statistics if available: 5129 * Use precise platform statistics if available:
5130 */ 5130 */
5131 #ifdef CONFIG_VIRT_CPU_ACCOUNTING 5131 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
5132 cputime_t task_utime(struct task_struct *p) 5132 cputime_t task_utime(struct task_struct *p)
5133 { 5133 {
5134 return p->utime; 5134 return p->utime;
5135 } 5135 }
5136 5136
5137 cputime_t task_stime(struct task_struct *p) 5137 cputime_t task_stime(struct task_struct *p)
5138 { 5138 {
5139 return p->stime; 5139 return p->stime;
5140 } 5140 }
5141 #else 5141 #else
5142 cputime_t task_utime(struct task_struct *p) 5142 cputime_t task_utime(struct task_struct *p)
5143 { 5143 {
5144 clock_t utime = cputime_to_clock_t(p->utime), 5144 clock_t utime = cputime_to_clock_t(p->utime),
5145 total = utime + cputime_to_clock_t(p->stime); 5145 total = utime + cputime_to_clock_t(p->stime);
5146 u64 temp; 5146 u64 temp;
5147 5147
5148 /* 5148 /*
5149 * Use CFS's precise accounting: 5149 * Use CFS's precise accounting:
5150 */ 5150 */
5151 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); 5151 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
5152 5152
5153 if (total) { 5153 if (total) {
5154 temp *= utime; 5154 temp *= utime;
5155 do_div(temp, total); 5155 do_div(temp, total);
5156 } 5156 }
5157 utime = (clock_t)temp; 5157 utime = (clock_t)temp;
5158 5158
5159 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); 5159 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
5160 return p->prev_utime; 5160 return p->prev_utime;
5161 } 5161 }
5162 5162
5163 cputime_t task_stime(struct task_struct *p) 5163 cputime_t task_stime(struct task_struct *p)
5164 { 5164 {
5165 clock_t stime; 5165 clock_t stime;
5166 5166
5167 /* 5167 /*
5168 * Use CFS's precise accounting. (we subtract utime from 5168 * Use CFS's precise accounting. (we subtract utime from
5169 * the total, to make sure the total observed by userspace 5169 * the total, to make sure the total observed by userspace
5170 * grows monotonically - apps rely on that): 5170 * grows monotonically - apps rely on that):
5171 */ 5171 */
5172 stime = nsec_to_clock_t(p->se.sum_exec_runtime) - 5172 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
5173 cputime_to_clock_t(task_utime(p)); 5173 cputime_to_clock_t(task_utime(p));
5174 5174
5175 if (stime >= 0) 5175 if (stime >= 0)
5176 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); 5176 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
5177 5177
5178 return p->prev_stime; 5178 return p->prev_stime;
5179 } 5179 }
5180 #endif 5180 #endif
5181 5181
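The #else branch above splits the precise CFS runtime (sum_exec_runtime) between user and system time in proportion to the sampled tick counters, then keeps the result monotonic via prev_utime/prev_stime. A standalone arithmetic sketch of that split, using made-up numbers; nothing here is kernel API:

#include <stdio.h>

int main(void)
{
        /* Hypothetical sampled values, all in clock ticks. */
        unsigned long long utime = 300, stime = 100;    /* tick-based samples */
        unsigned long long sum_exec = 500;              /* precise CFS runtime */
        unsigned long long total = utime + stime;

        /* Same proportional split as task_utime()/task_stime() above. */
        unsigned long long scaled_utime = total ? sum_exec * utime / total : sum_exec;
        unsigned long long scaled_stime = sum_exec - scaled_utime;

        printf("utime=%llu stime=%llu (of %llu)\n",
               scaled_utime, scaled_stime, sum_exec);   /* utime=375 stime=125 */
        return 0;
}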
5182 inline cputime_t task_gtime(struct task_struct *p) 5182 inline cputime_t task_gtime(struct task_struct *p)
5183 { 5183 {
5184 return p->gtime; 5184 return p->gtime;
5185 } 5185 }
5186 5186
5187 /* 5187 /*
5188 * This function gets called by the timer code, with HZ frequency. 5188 * This function gets called by the timer code, with HZ frequency.
5189 * We call it with interrupts disabled. 5189 * We call it with interrupts disabled.
5190 * 5190 *
5191 * It also gets called by the fork code, when changing the parent's 5191 * It also gets called by the fork code, when changing the parent's
5192 * timeslices. 5192 * timeslices.
5193 */ 5193 */
5194 void scheduler_tick(void) 5194 void scheduler_tick(void)
5195 { 5195 {
5196 int cpu = smp_processor_id(); 5196 int cpu = smp_processor_id();
5197 struct rq *rq = cpu_rq(cpu); 5197 struct rq *rq = cpu_rq(cpu);
5198 struct task_struct *curr = rq->curr; 5198 struct task_struct *curr = rq->curr;
5199 5199
5200 sched_clock_tick(); 5200 sched_clock_tick();
5201 5201
5202 spin_lock(&rq->lock); 5202 spin_lock(&rq->lock);
5203 update_rq_clock(rq); 5203 update_rq_clock(rq);
5204 update_cpu_load(rq); 5204 update_cpu_load(rq);
5205 curr->sched_class->task_tick(rq, curr, 0); 5205 curr->sched_class->task_tick(rq, curr, 0);
5206 spin_unlock(&rq->lock); 5206 spin_unlock(&rq->lock);
5207 5207
5208 perf_event_task_tick(curr, cpu); 5208 perf_event_task_tick(curr, cpu);
5209 5209
5210 #ifdef CONFIG_SMP 5210 #ifdef CONFIG_SMP
5211 rq->idle_at_tick = idle_cpu(cpu); 5211 rq->idle_at_tick = idle_cpu(cpu);
5212 trigger_load_balance(rq, cpu); 5212 trigger_load_balance(rq, cpu);
5213 #endif 5213 #endif
5214 } 5214 }
5215 5215
5216 notrace unsigned long get_parent_ip(unsigned long addr) 5216 notrace unsigned long get_parent_ip(unsigned long addr)
5217 { 5217 {
5218 if (in_lock_functions(addr)) { 5218 if (in_lock_functions(addr)) {
5219 addr = CALLER_ADDR2; 5219 addr = CALLER_ADDR2;
5220 if (in_lock_functions(addr)) 5220 if (in_lock_functions(addr))
5221 addr = CALLER_ADDR3; 5221 addr = CALLER_ADDR3;
5222 } 5222 }
5223 return addr; 5223 return addr;
5224 } 5224 }
5225 5225
5226 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 5226 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
5227 defined(CONFIG_PREEMPT_TRACER)) 5227 defined(CONFIG_PREEMPT_TRACER))
5228 5228
5229 void __kprobes add_preempt_count(int val) 5229 void __kprobes add_preempt_count(int val)
5230 { 5230 {
5231 #ifdef CONFIG_DEBUG_PREEMPT 5231 #ifdef CONFIG_DEBUG_PREEMPT
5232 /* 5232 /*
5233 * Underflow? 5233 * Underflow?
5234 */ 5234 */
5235 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 5235 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
5236 return; 5236 return;
5237 #endif 5237 #endif
5238 preempt_count() += val; 5238 preempt_count() += val;
5239 #ifdef CONFIG_DEBUG_PREEMPT 5239 #ifdef CONFIG_DEBUG_PREEMPT
5240 /* 5240 /*
5241 * Spinlock count overflowing soon? 5241 * Spinlock count overflowing soon?
5242 */ 5242 */
5243 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 5243 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
5244 PREEMPT_MASK - 10); 5244 PREEMPT_MASK - 10);
5245 #endif 5245 #endif
5246 if (preempt_count() == val) 5246 if (preempt_count() == val)
5247 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 5247 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
5248 } 5248 }
5249 EXPORT_SYMBOL(add_preempt_count); 5249 EXPORT_SYMBOL(add_preempt_count);
5250 5250
5251 void __kprobes sub_preempt_count(int val) 5251 void __kprobes sub_preempt_count(int val)
5252 { 5252 {
5253 #ifdef CONFIG_DEBUG_PREEMPT 5253 #ifdef CONFIG_DEBUG_PREEMPT
5254 /* 5254 /*
5255 * Underflow? 5255 * Underflow?
5256 */ 5256 */
5257 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 5257 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
5258 return; 5258 return;
5259 /* 5259 /*
5260 * Is the spinlock portion underflowing? 5260 * Is the spinlock portion underflowing?
5261 */ 5261 */
5262 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 5262 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
5263 !(preempt_count() & PREEMPT_MASK))) 5263 !(preempt_count() & PREEMPT_MASK)))
5264 return; 5264 return;
5265 #endif 5265 #endif
5266 5266
5267 if (preempt_count() == val) 5267 if (preempt_count() == val)
5268 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 5268 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
5269 preempt_count() -= val; 5269 preempt_count() -= val;
5270 } 5270 }
5271 EXPORT_SYMBOL(sub_preempt_count); 5271 EXPORT_SYMBOL(sub_preempt_count);
5272 5272
5273 #endif 5273 #endif
5274 5274
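add_preempt_count()/sub_preempt_count() are normally reached through preempt_disable()/preempt_enable() (and through the spinlock and softirq code that nests on top of them) rather than called directly. A minimal sketch of the usual pattern, assuming a kernel-module context; my_counter is a hypothetical per-cpu variable:

#include <linux/percpu.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(unsigned long, my_counter);       /* hypothetical */

static void bump_local_counter(void)
{
        /*
         * Raising the preempt count (which goes through add_preempt_count()
         * when the debug/tracing variants above are built in) keeps this
         * task on its current cpu while it touches per-cpu state.
         */
        preempt_disable();
        __get_cpu_var(my_counter)++;
        preempt_enable();
}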
5275 /* 5275 /*
5276 * Print scheduling while atomic bug: 5276 * Print scheduling while atomic bug:
5277 */ 5277 */
5278 static noinline void __schedule_bug(struct task_struct *prev) 5278 static noinline void __schedule_bug(struct task_struct *prev)
5279 { 5279 {
5280 struct pt_regs *regs = get_irq_regs(); 5280 struct pt_regs *regs = get_irq_regs();
5281 5281
5282 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 5282 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
5283 prev->comm, prev->pid, preempt_count()); 5283 prev->comm, prev->pid, preempt_count());
5284 5284
5285 debug_show_held_locks(prev); 5285 debug_show_held_locks(prev);
5286 print_modules(); 5286 print_modules();
5287 if (irqs_disabled()) 5287 if (irqs_disabled())
5288 print_irqtrace_events(prev); 5288 print_irqtrace_events(prev);
5289 5289
5290 if (regs) 5290 if (regs)
5291 show_regs(regs); 5291 show_regs(regs);
5292 else 5292 else
5293 dump_stack(); 5293 dump_stack();
5294 } 5294 }
5295 5295
5296 /* 5296 /*
5297 * Various schedule()-time debugging checks and statistics: 5297 * Various schedule()-time debugging checks and statistics:
5298 */ 5298 */
5299 static inline void schedule_debug(struct task_struct *prev) 5299 static inline void schedule_debug(struct task_struct *prev)
5300 { 5300 {
5301 /* 5301 /*
5302 * Test if we are atomic. Since do_exit() needs to call into 5302 * Test if we are atomic. Since do_exit() needs to call into
5303 * schedule() atomically, we ignore that path for now. 5303 * schedule() atomically, we ignore that path for now.
5304 * Otherwise, whine if we are scheduling when we should not be. 5304 * Otherwise, whine if we are scheduling when we should not be.
5305 */ 5305 */
5306 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 5306 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
5307 __schedule_bug(prev); 5307 __schedule_bug(prev);
5308 5308
5309 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 5309 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
5310 5310
5311 schedstat_inc(this_rq(), sched_count); 5311 schedstat_inc(this_rq(), sched_count);
5312 #ifdef CONFIG_SCHEDSTATS 5312 #ifdef CONFIG_SCHEDSTATS
5313 if (unlikely(prev->lock_depth >= 0)) { 5313 if (unlikely(prev->lock_depth >= 0)) {
5314 schedstat_inc(this_rq(), bkl_count); 5314 schedstat_inc(this_rq(), bkl_count);
5315 schedstat_inc(prev, sched_info.bkl_count); 5315 schedstat_inc(prev, sched_info.bkl_count);
5316 } 5316 }
5317 #endif 5317 #endif
5318 } 5318 }
5319 5319
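The in_atomic_preempt_off() check above is what produces the familiar "BUG: scheduling while atomic" splat. A hypothetical driver path that would trigger it (kernel-module context assumed; my_lock is made up):

#include <linux/spinlock.h>
#include <linux/delay.h>

static DEFINE_SPINLOCK(my_lock);        /* hypothetical */

static void broken_path(void)
{
        spin_lock(&my_lock);    /* raises preempt_count */
        msleep(10);             /* sleeps -> schedule() with preempt_count != 0 */
        spin_unlock(&my_lock);
}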
5320 static void put_prev_task(struct rq *rq, struct task_struct *p) 5320 static void put_prev_task(struct rq *rq, struct task_struct *p)
5321 { 5321 {
5322 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; 5322 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
5323 5323
5324 update_avg(&p->se.avg_running, runtime); 5324 update_avg(&p->se.avg_running, runtime);
5325 5325
5326 if (p->state == TASK_RUNNING) { 5326 if (p->state == TASK_RUNNING) {
5327 /* 5327 /*
5328 * In order to avoid avg_overlap growing stale when we are 5328 * In order to avoid avg_overlap growing stale when we are
5329 * indeed overlapping and hence not getting put to sleep, grow 5329 * indeed overlapping and hence not getting put to sleep, grow
5330 * the avg_overlap on preemption. 5330 * the avg_overlap on preemption.
5331 * 5331 *
5332 * We use the average preemption runtime because that 5332 * We use the average preemption runtime because that
5333 * correlates to the amount of cache footprint a task can 5333 * correlates to the amount of cache footprint a task can
5334 * build up. 5334 * build up.
5335 */ 5335 */
5336 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); 5336 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5337 update_avg(&p->se.avg_overlap, runtime); 5337 update_avg(&p->se.avg_overlap, runtime);
5338 } else { 5338 } else {
5339 update_avg(&p->se.avg_running, 0); 5339 update_avg(&p->se.avg_running, 0);
5340 } 5340 }
5341 p->sched_class->put_prev_task(rq, p); 5341 p->sched_class->put_prev_task(rq, p);
5342 } 5342 }
5343 5343
5344 /* 5344 /*
5345 * Pick up the highest-prio task: 5345 * Pick up the highest-prio task:
5346 */ 5346 */
5347 static inline struct task_struct * 5347 static inline struct task_struct *
5348 pick_next_task(struct rq *rq) 5348 pick_next_task(struct rq *rq)
5349 { 5349 {
5350 const struct sched_class *class; 5350 const struct sched_class *class;
5351 struct task_struct *p; 5351 struct task_struct *p;
5352 5352
5353 /* 5353 /*
5354 * Optimization: we know that if all tasks are in 5354 * Optimization: we know that if all tasks are in
5355 * the fair class we can call that function directly: 5355 * the fair class we can call that function directly:
5356 */ 5356 */
5357 if (likely(rq->nr_running == rq->cfs.nr_running)) { 5357 if (likely(rq->nr_running == rq->cfs.nr_running)) {
5358 p = fair_sched_class.pick_next_task(rq); 5358 p = fair_sched_class.pick_next_task(rq);
5359 if (likely(p)) 5359 if (likely(p))
5360 return p; 5360 return p;
5361 } 5361 }
5362 5362
5363 class = sched_class_highest; 5363 class = sched_class_highest;
5364 for ( ; ; ) { 5364 for ( ; ; ) {
5365 p = class->pick_next_task(rq); 5365 p = class->pick_next_task(rq);
5366 if (p) 5366 if (p)
5367 return p; 5367 return p;
5368 /* 5368 /*
5369 * Will never be NULL as the idle class always 5369 * Will never be NULL as the idle class always
5370 * returns a non-NULL p: 5370 * returns a non-NULL p:
5371 */ 5371 */
5372 class = class->next; 5372 class = class->next;
5373 } 5373 }
5374 } 5374 }
5375 5375
5376 /* 5376 /*
5377 * schedule() is the main scheduler function. 5377 * schedule() is the main scheduler function.
5378 */ 5378 */
5379 asmlinkage void __sched schedule(void) 5379 asmlinkage void __sched schedule(void)
5380 { 5380 {
5381 struct task_struct *prev, *next; 5381 struct task_struct *prev, *next;
5382 unsigned long *switch_count; 5382 unsigned long *switch_count;
5383 struct rq *rq; 5383 struct rq *rq;
5384 int cpu; 5384 int cpu;
5385 5385
5386 need_resched: 5386 need_resched:
5387 preempt_disable(); 5387 preempt_disable();
5388 cpu = smp_processor_id(); 5388 cpu = smp_processor_id();
5389 rq = cpu_rq(cpu); 5389 rq = cpu_rq(cpu);
5390 rcu_sched_qs(cpu); 5390 rcu_sched_qs(cpu);
5391 prev = rq->curr; 5391 prev = rq->curr;
5392 switch_count = &prev->nivcsw; 5392 switch_count = &prev->nivcsw;
5393 5393
5394 release_kernel_lock(prev); 5394 release_kernel_lock(prev);
5395 need_resched_nonpreemptible: 5395 need_resched_nonpreemptible:
5396 5396
5397 schedule_debug(prev); 5397 schedule_debug(prev);
5398 5398
5399 if (sched_feat(HRTICK)) 5399 if (sched_feat(HRTICK))
5400 hrtick_clear(rq); 5400 hrtick_clear(rq);
5401 5401
5402 spin_lock_irq(&rq->lock); 5402 spin_lock_irq(&rq->lock);
5403 update_rq_clock(rq); 5403 update_rq_clock(rq);
5404 clear_tsk_need_resched(prev); 5404 clear_tsk_need_resched(prev);
5405 5405
5406 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 5406 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
5407 if (unlikely(signal_pending_state(prev->state, prev))) 5407 if (unlikely(signal_pending_state(prev->state, prev)))
5408 prev->state = TASK_RUNNING; 5408 prev->state = TASK_RUNNING;
5409 else 5409 else
5410 deactivate_task(rq, prev, 1); 5410 deactivate_task(rq, prev, 1);
5411 switch_count = &prev->nvcsw; 5411 switch_count = &prev->nvcsw;
5412 } 5412 }
5413 5413
5414 pre_schedule(rq, prev); 5414 pre_schedule(rq, prev);
5415 5415
5416 if (unlikely(!rq->nr_running)) 5416 if (unlikely(!rq->nr_running))
5417 idle_balance(cpu, rq); 5417 idle_balance(cpu, rq);
5418 5418
5419 put_prev_task(rq, prev); 5419 put_prev_task(rq, prev);
5420 next = pick_next_task(rq); 5420 next = pick_next_task(rq);
5421 5421
5422 if (likely(prev != next)) { 5422 if (likely(prev != next)) {
5423 sched_info_switch(prev, next); 5423 sched_info_switch(prev, next);
5424 perf_event_task_sched_out(prev, next, cpu); 5424 perf_event_task_sched_out(prev, next, cpu);
5425 5425
5426 rq->nr_switches++; 5426 rq->nr_switches++;
5427 rq->curr = next; 5427 rq->curr = next;
5428 ++*switch_count; 5428 ++*switch_count;
5429 5429
5430 context_switch(rq, prev, next); /* unlocks the rq */ 5430 context_switch(rq, prev, next); /* unlocks the rq */
5431 /* 5431 /*
5432 * the context switch might have flipped the stack from under 5432 * the context switch might have flipped the stack from under
5433 * us, hence refresh the local variables. 5433 * us, hence refresh the local variables.
5434 */ 5434 */
5435 cpu = smp_processor_id(); 5435 cpu = smp_processor_id();
5436 rq = cpu_rq(cpu); 5436 rq = cpu_rq(cpu);
5437 } else 5437 } else
5438 spin_unlock_irq(&rq->lock); 5438 spin_unlock_irq(&rq->lock);
5439 5439
5440 post_schedule(rq); 5440 post_schedule(rq);
5441 5441
5442 if (unlikely(reacquire_kernel_lock(current) < 0)) 5442 if (unlikely(reacquire_kernel_lock(current) < 0))
5443 goto need_resched_nonpreemptible; 5443 goto need_resched_nonpreemptible;
5444 5444
5445 preempt_enable_no_resched(); 5445 preempt_enable_no_resched();
5446 if (need_resched()) 5446 if (need_resched())
5447 goto need_resched; 5447 goto need_resched;
5448 } 5448 }
5449 EXPORT_SYMBOL(schedule); 5449 EXPORT_SYMBOL(schedule);
5450 5450
5451 #ifdef CONFIG_SMP 5451 #ifdef CONFIG_SMP
5452 /* 5452 /*
5453 * Look out! "owner" is an entirely speculative pointer 5453 * Look out! "owner" is an entirely speculative pointer
5454 * access and not reliable. 5454 * access and not reliable.
5455 */ 5455 */
5456 int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) 5456 int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
5457 { 5457 {
5458 unsigned int cpu; 5458 unsigned int cpu;
5459 struct rq *rq; 5459 struct rq *rq;
5460 5460
5461 if (!sched_feat(OWNER_SPIN)) 5461 if (!sched_feat(OWNER_SPIN))
5462 return 0; 5462 return 0;
5463 5463
5464 #ifdef CONFIG_DEBUG_PAGEALLOC 5464 #ifdef CONFIG_DEBUG_PAGEALLOC
5465 /* 5465 /*
5466 * Need to access the cpu field knowing that 5466 * Need to access the cpu field knowing that
5467 * DEBUG_PAGEALLOC could have unmapped it if 5467 * DEBUG_PAGEALLOC could have unmapped it if
5468 * the mutex owner just released it and exited. 5468 * the mutex owner just released it and exited.
5469 */ 5469 */
5470 if (probe_kernel_address(&owner->cpu, cpu)) 5470 if (probe_kernel_address(&owner->cpu, cpu))
5471 goto out; 5471 goto out;
5472 #else 5472 #else
5473 cpu = owner->cpu; 5473 cpu = owner->cpu;
5474 #endif 5474 #endif
5475 5475
5476 /* 5476 /*
5477 * Even if the access succeeded (likely case), 5477 * Even if the access succeeded (likely case),
5478 * the cpu field may no longer be valid. 5478 * the cpu field may no longer be valid.
5479 */ 5479 */
5480 if (cpu >= nr_cpumask_bits) 5480 if (cpu >= nr_cpumask_bits)
5481 goto out; 5481 goto out;
5482 5482
5483 /* 5483 /*
5484 * We need to validate that we can do a 5484 * We need to validate that we can do a
5485 * get_cpu() and that we have the percpu area. 5485 * get_cpu() and that we have the percpu area.
5486 */ 5486 */
5487 if (!cpu_online(cpu)) 5487 if (!cpu_online(cpu))
5488 goto out; 5488 goto out;
5489 5489
5490 rq = cpu_rq(cpu); 5490 rq = cpu_rq(cpu);
5491 5491
5492 for (;;) { 5492 for (;;) {
5493 /* 5493 /*
5494 * Owner changed, break to re-assess state. 5494 * Owner changed, break to re-assess state.
5495 */ 5495 */
5496 if (lock->owner != owner) 5496 if (lock->owner != owner)
5497 break; 5497 break;
5498 5498
5499 /* 5499 /*
5500 * Is that owner really running on that cpu? 5500 * Is that owner really running on that cpu?
5501 */ 5501 */
5502 if (task_thread_info(rq->curr) != owner || need_resched()) 5502 if (task_thread_info(rq->curr) != owner || need_resched())
5503 return 0; 5503 return 0;
5504 5504
5505 cpu_relax(); 5505 cpu_relax();
5506 } 5506 }
5507 out: 5507 out:
5508 return 1; 5508 return 1;
5509 } 5509 }
5510 #endif 5510 #endif
5511 5511
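mutex_spin_on_owner() is used by the mutex slowpath to decide whether optimistic spinning is still worthwhile: it keeps returning non-zero while the lock owner still appears to be running on its cpu, and returns 0 once the owner scheduled out or this cpu itself needs to reschedule. Schematically (this is not the real kernel/mutex.c slowpath; my_try_acquire() and my_block_on() are hypothetical placeholders):

/* Schematic caller, not the real mutex slowpath. */
for (;;) {
        struct thread_info *owner = ACCESS_ONCE(lock->owner);

        if (owner && !mutex_spin_on_owner(lock, owner))
                break;                  /* owner went to sleep: stop spinning */

        if (my_try_acquire(lock))       /* hypothetical acquisition attempt */
                return 0;

        cpu_relax();
}
return my_block_on(lock);               /* hypothetical: fall back to sleeping */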
5512 #ifdef CONFIG_PREEMPT 5512 #ifdef CONFIG_PREEMPT
5513 /* 5513 /*
5514 * this is the entry point to schedule() from in-kernel preemption 5514 * this is the entry point to schedule() from in-kernel preemption
5515 * off of preempt_enable. Kernel preemption off of the return-from-interrupt 5515 * off of preempt_enable. Kernel preemption off of the return-from-interrupt
5516 * path is handled by preempt_schedule_irq() below instead. 5516 * path is handled by preempt_schedule_irq() below instead.
5517 */ 5517 */
5518 asmlinkage void __sched preempt_schedule(void) 5518 asmlinkage void __sched preempt_schedule(void)
5519 { 5519 {
5520 struct thread_info *ti = current_thread_info(); 5520 struct thread_info *ti = current_thread_info();
5521 5521
5522 /* 5522 /*
5523 * If there is a non-zero preempt_count or interrupts are disabled, 5523 * If there is a non-zero preempt_count or interrupts are disabled,
5524 * we do not want to preempt the current task. Just return.. 5524 * we do not want to preempt the current task. Just return..
5525 */ 5525 */
5526 if (likely(ti->preempt_count || irqs_disabled())) 5526 if (likely(ti->preempt_count || irqs_disabled()))
5527 return; 5527 return;
5528 5528
5529 do { 5529 do {
5530 add_preempt_count(PREEMPT_ACTIVE); 5530 add_preempt_count(PREEMPT_ACTIVE);
5531 schedule(); 5531 schedule();
5532 sub_preempt_count(PREEMPT_ACTIVE); 5532 sub_preempt_count(PREEMPT_ACTIVE);
5533 5533
5534 /* 5534 /*
5535 * Check again in case we missed a preemption opportunity 5535 * Check again in case we missed a preemption opportunity
5536 * between schedule and now. 5536 * between schedule and now.
5537 */ 5537 */
5538 barrier(); 5538 barrier();
5539 } while (need_resched()); 5539 } while (need_resched());
5540 } 5540 }
5541 EXPORT_SYMBOL(preempt_schedule); 5541 EXPORT_SYMBOL(preempt_schedule);
5542 5542
5543 /* 5543 /*
5544 * this is the entry point to schedule() from kernel preemption 5544 * this is the entry point to schedule() from kernel preemption
5545 * off of irq context. 5545 * off of irq context.
5546 * Note that this is called and returns with irqs disabled. This will 5546 * Note that this is called and returns with irqs disabled. This will
5547 * protect us against recursive calling from irq. 5547 * protect us against recursive calling from irq.
5548 */ 5548 */
5549 asmlinkage void __sched preempt_schedule_irq(void) 5549 asmlinkage void __sched preempt_schedule_irq(void)
5550 { 5550 {
5551 struct thread_info *ti = current_thread_info(); 5551 struct thread_info *ti = current_thread_info();
5552 5552
5553 /* Catch callers which need to be fixed */ 5553 /* Catch callers which need to be fixed */
5554 BUG_ON(ti->preempt_count || !irqs_disabled()); 5554 BUG_ON(ti->preempt_count || !irqs_disabled());
5555 5555
5556 do { 5556 do {
5557 add_preempt_count(PREEMPT_ACTIVE); 5557 add_preempt_count(PREEMPT_ACTIVE);
5558 local_irq_enable(); 5558 local_irq_enable();
5559 schedule(); 5559 schedule();
5560 local_irq_disable(); 5560 local_irq_disable();
5561 sub_preempt_count(PREEMPT_ACTIVE); 5561 sub_preempt_count(PREEMPT_ACTIVE);
5562 5562
5563 /* 5563 /*
5564 * Check again in case we missed a preemption opportunity 5564 * Check again in case we missed a preemption opportunity
5565 * between schedule and now. 5565 * between schedule and now.
5566 */ 5566 */
5567 barrier(); 5567 barrier();
5568 } while (need_resched()); 5568 } while (need_resched());
5569 } 5569 }
5570 5570
5571 #endif /* CONFIG_PREEMPT */ 5571 #endif /* CONFIG_PREEMPT */
5572 5572
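For reference, preempt_schedule() above is reached from preempt_enable(): the macro first drops the count with preempt_enable_no_resched() and then, if TIF_NEED_RESCHED is set, calls preempt_schedule() via preempt_check_resched(). Roughly (paraphrasing include/linux/preempt.h, which is not part of this hunk):

/* Paraphrased, not the verbatim header text. */
#define preempt_enable()                                                \
do {                                                                    \
        preempt_enable_no_resched();                                    \
        barrier();                                                      \
        preempt_check_resched();  /* -> preempt_schedule() if needed */ \
} while (0)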
5573 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, 5573 int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
5574 void *key) 5574 void *key)
5575 { 5575 {
5576 return try_to_wake_up(curr->private, mode, wake_flags); 5576 return try_to_wake_up(curr->private, mode, wake_flags);
5577 } 5577 }
5578 EXPORT_SYMBOL(default_wake_function); 5578 EXPORT_SYMBOL(default_wake_function);
5579 5579
5580 /* 5580 /*
5581 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 5581 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
5582 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 5582 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
5583 * number) then we wake all the non-exclusive tasks and one exclusive task. 5583 * number) then we wake all the non-exclusive tasks and one exclusive task.
5584 * 5584 *
5585 * There are circumstances in which we can try to wake a task which has already 5585 * There are circumstances in which we can try to wake a task which has already
5586 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 5586 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
5587 * zero in this (rare) case, and we handle it by continuing to scan the queue. 5587 * zero in this (rare) case, and we handle it by continuing to scan the queue.
5588 */ 5588 */
5589 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 5589 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5590 int nr_exclusive, int wake_flags, void *key) 5590 int nr_exclusive, int wake_flags, void *key)
5591 { 5591 {
5592 wait_queue_t *curr, *next; 5592 wait_queue_t *curr, *next;
5593 5593
5594 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 5594 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
5595 unsigned flags = curr->flags; 5595 unsigned flags = curr->flags;
5596 5596
5597 if (curr->func(curr, mode, wake_flags, key) && 5597 if (curr->func(curr, mode, wake_flags, key) &&
5598 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 5598 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
5599 break; 5599 break;
5600 } 5600 }
5601 } 5601 }
5602 5602
5603 /** 5603 /**
5604 * __wake_up - wake up threads blocked on a waitqueue. 5604 * __wake_up - wake up threads blocked on a waitqueue.
5605 * @q: the waitqueue 5605 * @q: the waitqueue
5606 * @mode: which threads 5606 * @mode: which threads
5607 * @nr_exclusive: how many wake-one or wake-many threads to wake up 5607 * @nr_exclusive: how many wake-one or wake-many threads to wake up
5608 * @key: is directly passed to the wakeup function 5608 * @key: is directly passed to the wakeup function
5609 * 5609 *
5610 * It may be assumed that this function implies a write memory barrier before 5610 * It may be assumed that this function implies a write memory barrier before
5611 * changing the task state if and only if any tasks are woken up. 5611 * changing the task state if and only if any tasks are woken up.
5612 */ 5612 */
5613 void __wake_up(wait_queue_head_t *q, unsigned int mode, 5613 void __wake_up(wait_queue_head_t *q, unsigned int mode,
5614 int nr_exclusive, void *key) 5614 int nr_exclusive, void *key)
5615 { 5615 {
5616 unsigned long flags; 5616 unsigned long flags;
5617 5617
5618 spin_lock_irqsave(&q->lock, flags); 5618 spin_lock_irqsave(&q->lock, flags);
5619 __wake_up_common(q, mode, nr_exclusive, 0, key); 5619 __wake_up_common(q, mode, nr_exclusive, 0, key);
5620 spin_unlock_irqrestore(&q->lock, flags); 5620 spin_unlock_irqrestore(&q->lock, flags);
5621 } 5621 }
5622 EXPORT_SYMBOL(__wake_up); 5622 EXPORT_SYMBOL(__wake_up);
5623 5623
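__wake_up() is rarely called directly; most code reaches it through the wake_up*() wrappers paired with wait_event*(). A minimal producer/consumer sketch, assuming a kernel-module context (my_wq and my_data_ready are hypothetical):

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wq);  /* hypothetical */
static int my_data_ready;               /* hypothetical condition */

/* Consumer: sleeps until the condition becomes true. */
static int wait_for_data(void)
{
        return wait_event_interruptible(my_wq, my_data_ready);
}

/* Producer: make the condition true, then wake the sleepers. */
static void data_arrived(void)
{
        my_data_ready = 1;
        /* wake_up_interruptible() expands to __wake_up(q, TASK_INTERRUPTIBLE, 1, NULL) */
        wake_up_interruptible(&my_wq);
}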
5624 /* 5624 /*
5625 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 5625 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
5626 */ 5626 */
5627 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 5627 void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
5628 { 5628 {
5629 __wake_up_common(q, mode, 1, 0, NULL); 5629 __wake_up_common(q, mode, 1, 0, NULL);
5630 } 5630 }
5631 5631
5632 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 5632 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
5633 { 5633 {
5634 __wake_up_common(q, mode, 1, 0, key); 5634 __wake_up_common(q, mode, 1, 0, key);
5635 } 5635 }
5636 5636
5637 /** 5637 /**
5638 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 5638 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
5639 * @q: the waitqueue 5639 * @q: the waitqueue
5640 * @mode: which threads 5640 * @mode: which threads
5641 * @nr_exclusive: how many wake-one or wake-many threads to wake up 5641 * @nr_exclusive: how many wake-one or wake-many threads to wake up
5642 * @key: opaque value to be passed to wakeup targets 5642 * @key: opaque value to be passed to wakeup targets
5643 * 5643 *
5644 * The sync wakeup differs in that the waker knows that it will schedule 5644 * The sync wakeup differs in that the waker knows that it will schedule
5645 * away soon, so while the target thread will be woken up, it will not 5645 * away soon, so while the target thread will be woken up, it will not
5646 * be migrated to another CPU - ie. the two threads are 'synchronized' 5646 * be migrated to another CPU - ie. the two threads are 'synchronized'
5647 * with each other. This can prevent needless bouncing between CPUs. 5647 * with each other. This can prevent needless bouncing between CPUs.
5648 * 5648 *
5649 * On UP it can prevent extra preemption. 5649 * On UP it can prevent extra preemption.
5650 * 5650 *
5651 * It may be assumed that this function implies a write memory barrier before 5651 * It may be assumed that this function implies a write memory barrier before
5652 * changing the task state if and only if any tasks are woken up. 5652 * changing the task state if and only if any tasks are woken up.
5653 */ 5653 */
5654 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, 5654 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5655 int nr_exclusive, void *key) 5655 int nr_exclusive, void *key)
5656 { 5656 {
5657 unsigned long flags; 5657 unsigned long flags;
5658 int wake_flags = WF_SYNC; 5658 int wake_flags = WF_SYNC;
5659 5659
5660 if (unlikely(!q)) 5660 if (unlikely(!q))
5661 return; 5661 return;
5662 5662
5663 if (unlikely(!nr_exclusive)) 5663 if (unlikely(!nr_exclusive))
5664 wake_flags = 0; 5664 wake_flags = 0;
5665 5665
5666 spin_lock_irqsave(&q->lock, flags); 5666 spin_lock_irqsave(&q->lock, flags);
5667 __wake_up_common(q, mode, nr_exclusive, wake_flags, key); 5667 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
5668 spin_unlock_irqrestore(&q->lock, flags); 5668 spin_unlock_irqrestore(&q->lock, flags);
5669 } 5669 }
5670 EXPORT_SYMBOL_GPL(__wake_up_sync_key); 5670 EXPORT_SYMBOL_GPL(__wake_up_sync_key);
5671 5671
5672 /* 5672 /*
5673 * __wake_up_sync - see __wake_up_sync_key() 5673 * __wake_up_sync - see __wake_up_sync_key()
5674 */ 5674 */
5675 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 5675 void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
5676 { 5676 {
5677 __wake_up_sync_key(q, mode, nr_exclusive, NULL); 5677 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
5678 } 5678 }
5679 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 5679 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
5680 5680
5681 /** 5681 /**
5682 * complete: - signals a single thread waiting on this completion 5682 * complete: - signals a single thread waiting on this completion
5683 * @x: holds the state of this particular completion 5683 * @x: holds the state of this particular completion
5684 * 5684 *
5685 * This will wake up a single thread waiting on this completion. Threads will be 5685 * This will wake up a single thread waiting on this completion. Threads will be
5686 * awakened in the same order in which they were queued. 5686 * awakened in the same order in which they were queued.
5687 * 5687 *
5688 * See also complete_all(), wait_for_completion() and related routines. 5688 * See also complete_all(), wait_for_completion() and related routines.
5689 * 5689 *
5690 * It may be assumed that this function implies a write memory barrier before 5690 * It may be assumed that this function implies a write memory barrier before
5691 * changing the task state if and only if any tasks are woken up. 5691 * changing the task state if and only if any tasks are woken up.
5692 */ 5692 */
5693 void complete(struct completion *x) 5693 void complete(struct completion *x)
5694 { 5694 {
5695 unsigned long flags; 5695 unsigned long flags;
5696 5696
5697 spin_lock_irqsave(&x->wait.lock, flags); 5697 spin_lock_irqsave(&x->wait.lock, flags);
5698 x->done++; 5698 x->done++;
5699 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); 5699 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
5700 spin_unlock_irqrestore(&x->wait.lock, flags); 5700 spin_unlock_irqrestore(&x->wait.lock, flags);
5701 } 5701 }
5702 EXPORT_SYMBOL(complete); 5702 EXPORT_SYMBOL(complete);
5703 5703
5704 /** 5704 /**
5705 * complete_all: - signals all threads waiting on this completion 5705 * complete_all: - signals all threads waiting on this completion
5706 * @x: holds the state of this particular completion 5706 * @x: holds the state of this particular completion
5707 * 5707 *
5708 * This will wake up all threads waiting on this particular completion event. 5708 * This will wake up all threads waiting on this particular completion event.
5709 * 5709 *
5710 * It may be assumed that this function implies a write memory barrier before 5710 * It may be assumed that this function implies a write memory barrier before
5711 * changing the task state if and only if any tasks are woken up. 5711 * changing the task state if and only if any tasks are woken up.
5712 */ 5712 */
5713 void complete_all(struct completion *x) 5713 void complete_all(struct completion *x)
5714 { 5714 {
5715 unsigned long flags; 5715 unsigned long flags;
5716 5716
5717 spin_lock_irqsave(&x->wait.lock, flags); 5717 spin_lock_irqsave(&x->wait.lock, flags);
5718 x->done += UINT_MAX/2; 5718 x->done += UINT_MAX/2;
5719 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); 5719 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
5720 spin_unlock_irqrestore(&x->wait.lock, flags); 5720 spin_unlock_irqrestore(&x->wait.lock, flags);
5721 } 5721 }
5722 EXPORT_SYMBOL(complete_all); 5722 EXPORT_SYMBOL(complete_all);
5723 5723
5724 static inline long __sched 5724 static inline long __sched
5725 do_wait_for_common(struct completion *x, long timeout, int state) 5725 do_wait_for_common(struct completion *x, long timeout, int state)
5726 { 5726 {
5727 if (!x->done) { 5727 if (!x->done) {
5728 DECLARE_WAITQUEUE(wait, current); 5728 DECLARE_WAITQUEUE(wait, current);
5729 5729
5730 wait.flags |= WQ_FLAG_EXCLUSIVE; 5730 wait.flags |= WQ_FLAG_EXCLUSIVE;
5731 __add_wait_queue_tail(&x->wait, &wait); 5731 __add_wait_queue_tail(&x->wait, &wait);
5732 do { 5732 do {
5733 if (signal_pending_state(state, current)) { 5733 if (signal_pending_state(state, current)) {
5734 timeout = -ERESTARTSYS; 5734 timeout = -ERESTARTSYS;
5735 break; 5735 break;
5736 } 5736 }
5737 __set_current_state(state); 5737 __set_current_state(state);
5738 spin_unlock_irq(&x->wait.lock); 5738 spin_unlock_irq(&x->wait.lock);
5739 timeout = schedule_timeout(timeout); 5739 timeout = schedule_timeout(timeout);
5740 spin_lock_irq(&x->wait.lock); 5740 spin_lock_irq(&x->wait.lock);
5741 } while (!x->done && timeout); 5741 } while (!x->done && timeout);
5742 __remove_wait_queue(&x->wait, &wait); 5742 __remove_wait_queue(&x->wait, &wait);
5743 if (!x->done) 5743 if (!x->done)
5744 return timeout; 5744 return timeout;
5745 } 5745 }
5746 x->done--; 5746 x->done--;
5747 return timeout ?: 1; 5747 return timeout ?: 1;
5748 } 5748 }
5749 5749
5750 static long __sched 5750 static long __sched
5751 wait_for_common(struct completion *x, long timeout, int state) 5751 wait_for_common(struct completion *x, long timeout, int state)
5752 { 5752 {
5753 might_sleep(); 5753 might_sleep();
5754 5754
5755 spin_lock_irq(&x->wait.lock); 5755 spin_lock_irq(&x->wait.lock);
5756 timeout = do_wait_for_common(x, timeout, state); 5756 timeout = do_wait_for_common(x, timeout, state);
5757 spin_unlock_irq(&x->wait.lock); 5757 spin_unlock_irq(&x->wait.lock);
5758 return timeout; 5758 return timeout;
5759 } 5759 }
5760 5760
5761 /** 5761 /**
5762 * wait_for_completion: - waits for completion of a task 5762 * wait_for_completion: - waits for completion of a task
5763 * @x: holds the state of this particular completion 5763 * @x: holds the state of this particular completion
5764 * 5764 *
5765 * This waits to be signaled for completion of a specific task. It is NOT 5765 * This waits to be signaled for completion of a specific task. It is NOT
5766 * interruptible and there is no timeout. 5766 * interruptible and there is no timeout.
5767 * 5767 *
5768 * See also similar routines (e.g. wait_for_completion_timeout()) with timeout 5768 * See also similar routines (e.g. wait_for_completion_timeout()) with timeout
5769 * and interrupt capability. Also see complete(). 5769 * and interrupt capability. Also see complete().
5770 */ 5770 */
5771 void __sched wait_for_completion(struct completion *x) 5771 void __sched wait_for_completion(struct completion *x)
5772 { 5772 {
5773 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 5773 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
5774 } 5774 }
5775 EXPORT_SYMBOL(wait_for_completion); 5775 EXPORT_SYMBOL(wait_for_completion);
5776 5776
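Together, complete() and wait_for_completion() give the usual "wait for another context to finish something" pattern. A minimal sketch, assuming a kernel-module context (helper_thread and setup_done are hypothetical):

#include <linux/completion.h>
#include <linux/kthread.h>
#include <linux/err.h>

static DECLARE_COMPLETION(setup_done);          /* hypothetical */

static int helper_thread(void *unused)
{
        /* ... do the setup work ... */
        complete(&setup_done);                  /* wakes one waiter, FIFO order */
        return 0;
}

static int start_and_wait(void)
{
        struct task_struct *tsk = kthread_run(helper_thread, NULL, "helper");

        if (IS_ERR(tsk))
                return PTR_ERR(tsk);
        wait_for_completion(&setup_done);       /* uninterruptible, no timeout */
        return 0;
}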
5777 /** 5777 /**
5778 * wait_for_completion_timeout: - waits for completion of a task (w/timeout) 5778 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
5779 * @x: holds the state of this particular completion 5779 * @x: holds the state of this particular completion
5780 * @timeout: timeout value in jiffies 5780 * @timeout: timeout value in jiffies
5781 * 5781 *
5782 * This waits for either a completion of a specific task to be signaled or for a 5782 * This waits for either a completion of a specific task to be signaled or for a
5783 * specified timeout to expire. The timeout is in jiffies. It is not 5783 * specified timeout to expire. The timeout is in jiffies. It is not
5784 * interruptible. 5784 * interruptible.
5785 */ 5785 */
5786 unsigned long __sched 5786 unsigned long __sched
5787 wait_for_completion_timeout(struct completion *x, unsigned long timeout) 5787 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
5788 { 5788 {
5789 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); 5789 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
5790 } 5790 }
5791 EXPORT_SYMBOL(wait_for_completion_timeout); 5791 EXPORT_SYMBOL(wait_for_completion_timeout);
5792 5792
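The one subtlety in wait_for_completion_timeout() is its return value: 0 means the timeout expired first, while any non-zero value means the completion fired with that many jiffies of the budget left (never 0, per the 'timeout ?: 1' in do_wait_for_common()). A fragment continuing the hypothetical setup_done example above:

unsigned long left;

left = wait_for_completion_timeout(&setup_done, msecs_to_jiffies(100));
if (!left)
        return -ETIMEDOUT;      /* timed out */
/* completed, with 'left' jiffies of the 100ms budget remaining */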
5793 /** 5793 /**
5794 * wait_for_completion_interruptible: - waits for completion of a task (w/intr) 5794 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
5795 * @x: holds the state of this particular completion 5795 * @x: holds the state of this particular completion
5796 * 5796 *
5797 * This waits for completion of a specific task to be signaled. It is 5797 * This waits for completion of a specific task to be signaled. It is
5798 * interruptible. 5798 * interruptible.
5799 */ 5799 */
5800 int __sched wait_for_completion_interruptible(struct completion *x) 5800 int __sched wait_for_completion_interruptible(struct completion *x)
5801 { 5801 {
5802 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 5802 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
5803 if (t == -ERESTARTSYS) 5803 if (t == -ERESTARTSYS)
5804 return t; 5804 return t;
5805 return 0; 5805 return 0;
5806 } 5806 }
5807 EXPORT_SYMBOL(wait_for_completion_interruptible); 5807 EXPORT_SYMBOL(wait_for_completion_interruptible);
5808 5808
5809 /** 5809 /**
5810 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) 5810 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
5811 * @x: holds the state of this particular completion 5811 * @x: holds the state of this particular completion
5812 * @timeout: timeout value in jiffies 5812 * @timeout: timeout value in jiffies
5813 * 5813 *
5814 * This waits for either a completion of a specific task to be signaled or for a 5814 * This waits for either a completion of a specific task to be signaled or for a
5815 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 5815 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
5816 */ 5816 */
5817 unsigned long __sched 5817 unsigned long __sched
5818 wait_for_completion_interruptible_timeout(struct completion *x, 5818 wait_for_completion_interruptible_timeout(struct completion *x,
5819 unsigned long timeout) 5819 unsigned long timeout)
5820 { 5820 {
5821 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); 5821 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
5822 } 5822 }
5823 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 5823 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
5824 5824
5825 /** 5825 /**
5826 * wait_for_completion_killable: - waits for completion of a task (killable) 5826 * wait_for_completion_killable: - waits for completion of a task (killable)
5827 * @x: holds the state of this particular completion 5827 * @x: holds the state of this particular completion
5828 * 5828 *
5829 * This waits to be signaled for completion of a specific task. It can be 5829 * This waits to be signaled for completion of a specific task. It can be
5830 * interrupted by a kill signal. 5830 * interrupted by a kill signal.
5831 */ 5831 */
5832 int __sched wait_for_completion_killable(struct completion *x) 5832 int __sched wait_for_completion_killable(struct completion *x)
5833 { 5833 {
5834 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 5834 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
5835 if (t == -ERESTARTSYS) 5835 if (t == -ERESTARTSYS)
5836 return t; 5836 return t;
5837 return 0; 5837 return 0;
5838 } 5838 }
5839 EXPORT_SYMBOL(wait_for_completion_killable); 5839 EXPORT_SYMBOL(wait_for_completion_killable);
5840 5840
5841 /** 5841 /**
5842 * try_wait_for_completion - try to decrement a completion without blocking 5842 * try_wait_for_completion - try to decrement a completion without blocking
5843 * @x: completion structure 5843 * @x: completion structure
5844 * 5844 *
5845 * Returns: 0 if a decrement cannot be done without blocking 5845 * Returns: 0 if a decrement cannot be done without blocking
5846 * 1 if a decrement succeeded. 5846 * 1 if a decrement succeeded.
5847 * 5847 *
5848 * If a completion is being used as a counting completion, 5848 * If a completion is being used as a counting completion,
5849 * attempt to decrement the counter without blocking. This 5849 * attempt to decrement the counter without blocking. This
5850 * enables us to avoid waiting if the resource the completion 5850 * enables us to avoid waiting if the resource the completion
5851 * is protecting is not available. 5851 * is protecting is not available.
5852 */ 5852 */
5853 bool try_wait_for_completion(struct completion *x) 5853 bool try_wait_for_completion(struct completion *x)
5854 { 5854 {
5855 int ret = 1; 5855 int ret = 1;
5856 5856
5857 spin_lock_irq(&x->wait.lock); 5857 spin_lock_irq(&x->wait.lock);
5858 if (!x->done) 5858 if (!x->done)
5859 ret = 0; 5859 ret = 0;
5860 else 5860 else
5861 x->done--; 5861 x->done--;
5862 spin_unlock_irq(&x->wait.lock); 5862 spin_unlock_irq(&x->wait.lock);
5863 return ret; 5863 return ret;
5864 } 5864 }
5865 EXPORT_SYMBOL(try_wait_for_completion); 5865 EXPORT_SYMBOL(try_wait_for_completion);
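A hedged usage sketch for the non-blocking variant (illustrative; 'slots' is a hypothetical counting completion that a producer complete()s once per free resource):

/* Illustrative only; not part of this commit. */
static void example_claim_slot(struct completion *slots)
{
        if (try_wait_for_completion(slots))
                return;                 /* consumed one count without sleeping */
        /* nothing available yet; fall back to the blocking wait above */
        wait_for_completion(slots);
}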
5866 5866
5867 /** 5867 /**
5868 * completion_done - Test to see if a completion has any waiters 5868 * completion_done - Test to see if a completion has any waiters
5869 * @x: completion structure 5869 * @x: completion structure
5870 * 5870 *
5871 * Returns: 0 if there are waiters (wait_for_completion() in progress) 5871 * Returns: 0 if there are waiters (wait_for_completion() in progress)
5872 * 1 if there are no waiters. 5872 * 1 if there are no waiters.
5873 * 5873 *
5874 */ 5874 */
5875 bool completion_done(struct completion *x) 5875 bool completion_done(struct completion *x)
5876 { 5876 {
5877 int ret = 1; 5877 int ret = 1;
5878 5878
5879 spin_lock_irq(&x->wait.lock); 5879 spin_lock_irq(&x->wait.lock);
5880 if (!x->done) 5880 if (!x->done)
5881 ret = 0; 5881 ret = 0;
5882 spin_unlock_irq(&x->wait.lock); 5882 spin_unlock_irq(&x->wait.lock);
5883 return ret; 5883 return ret;
5884 } 5884 }
5885 EXPORT_SYMBOL(completion_done); 5885 EXPORT_SYMBOL(completion_done);
5886 5886
5887 static long __sched 5887 static long __sched
5888 sleep_on_common(wait_queue_head_t *q, int state, long timeout) 5888 sleep_on_common(wait_queue_head_t *q, int state, long timeout)
5889 { 5889 {
5890 unsigned long flags; 5890 unsigned long flags;
5891 wait_queue_t wait; 5891 wait_queue_t wait;
5892 5892
5893 init_waitqueue_entry(&wait, current); 5893 init_waitqueue_entry(&wait, current);
5894 5894
5895 __set_current_state(state); 5895 __set_current_state(state);
5896 5896
5897 spin_lock_irqsave(&q->lock, flags); 5897 spin_lock_irqsave(&q->lock, flags);
5898 __add_wait_queue(q, &wait); 5898 __add_wait_queue(q, &wait);
5899 spin_unlock(&q->lock); 5899 spin_unlock(&q->lock);
5900 timeout = schedule_timeout(timeout); 5900 timeout = schedule_timeout(timeout);
5901 spin_lock_irq(&q->lock); 5901 spin_lock_irq(&q->lock);
5902 __remove_wait_queue(q, &wait); 5902 __remove_wait_queue(q, &wait);
5903 spin_unlock_irqrestore(&q->lock, flags); 5903 spin_unlock_irqrestore(&q->lock, flags);
5904 5904
5905 return timeout; 5905 return timeout;
5906 } 5906 }
5907 5907
5908 void __sched interruptible_sleep_on(wait_queue_head_t *q) 5908 void __sched interruptible_sleep_on(wait_queue_head_t *q)
5909 { 5909 {
5910 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 5910 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
5911 } 5911 }
5912 EXPORT_SYMBOL(interruptible_sleep_on); 5912 EXPORT_SYMBOL(interruptible_sleep_on);
5913 5913
5914 long __sched 5914 long __sched
5915 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 5915 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
5916 { 5916 {
5917 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout); 5917 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
5918 } 5918 }
5919 EXPORT_SYMBOL(interruptible_sleep_on_timeout); 5919 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
5920 5920
5921 void __sched sleep_on(wait_queue_head_t *q) 5921 void __sched sleep_on(wait_queue_head_t *q)
5922 { 5922 {
5923 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 5923 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
5924 } 5924 }
5925 EXPORT_SYMBOL(sleep_on); 5925 EXPORT_SYMBOL(sleep_on);
5926 5926
5927 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 5927 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
5928 { 5928 {
5929 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout); 5929 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
5930 } 5930 }
5931 EXPORT_SYMBOL(sleep_on_timeout); 5931 EXPORT_SYMBOL(sleep_on_timeout);
5932 5932
5933 #ifdef CONFIG_RT_MUTEXES 5933 #ifdef CONFIG_RT_MUTEXES
5934 5934
5935 /* 5935 /*
5936 * rt_mutex_setprio - set the current priority of a task 5936 * rt_mutex_setprio - set the current priority of a task
5937 * @p: task 5937 * @p: task
5938 * @prio: prio value (kernel-internal form) 5938 * @prio: prio value (kernel-internal form)
5939 * 5939 *
5940 * This function changes the 'effective' priority of a task. It does 5940 * This function changes the 'effective' priority of a task. It does
5941 * not touch ->normal_prio like __setscheduler(). 5941 * not touch ->normal_prio like __setscheduler().
5942 * 5942 *
5943 * Used by the rt_mutex code to implement priority inheritance logic. 5943 * Used by the rt_mutex code to implement priority inheritance logic.
5944 */ 5944 */
5945 void rt_mutex_setprio(struct task_struct *p, int prio) 5945 void rt_mutex_setprio(struct task_struct *p, int prio)
5946 { 5946 {
5947 unsigned long flags; 5947 unsigned long flags;
5948 int oldprio, on_rq, running; 5948 int oldprio, on_rq, running;
5949 struct rq *rq; 5949 struct rq *rq;
5950 const struct sched_class *prev_class = p->sched_class; 5950 const struct sched_class *prev_class = p->sched_class;
5951 5951
5952 BUG_ON(prio < 0 || prio > MAX_PRIO); 5952 BUG_ON(prio < 0 || prio > MAX_PRIO);
5953 5953
5954 rq = task_rq_lock(p, &flags); 5954 rq = task_rq_lock(p, &flags);
5955 update_rq_clock(rq); 5955 update_rq_clock(rq);
5956 5956
5957 oldprio = p->prio; 5957 oldprio = p->prio;
5958 on_rq = p->se.on_rq; 5958 on_rq = p->se.on_rq;
5959 running = task_current(rq, p); 5959 running = task_current(rq, p);
5960 if (on_rq) 5960 if (on_rq)
5961 dequeue_task(rq, p, 0); 5961 dequeue_task(rq, p, 0);
5962 if (running) 5962 if (running)
5963 p->sched_class->put_prev_task(rq, p); 5963 p->sched_class->put_prev_task(rq, p);
5964 5964
5965 if (rt_prio(prio)) 5965 if (rt_prio(prio))
5966 p->sched_class = &rt_sched_class; 5966 p->sched_class = &rt_sched_class;
5967 else 5967 else
5968 p->sched_class = &fair_sched_class; 5968 p->sched_class = &fair_sched_class;
5969 5969
5970 p->prio = prio; 5970 p->prio = prio;
5971 5971
5972 if (running) 5972 if (running)
5973 p->sched_class->set_curr_task(rq); 5973 p->sched_class->set_curr_task(rq);
5974 if (on_rq) { 5974 if (on_rq) {
5975 enqueue_task(rq, p, 0); 5975 enqueue_task(rq, p, 0);
5976 5976
5977 check_class_changed(rq, p, prev_class, oldprio, running); 5977 check_class_changed(rq, p, prev_class, oldprio, running);
5978 } 5978 }
5979 task_rq_unlock(rq, &flags); 5979 task_rq_unlock(rq, &flags);
5980 } 5980 }
5981 5981
5982 #endif 5982 #endif
5983 5983
5984 void set_user_nice(struct task_struct *p, long nice) 5984 void set_user_nice(struct task_struct *p, long nice)
5985 { 5985 {
5986 int old_prio, delta, on_rq; 5986 int old_prio, delta, on_rq;
5987 unsigned long flags; 5987 unsigned long flags;
5988 struct rq *rq; 5988 struct rq *rq;
5989 5989
5990 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 5990 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
5991 return; 5991 return;
5992 /* 5992 /*
5993 * We have to be careful, if called from sys_setpriority(), 5993 * We have to be careful, if called from sys_setpriority(),
5994 * the task might be in the middle of scheduling on another CPU. 5994 * the task might be in the middle of scheduling on another CPU.
5995 */ 5995 */
5996 rq = task_rq_lock(p, &flags); 5996 rq = task_rq_lock(p, &flags);
5997 update_rq_clock(rq); 5997 update_rq_clock(rq);
5998 /* 5998 /*
5999 * The RT priorities are set via sched_setscheduler(), but we still 5999 * The RT priorities are set via sched_setscheduler(), but we still
6000 * allow the 'normal' nice value to be set - but as expected 6000 * allow the 'normal' nice value to be set - but as expected
6001 * it won't have any effect on scheduling until the task is 6001 * it won't have any effect on scheduling until the task is

6002 * SCHED_FIFO/SCHED_RR: 6002 * SCHED_FIFO/SCHED_RR:
6003 */ 6003 */
6004 if (task_has_rt_policy(p)) { 6004 if (task_has_rt_policy(p)) {
6005 p->static_prio = NICE_TO_PRIO(nice); 6005 p->static_prio = NICE_TO_PRIO(nice);
6006 goto out_unlock; 6006 goto out_unlock;
6007 } 6007 }
6008 on_rq = p->se.on_rq; 6008 on_rq = p->se.on_rq;
6009 if (on_rq) 6009 if (on_rq)
6010 dequeue_task(rq, p, 0); 6010 dequeue_task(rq, p, 0);
6011 6011
6012 p->static_prio = NICE_TO_PRIO(nice); 6012 p->static_prio = NICE_TO_PRIO(nice);
6013 set_load_weight(p); 6013 set_load_weight(p);
6014 old_prio = p->prio; 6014 old_prio = p->prio;
6015 p->prio = effective_prio(p); 6015 p->prio = effective_prio(p);
6016 delta = p->prio - old_prio; 6016 delta = p->prio - old_prio;
6017 6017
6018 if (on_rq) { 6018 if (on_rq) {
6019 enqueue_task(rq, p, 0); 6019 enqueue_task(rq, p, 0);
6020 /* 6020 /*
6021 * If the task increased its priority or is running and 6021 * If the task increased its priority or is running and
6022 * lowered its priority, then reschedule its CPU: 6022 * lowered its priority, then reschedule its CPU:
6023 */ 6023 */
6024 if (delta < 0 || (delta > 0 && task_running(rq, p))) 6024 if (delta < 0 || (delta > 0 && task_running(rq, p)))
6025 resched_task(rq->curr); 6025 resched_task(rq->curr);
6026 } 6026 }
6027 out_unlock: 6027 out_unlock:
6028 task_rq_unlock(rq, &flags); 6028 task_rq_unlock(rq, &flags);
6029 } 6029 }
6030 EXPORT_SYMBOL(set_user_nice); 6030 EXPORT_SYMBOL(set_user_nice);
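For context, a sketch of the typical in-kernel caller (illustrative; the helper name and the value -5 are arbitrary examples within the [-20, 19] range validated above):

/* Illustrative only; not part of this commit. */
static void example_renice_kthread(struct task_struct *tsk)
{
        /* for RT-policy tasks only static_prio is updated, see above */
        set_user_nice(tsk, -5);
}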
6031 6031
6032 /* 6032 /*
6033 * can_nice - check if a task can reduce its nice value 6033 * can_nice - check if a task can reduce its nice value
6034 * @p: task 6034 * @p: task
6035 * @nice: nice value 6035 * @nice: nice value
6036 */ 6036 */
6037 int can_nice(const struct task_struct *p, const int nice) 6037 int can_nice(const struct task_struct *p, const int nice)
6038 { 6038 {
6039 /* convert nice value [19,-20] to rlimit style value [1,40] */ 6039 /* convert nice value [19,-20] to rlimit style value [1,40] */
6040 int nice_rlim = 20 - nice; 6040 int nice_rlim = 20 - nice;
6041 6041
6042 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 6042 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
6043 capable(CAP_SYS_NICE)); 6043 capable(CAP_SYS_NICE));
6044 } 6044 }
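A worked example of the conversion above may help review (illustrative arithmetic only): with nice_rlim = 20 - nice, nice 19 maps to 1, nice 0 to 20 and nice -20 to 40, so an RLIMIT_NICE of 40 (or CAP_SYS_NICE) permits any nice value.

/*
 * Illustrative mapping, not part of this commit:
 *   nice  19  ->  nice_rlim  1
 *   nice   0  ->  nice_rlim 20
 *   nice -20  ->  nice_rlim 40   (needs RLIMIT_NICE >= 40 or CAP_SYS_NICE)
 */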
6045 6045
6046 #ifdef __ARCH_WANT_SYS_NICE 6046 #ifdef __ARCH_WANT_SYS_NICE
6047 6047
6048 /* 6048 /*
6049 * sys_nice - change the priority of the current process. 6049 * sys_nice - change the priority of the current process.
6050 * @increment: priority increment 6050 * @increment: priority increment
6051 * 6051 *
6052 * sys_setpriority is a more generic, but much slower function that 6052 * sys_setpriority is a more generic, but much slower function that
6053 * does similar things. 6053 * does similar things.
6054 */ 6054 */
6055 SYSCALL_DEFINE1(nice, int, increment) 6055 SYSCALL_DEFINE1(nice, int, increment)
6056 { 6056 {
6057 long nice, retval; 6057 long nice, retval;
6058 6058
6059 /* 6059 /*
6060 * Setpriority might change our priority at the same moment. 6060 * Setpriority might change our priority at the same moment.
6061 * We don't have to worry. Conceptually one call occurs first 6061 * We don't have to worry. Conceptually one call occurs first
6062 * and we have a single winner. 6062 * and we have a single winner.
6063 */ 6063 */
6064 if (increment < -40) 6064 if (increment < -40)
6065 increment = -40; 6065 increment = -40;
6066 if (increment > 40) 6066 if (increment > 40)
6067 increment = 40; 6067 increment = 40;
6068 6068
6069 nice = TASK_NICE(current) + increment; 6069 nice = TASK_NICE(current) + increment;
6070 if (nice < -20) 6070 if (nice < -20)
6071 nice = -20; 6071 nice = -20;
6072 if (nice > 19) 6072 if (nice > 19)
6073 nice = 19; 6073 nice = 19;
6074 6074
6075 if (increment < 0 && !can_nice(current, nice)) 6075 if (increment < 0 && !can_nice(current, nice))
6076 return -EPERM; 6076 return -EPERM;
6077 6077
6078 retval = security_task_setnice(current, nice); 6078 retval = security_task_setnice(current, nice);
6079 if (retval) 6079 if (retval)
6080 return retval; 6080 return retval;
6081 6081
6082 set_user_nice(current, nice); 6082 set_user_nice(current, nice);
6083 return 0; 6083 return 0;
6084 } 6084 }
6085 6085
6086 #endif 6086 #endif
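For review, a worked example of the clamping in sys_nice() above (illustrative numbers): with a current nice of 10 and an increment of -40, the increment is first clamped to -40, the resulting nice of -30 is clamped to -20, and because the increment is negative the call must still pass can_nice() and security_task_setnice() before set_user_nice() runs.

/*
 * Illustrative only: nice(-40) from a task currently at nice 10
 *   increment -> clamped to -40
 *   nice      -> 10 + (-40) = -30 -> clamped to -20
 *   negative increment -> requires can_nice(current, -20), i.e.
 *   RLIMIT_NICE >= 40 or CAP_SYS_NICE.
 */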
6087 6087
6088 /** 6088 /**
6089 * task_prio - return the priority value of a given task. 6089 * task_prio - return the priority value of a given task.
6090 * @p: the task in question. 6090 * @p: the task in question.
6091 * 6091 *
6092 * This is the priority value as seen by users in /proc. 6092 * This is the priority value as seen by users in /proc.
6093 * RT tasks are offset by -200. Normal tasks are centered 6093 * RT tasks are offset by -200. Normal tasks are centered
6094 * around 0, value goes from -16 to +15. 6094 * around 0, value goes from -16 to +15.
6095 */ 6095 */
6096 int task_prio(const struct task_struct *p) 6096 int task_prio(const struct task_struct *p)
6097 { 6097 {
6098 return p->prio - MAX_RT_PRIO; 6098 return p->prio - MAX_RT_PRIO;
6099 } 6099 }
6100 6100
6101 /** 6101 /**
6102 * task_nice - return the nice value of a given task. 6102 * task_nice - return the nice value of a given task.
6103 * @p: the task in question. 6103 * @p: the task in question.
6104 */ 6104 */
6105 int task_nice(const struct task_struct *p) 6105 int task_nice(const struct task_struct *p)
6106 { 6106 {
6107 return TASK_NICE(p); 6107 return TASK_NICE(p);
6108 } 6108 }
6109 EXPORT_SYMBOL(task_nice); 6109 EXPORT_SYMBOL(task_nice);
6110 6110
6111 /** 6111 /**
6112 * idle_cpu - is a given cpu idle currently? 6112 * idle_cpu - is a given cpu idle currently?
6113 * @cpu: the processor in question. 6113 * @cpu: the processor in question.
6114 */ 6114 */
6115 int idle_cpu(int cpu) 6115 int idle_cpu(int cpu)
6116 { 6116 {
6117 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 6117 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
6118 } 6118 }
6119 6119
6120 /** 6120 /**
6121 * idle_task - return the idle task for a given cpu. 6121 * idle_task - return the idle task for a given cpu.
6122 * @cpu: the processor in question. 6122 * @cpu: the processor in question.
6123 */ 6123 */
6124 struct task_struct *idle_task(int cpu) 6124 struct task_struct *idle_task(int cpu)
6125 { 6125 {
6126 return cpu_rq(cpu)->idle; 6126 return cpu_rq(cpu)->idle;
6127 } 6127 }
6128 6128
6129 /** 6129 /**
6130 * find_process_by_pid - find a process with a matching PID value. 6130 * find_process_by_pid - find a process with a matching PID value.
6131 * @pid: the pid in question. 6131 * @pid: the pid in question.
6132 */ 6132 */
6133 static struct task_struct *find_process_by_pid(pid_t pid) 6133 static struct task_struct *find_process_by_pid(pid_t pid)
6134 { 6134 {
6135 return pid ? find_task_by_vpid(pid) : current; 6135 return pid ? find_task_by_vpid(pid) : current;
6136 } 6136 }
6137 6137
6138 /* Actually do priority change: must hold rq lock. */ 6138 /* Actually do priority change: must hold rq lock. */
6139 static void 6139 static void
6140 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 6140 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
6141 { 6141 {
6142 BUG_ON(p->se.on_rq); 6142 BUG_ON(p->se.on_rq);
6143 6143
6144 p->policy = policy; 6144 p->policy = policy;
6145 switch (p->policy) { 6145 switch (p->policy) {
6146 case SCHED_NORMAL: 6146 case SCHED_NORMAL:
6147 case SCHED_BATCH: 6147 case SCHED_BATCH:
6148 case SCHED_IDLE: 6148 case SCHED_IDLE:
6149 p->sched_class = &fair_sched_class; 6149 p->sched_class = &fair_sched_class;
6150 break; 6150 break;
6151 case SCHED_FIFO: 6151 case SCHED_FIFO:
6152 case SCHED_RR: 6152 case SCHED_RR:
6153 p->sched_class = &rt_sched_class; 6153 p->sched_class = &rt_sched_class;
6154 break; 6154 break;
6155 } 6155 }
6156 6156
6157 p->rt_priority = prio; 6157 p->rt_priority = prio;
6158 p->normal_prio = normal_prio(p); 6158 p->normal_prio = normal_prio(p);
6159 /* we are holding p->pi_lock already */ 6159 /* we are holding p->pi_lock already */
6160 p->prio = rt_mutex_getprio(p); 6160 p->prio = rt_mutex_getprio(p);
6161 set_load_weight(p); 6161 set_load_weight(p);
6162 } 6162 }
6163 6163
6164 /* 6164 /*
6165 * check the target process has a UID that matches the current process's 6165 * check the target process has a UID that matches the current process's
6166 */ 6166 */
6167 static bool check_same_owner(struct task_struct *p) 6167 static bool check_same_owner(struct task_struct *p)
6168 { 6168 {
6169 const struct cred *cred = current_cred(), *pcred; 6169 const struct cred *cred = current_cred(), *pcred;
6170 bool match; 6170 bool match;
6171 6171
6172 rcu_read_lock(); 6172 rcu_read_lock();
6173 pcred = __task_cred(p); 6173 pcred = __task_cred(p);
6174 match = (cred->euid == pcred->euid || 6174 match = (cred->euid == pcred->euid ||
6175 cred->euid == pcred->uid); 6175 cred->euid == pcred->uid);
6176 rcu_read_unlock(); 6176 rcu_read_unlock();
6177 return match; 6177 return match;
6178 } 6178 }
6179 6179
6180 static int __sched_setscheduler(struct task_struct *p, int policy, 6180 static int __sched_setscheduler(struct task_struct *p, int policy,
6181 struct sched_param *param, bool user) 6181 struct sched_param *param, bool user)
6182 { 6182 {
6183 int retval, oldprio, oldpolicy = -1, on_rq, running; 6183 int retval, oldprio, oldpolicy = -1, on_rq, running;
6184 unsigned long flags; 6184 unsigned long flags;
6185 const struct sched_class *prev_class = p->sched_class; 6185 const struct sched_class *prev_class = p->sched_class;
6186 struct rq *rq; 6186 struct rq *rq;
6187 int reset_on_fork; 6187 int reset_on_fork;
6188 6188
6189 /* may grab non-irq protected spin_locks */ 6189 /* may grab non-irq protected spin_locks */
6190 BUG_ON(in_interrupt()); 6190 BUG_ON(in_interrupt());
6191 recheck: 6191 recheck:
6192 /* double check policy once rq lock held */ 6192 /* double check policy once rq lock held */
6193 if (policy < 0) { 6193 if (policy < 0) {
6194 reset_on_fork = p->sched_reset_on_fork; 6194 reset_on_fork = p->sched_reset_on_fork;
6195 policy = oldpolicy = p->policy; 6195 policy = oldpolicy = p->policy;
6196 } else { 6196 } else {
6197 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); 6197 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
6198 policy &= ~SCHED_RESET_ON_FORK; 6198 policy &= ~SCHED_RESET_ON_FORK;
6199 6199
6200 if (policy != SCHED_FIFO && policy != SCHED_RR && 6200 if (policy != SCHED_FIFO && policy != SCHED_RR &&
6201 policy != SCHED_NORMAL && policy != SCHED_BATCH && 6201 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
6202 policy != SCHED_IDLE) 6202 policy != SCHED_IDLE)
6203 return -EINVAL; 6203 return -EINVAL;
6204 } 6204 }
6205 6205
6206 /* 6206 /*
6207 * Valid priorities for SCHED_FIFO and SCHED_RR are 6207 * Valid priorities for SCHED_FIFO and SCHED_RR are
6208 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 6208 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
6209 * SCHED_BATCH and SCHED_IDLE is 0. 6209 * SCHED_BATCH and SCHED_IDLE is 0.
6210 */ 6210 */
6211 if (param->sched_priority < 0 || 6211 if (param->sched_priority < 0 ||
6212 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 6212 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
6213 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 6213 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
6214 return -EINVAL; 6214 return -EINVAL;
6215 if (rt_policy(policy) != (param->sched_priority != 0)) 6215 if (rt_policy(policy) != (param->sched_priority != 0))
6216 return -EINVAL; 6216 return -EINVAL;
6217 6217
6218 /* 6218 /*
6219 * Allow unprivileged RT tasks to decrease priority: 6219 * Allow unprivileged RT tasks to decrease priority:
6220 */ 6220 */
6221 if (user && !capable(CAP_SYS_NICE)) { 6221 if (user && !capable(CAP_SYS_NICE)) {
6222 if (rt_policy(policy)) { 6222 if (rt_policy(policy)) {
6223 unsigned long rlim_rtprio; 6223 unsigned long rlim_rtprio;
6224 6224
6225 if (!lock_task_sighand(p, &flags)) 6225 if (!lock_task_sighand(p, &flags))
6226 return -ESRCH; 6226 return -ESRCH;
6227 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; 6227 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
6228 unlock_task_sighand(p, &flags); 6228 unlock_task_sighand(p, &flags);
6229 6229
6230 /* can't set/change the rt policy */ 6230 /* can't set/change the rt policy */
6231 if (policy != p->policy && !rlim_rtprio) 6231 if (policy != p->policy && !rlim_rtprio)
6232 return -EPERM; 6232 return -EPERM;
6233 6233
6234 /* can't increase priority */ 6234 /* can't increase priority */
6235 if (param->sched_priority > p->rt_priority && 6235 if (param->sched_priority > p->rt_priority &&
6236 param->sched_priority > rlim_rtprio) 6236 param->sched_priority > rlim_rtprio)
6237 return -EPERM; 6237 return -EPERM;
6238 } 6238 }
6239 /* 6239 /*
6240 * Like positive nice levels, don't allow tasks to 6240 * Like positive nice levels, don't allow tasks to
6241 * move out of SCHED_IDLE either: 6241 * move out of SCHED_IDLE either:
6242 */ 6242 */
6243 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) 6243 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
6244 return -EPERM; 6244 return -EPERM;
6245 6245
6246 /* can't change other user's priorities */ 6246 /* can't change other user's priorities */
6247 if (!check_same_owner(p)) 6247 if (!check_same_owner(p))
6248 return -EPERM; 6248 return -EPERM;
6249 6249
6250 /* Normal users shall not reset the sched_reset_on_fork flag */ 6250 /* Normal users shall not reset the sched_reset_on_fork flag */
6251 if (p->sched_reset_on_fork && !reset_on_fork) 6251 if (p->sched_reset_on_fork && !reset_on_fork)
6252 return -EPERM; 6252 return -EPERM;
6253 } 6253 }
6254 6254
6255 if (user) { 6255 if (user) {
6256 #ifdef CONFIG_RT_GROUP_SCHED 6256 #ifdef CONFIG_RT_GROUP_SCHED
6257 /* 6257 /*
6258 * Do not allow realtime tasks into groups that have no runtime 6258 * Do not allow realtime tasks into groups that have no runtime
6259 * assigned. 6259 * assigned.
6260 */ 6260 */
6261 if (rt_bandwidth_enabled() && rt_policy(policy) && 6261 if (rt_bandwidth_enabled() && rt_policy(policy) &&
6262 task_group(p)->rt_bandwidth.rt_runtime == 0) 6262 task_group(p)->rt_bandwidth.rt_runtime == 0)
6263 return -EPERM; 6263 return -EPERM;
6264 #endif 6264 #endif
6265 6265
6266 retval = security_task_setscheduler(p, policy, param); 6266 retval = security_task_setscheduler(p, policy, param);
6267 if (retval) 6267 if (retval)
6268 return retval; 6268 return retval;
6269 } 6269 }
6270 6270
6271 /* 6271 /*
6272 * make sure no PI-waiters arrive (or leave) while we are 6272 * make sure no PI-waiters arrive (or leave) while we are
6273 * changing the priority of the task: 6273 * changing the priority of the task:
6274 */ 6274 */
6275 spin_lock_irqsave(&p->pi_lock, flags); 6275 spin_lock_irqsave(&p->pi_lock, flags);
6276 /* 6276 /*
6277 * To be able to change p->policy safely, the appropriate 6277 * To be able to change p->policy safely, the appropriate
6278 * runqueue lock must be held. 6278 * runqueue lock must be held.
6279 */ 6279 */
6280 rq = __task_rq_lock(p); 6280 rq = __task_rq_lock(p);
6281 /* recheck policy now with rq lock held */ 6281 /* recheck policy now with rq lock held */
6282 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 6282 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
6283 policy = oldpolicy = -1; 6283 policy = oldpolicy = -1;
6284 __task_rq_unlock(rq); 6284 __task_rq_unlock(rq);
6285 spin_unlock_irqrestore(&p->pi_lock, flags); 6285 spin_unlock_irqrestore(&p->pi_lock, flags);
6286 goto recheck; 6286 goto recheck;
6287 } 6287 }
6288 update_rq_clock(rq); 6288 update_rq_clock(rq);
6289 on_rq = p->se.on_rq; 6289 on_rq = p->se.on_rq;
6290 running = task_current(rq, p); 6290 running = task_current(rq, p);
6291 if (on_rq) 6291 if (on_rq)
6292 deactivate_task(rq, p, 0); 6292 deactivate_task(rq, p, 0);
6293 if (running) 6293 if (running)
6294 p->sched_class->put_prev_task(rq, p); 6294 p->sched_class->put_prev_task(rq, p);
6295 6295
6296 p->sched_reset_on_fork = reset_on_fork; 6296 p->sched_reset_on_fork = reset_on_fork;
6297 6297
6298 oldprio = p->prio; 6298 oldprio = p->prio;
6299 __setscheduler(rq, p, policy, param->sched_priority); 6299 __setscheduler(rq, p, policy, param->sched_priority);
6300 6300
6301 if (running) 6301 if (running)
6302 p->sched_class->set_curr_task(rq); 6302 p->sched_class->set_curr_task(rq);
6303 if (on_rq) { 6303 if (on_rq) {
6304 activate_task(rq, p, 0); 6304 activate_task(rq, p, 0);
6305 6305
6306 check_class_changed(rq, p, prev_class, oldprio, running); 6306 check_class_changed(rq, p, prev_class, oldprio, running);
6307 } 6307 }
6308 __task_rq_unlock(rq); 6308 __task_rq_unlock(rq);
6309 spin_unlock_irqrestore(&p->pi_lock, flags); 6309 spin_unlock_irqrestore(&p->pi_lock, flags);
6310 6310
6311 rt_mutex_adjust_pi(p); 6311 rt_mutex_adjust_pi(p);
6312 6312
6313 return 0; 6313 return 0;
6314 } 6314 }
6315 6315
6316 /** 6316 /**
6317 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 6317 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
6318 * @p: the task in question. 6318 * @p: the task in question.
6319 * @policy: new policy. 6319 * @policy: new policy.
6320 * @param: structure containing the new RT priority. 6320 * @param: structure containing the new RT priority.
6321 * 6321 *
6322 * NOTE that the task may be already dead. 6322 * NOTE that the task may be already dead.
6323 */ 6323 */
6324 int sched_setscheduler(struct task_struct *p, int policy, 6324 int sched_setscheduler(struct task_struct *p, int policy,
6325 struct sched_param *param) 6325 struct sched_param *param)
6326 { 6326 {
6327 return __sched_setscheduler(p, policy, param, true); 6327 return __sched_setscheduler(p, policy, param, true);
6328 } 6328 }
6329 EXPORT_SYMBOL_GPL(sched_setscheduler); 6329 EXPORT_SYMBOL_GPL(sched_setscheduler);
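A sketch of the usual in-kernel caller (illustrative; the helper name and priority choice are examples, not something this commit adds):

/* Illustrative only; not part of this commit. */
static int example_make_fifo(struct task_struct *tsk)
{
        struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };

        /*
         * Permission checks apply here; contexts without CAP_SYS_NICE
         * use sched_setscheduler_nocheck() below instead.
         */
        return sched_setscheduler(tsk, SCHED_FIFO, &param);
}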
6330 6330
6331 /** 6331 /**
6332 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 6332 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
6333 * @p: the task in question. 6333 * @p: the task in question.
6334 * @policy: new policy. 6334 * @policy: new policy.
6335 * @param: structure containing the new RT priority. 6335 * @param: structure containing the new RT priority.
6336 * 6336 *
6337 * Just like sched_setscheduler, only don't bother checking if the 6337 * Just like sched_setscheduler, only don't bother checking if the
6338 * current context has permission. For example, this is needed in 6338 * current context has permission. For example, this is needed in
6339 * stop_machine(): we create temporary high priority worker threads, 6339 * stop_machine(): we create temporary high priority worker threads,
6340 * but our caller might not have that capability. 6340 * but our caller might not have that capability.
6341 */ 6341 */
6342 int sched_setscheduler_nocheck(struct task_struct *p, int policy, 6342 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
6343 struct sched_param *param) 6343 struct sched_param *param)
6344 { 6344 {
6345 return __sched_setscheduler(p, policy, param, false); 6345 return __sched_setscheduler(p, policy, param, false);
6346 } 6346 }
6347 6347
6348 static int 6348 static int
6349 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 6349 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
6350 { 6350 {
6351 struct sched_param lparam; 6351 struct sched_param lparam;
6352 struct task_struct *p; 6352 struct task_struct *p;
6353 int retval; 6353 int retval;
6354 6354
6355 if (!param || pid < 0) 6355 if (!param || pid < 0)
6356 return -EINVAL; 6356 return -EINVAL;
6357 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 6357 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
6358 return -EFAULT; 6358 return -EFAULT;
6359 6359
6360 rcu_read_lock(); 6360 rcu_read_lock();
6361 retval = -ESRCH; 6361 retval = -ESRCH;
6362 p = find_process_by_pid(pid); 6362 p = find_process_by_pid(pid);
6363 if (p != NULL) 6363 if (p != NULL)
6364 retval = sched_setscheduler(p, policy, &lparam); 6364 retval = sched_setscheduler(p, policy, &lparam);
6365 rcu_read_unlock(); 6365 rcu_read_unlock();
6366 6366
6367 return retval; 6367 return retval;
6368 } 6368 }
6369 6369
6370 /** 6370 /**
6371 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 6371 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
6372 * @pid: the pid in question. 6372 * @pid: the pid in question.
6373 * @policy: new policy. 6373 * @policy: new policy.
6374 * @param: structure containing the new RT priority. 6374 * @param: structure containing the new RT priority.
6375 */ 6375 */
6376 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, 6376 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
6377 struct sched_param __user *, param) 6377 struct sched_param __user *, param)
6378 { 6378 {
6379 /* negative values for policy are not valid */ 6379 /* negative values for policy are not valid */
6380 if (policy < 0) 6380 if (policy < 0)
6381 return -EINVAL; 6381 return -EINVAL;
6382 6382
6383 return do_sched_setscheduler(pid, policy, param); 6383 return do_sched_setscheduler(pid, policy, param);
6384 } 6384 }
6385 6385
6386 /** 6386 /**
6387 * sys_sched_setparam - set/change the RT priority of a thread 6387 * sys_sched_setparam - set/change the RT priority of a thread
6388 * @pid: the pid in question. 6388 * @pid: the pid in question.
6389 * @param: structure containing the new RT priority. 6389 * @param: structure containing the new RT priority.
6390 */ 6390 */
6391 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) 6391 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
6392 { 6392 {
6393 return do_sched_setscheduler(pid, -1, param); 6393 return do_sched_setscheduler(pid, -1, param);
6394 } 6394 }
6395 6395
6396 /** 6396 /**
6397 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 6397 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
6398 * @pid: the pid in question. 6398 * @pid: the pid in question.
6399 */ 6399 */
6400 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) 6400 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6401 { 6401 {
6402 struct task_struct *p; 6402 struct task_struct *p;
6403 int retval; 6403 int retval;
6404 6404
6405 if (pid < 0) 6405 if (pid < 0)
6406 return -EINVAL; 6406 return -EINVAL;
6407 6407
6408 retval = -ESRCH; 6408 retval = -ESRCH;
6409 read_lock(&tasklist_lock); 6409 read_lock(&tasklist_lock);
6410 p = find_process_by_pid(pid); 6410 p = find_process_by_pid(pid);
6411 if (p) { 6411 if (p) {
6412 retval = security_task_getscheduler(p); 6412 retval = security_task_getscheduler(p);
6413 if (!retval) 6413 if (!retval)
6414 retval = p->policy 6414 retval = p->policy
6415 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 6415 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6416 } 6416 }
6417 read_unlock(&tasklist_lock); 6417 read_unlock(&tasklist_lock);
6418 return retval; 6418 return retval;
6419 } 6419 }
6420 6420
6421 /** 6421 /**
6422 * sys_sched_getparam - get the RT priority of a thread 6422 * sys_sched_getparam - get the RT priority of a thread
6423 * @pid: the pid in question. 6423 * @pid: the pid in question.
6424 * @param: structure containing the RT priority. 6424 * @param: structure containing the RT priority.
6425 */ 6425 */
6426 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) 6426 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6427 { 6427 {
6428 struct sched_param lp; 6428 struct sched_param lp;
6429 struct task_struct *p; 6429 struct task_struct *p;
6430 int retval; 6430 int retval;
6431 6431
6432 if (!param || pid < 0) 6432 if (!param || pid < 0)
6433 return -EINVAL; 6433 return -EINVAL;
6434 6434
6435 read_lock(&tasklist_lock); 6435 read_lock(&tasklist_lock);
6436 p = find_process_by_pid(pid); 6436 p = find_process_by_pid(pid);
6437 retval = -ESRCH; 6437 retval = -ESRCH;
6438 if (!p) 6438 if (!p)
6439 goto out_unlock; 6439 goto out_unlock;
6440 6440
6441 retval = security_task_getscheduler(p); 6441 retval = security_task_getscheduler(p);
6442 if (retval) 6442 if (retval)
6443 goto out_unlock; 6443 goto out_unlock;
6444 6444
6445 lp.sched_priority = p->rt_priority; 6445 lp.sched_priority = p->rt_priority;
6446 read_unlock(&tasklist_lock); 6446 read_unlock(&tasklist_lock);
6447 6447
6448 /* 6448 /*
6449 * This one might sleep, we cannot do it with a spinlock held ... 6449 * This one might sleep, we cannot do it with a spinlock held ...
6450 */ 6450 */
6451 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 6451 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
6452 6452
6453 return retval; 6453 return retval;
6454 6454
6455 out_unlock: 6455 out_unlock:
6456 read_unlock(&tasklist_lock); 6456 read_unlock(&tasklist_lock);
6457 return retval; 6457 return retval;
6458 } 6458 }
6459 6459
6460 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) 6460 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
6461 { 6461 {
6462 cpumask_var_t cpus_allowed, new_mask; 6462 cpumask_var_t cpus_allowed, new_mask;
6463 struct task_struct *p; 6463 struct task_struct *p;
6464 int retval; 6464 int retval;
6465 6465
6466 get_online_cpus(); 6466 get_online_cpus();
6467 read_lock(&tasklist_lock); 6467 read_lock(&tasklist_lock);
6468 6468
6469 p = find_process_by_pid(pid); 6469 p = find_process_by_pid(pid);
6470 if (!p) { 6470 if (!p) {
6471 read_unlock(&tasklist_lock); 6471 read_unlock(&tasklist_lock);
6472 put_online_cpus(); 6472 put_online_cpus();
6473 return -ESRCH; 6473 return -ESRCH;
6474 } 6474 }
6475 6475
6476 /* 6476 /*
6477 * It is not safe to call set_cpus_allowed with the 6477 * It is not safe to call set_cpus_allowed with the
6478 * tasklist_lock held. We will bump the task_struct's 6478 * tasklist_lock held. We will bump the task_struct's
6479 * usage count and then drop tasklist_lock. 6479 * usage count and then drop tasklist_lock.
6480 */ 6480 */
6481 get_task_struct(p); 6481 get_task_struct(p);
6482 read_unlock(&tasklist_lock); 6482 read_unlock(&tasklist_lock);
6483 6483
6484 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 6484 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
6485 retval = -ENOMEM; 6485 retval = -ENOMEM;
6486 goto out_put_task; 6486 goto out_put_task;
6487 } 6487 }
6488 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { 6488 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
6489 retval = -ENOMEM; 6489 retval = -ENOMEM;
6490 goto out_free_cpus_allowed; 6490 goto out_free_cpus_allowed;
6491 } 6491 }
6492 retval = -EPERM; 6492 retval = -EPERM;
6493 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 6493 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
6494 goto out_unlock; 6494 goto out_unlock;
6495 6495
6496 retval = security_task_setscheduler(p, 0, NULL); 6496 retval = security_task_setscheduler(p, 0, NULL);
6497 if (retval) 6497 if (retval)
6498 goto out_unlock; 6498 goto out_unlock;
6499 6499
6500 cpuset_cpus_allowed(p, cpus_allowed); 6500 cpuset_cpus_allowed(p, cpus_allowed);
6501 cpumask_and(new_mask, in_mask, cpus_allowed); 6501 cpumask_and(new_mask, in_mask, cpus_allowed);
6502 again: 6502 again:
6503 retval = set_cpus_allowed_ptr(p, new_mask); 6503 retval = set_cpus_allowed_ptr(p, new_mask);
6504 6504
6505 if (!retval) { 6505 if (!retval) {
6506 cpuset_cpus_allowed(p, cpus_allowed); 6506 cpuset_cpus_allowed(p, cpus_allowed);
6507 if (!cpumask_subset(new_mask, cpus_allowed)) { 6507 if (!cpumask_subset(new_mask, cpus_allowed)) {
6508 /* 6508 /*
6509 * We must have raced with a concurrent cpuset 6509 * We must have raced with a concurrent cpuset
6510 * update. Just reset the cpus_allowed to the 6510 * update. Just reset the cpus_allowed to the
6511 * cpuset's cpus_allowed 6511 * cpuset's cpus_allowed
6512 */ 6512 */
6513 cpumask_copy(new_mask, cpus_allowed); 6513 cpumask_copy(new_mask, cpus_allowed);
6514 goto again; 6514 goto again;
6515 } 6515 }
6516 } 6516 }
6517 out_unlock: 6517 out_unlock:
6518 free_cpumask_var(new_mask); 6518 free_cpumask_var(new_mask);
6519 out_free_cpus_allowed: 6519 out_free_cpus_allowed:
6520 free_cpumask_var(cpus_allowed); 6520 free_cpumask_var(cpus_allowed);
6521 out_put_task: 6521 out_put_task:
6522 put_task_struct(p); 6522 put_task_struct(p);
6523 put_online_cpus(); 6523 put_online_cpus();
6524 return retval; 6524 return retval;
6525 } 6525 }
6526 6526
6527 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 6527 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
6528 struct cpumask *new_mask) 6528 struct cpumask *new_mask)
6529 { 6529 {
6530 if (len < cpumask_size()) 6530 if (len < cpumask_size())
6531 cpumask_clear(new_mask); 6531 cpumask_clear(new_mask);
6532 else if (len > cpumask_size()) 6532 else if (len > cpumask_size())
6533 len = cpumask_size(); 6533 len = cpumask_size();
6534 6534
6535 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 6535 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
6536 } 6536 }
6537 6537
6538 /** 6538 /**
6539 * sys_sched_setaffinity - set the cpu affinity of a process 6539 * sys_sched_setaffinity - set the cpu affinity of a process
6540 * @pid: pid of the process 6540 * @pid: pid of the process
6541 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 6541 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
6542 * @user_mask_ptr: user-space pointer to the new cpu mask 6542 * @user_mask_ptr: user-space pointer to the new cpu mask
6543 */ 6543 */
6544 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, 6544 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6545 unsigned long __user *, user_mask_ptr) 6545 unsigned long __user *, user_mask_ptr)
6546 { 6546 {
6547 cpumask_var_t new_mask; 6547 cpumask_var_t new_mask;
6548 int retval; 6548 int retval;
6549 6549
6550 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) 6550 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
6551 return -ENOMEM; 6551 return -ENOMEM;
6552 6552
6553 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); 6553 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
6554 if (retval == 0) 6554 if (retval == 0)
6555 retval = sched_setaffinity(pid, new_mask); 6555 retval = sched_setaffinity(pid, new_mask);
6556 free_cpumask_var(new_mask); 6556 free_cpumask_var(new_mask);
6557 return retval; 6557 return retval;
6558 } 6558 }
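To make the @len semantics concrete, a user-space sketch of the counterpart call (illustrative glibc wrapper usage, clearly not kernel code):

/* Illustrative user-space example; not part of this commit. */
#define _GNU_SOURCE
#include <sched.h>

int example_pin_to_cpu0(void)
{
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(0, &set);
        /* len is the bitmask size in bytes; pid 0 means the calling thread */
        return sched_setaffinity(0, sizeof(set), &set);
}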
6559 6559
6560 long sched_getaffinity(pid_t pid, struct cpumask *mask) 6560 long sched_getaffinity(pid_t pid, struct cpumask *mask)
6561 { 6561 {
6562 struct task_struct *p; 6562 struct task_struct *p;
6563 int retval; 6563 int retval;
6564 6564
6565 get_online_cpus(); 6565 get_online_cpus();
6566 read_lock(&tasklist_lock); 6566 read_lock(&tasklist_lock);
6567 6567
6568 retval = -ESRCH; 6568 retval = -ESRCH;
6569 p = find_process_by_pid(pid); 6569 p = find_process_by_pid(pid);
6570 if (!p) 6570 if (!p)
6571 goto out_unlock; 6571 goto out_unlock;
6572 6572
6573 retval = security_task_getscheduler(p); 6573 retval = security_task_getscheduler(p);
6574 if (retval) 6574 if (retval)
6575 goto out_unlock; 6575 goto out_unlock;
6576 6576
6577 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 6577 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
6578 6578
6579 out_unlock: 6579 out_unlock:
6580 read_unlock(&tasklist_lock); 6580 read_unlock(&tasklist_lock);
6581 put_online_cpus(); 6581 put_online_cpus();
6582 6582
6583 return retval; 6583 return retval;
6584 } 6584 }
6585 6585
6586 /** 6586 /**
6587 * sys_sched_getaffinity - get the cpu affinity of a process 6587 * sys_sched_getaffinity - get the cpu affinity of a process
6588 * @pid: pid of the process 6588 * @pid: pid of the process
6589 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 6589 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
6590 * @user_mask_ptr: user-space pointer to hold the current cpu mask 6590 * @user_mask_ptr: user-space pointer to hold the current cpu mask
6591 */ 6591 */
6592 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, 6592 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
6593 unsigned long __user *, user_mask_ptr) 6593 unsigned long __user *, user_mask_ptr)
6594 { 6594 {
6595 int ret; 6595 int ret;
6596 cpumask_var_t mask; 6596 cpumask_var_t mask;
6597 6597
6598 if (len < cpumask_size()) 6598 if (len < cpumask_size())
6599 return -EINVAL; 6599 return -EINVAL;
6600 6600
6601 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 6601 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
6602 return -ENOMEM; 6602 return -ENOMEM;
6603 6603
6604 ret = sched_getaffinity(pid, mask); 6604 ret = sched_getaffinity(pid, mask);
6605 if (ret == 0) { 6605 if (ret == 0) {
6606 if (copy_to_user(user_mask_ptr, mask, cpumask_size())) 6606 if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
6607 ret = -EFAULT; 6607 ret = -EFAULT;
6608 else 6608 else
6609 ret = cpumask_size(); 6609 ret = cpumask_size();
6610 } 6610 }
6611 free_cpumask_var(mask); 6611 free_cpumask_var(mask);
6612 6612
6613 return ret; 6613 return ret;
6614 } 6614 }
6615 6615
6616 /** 6616 /**
6617 * sys_sched_yield - yield the current processor to other threads. 6617 * sys_sched_yield - yield the current processor to other threads.
6618 * 6618 *
6619 * This function yields the current CPU to other tasks. If there are no 6619 * This function yields the current CPU to other tasks. If there are no
6620 * other threads running on this CPU then this function will return. 6620 * other threads running on this CPU then this function will return.
6621 */ 6621 */
6622 SYSCALL_DEFINE0(sched_yield) 6622 SYSCALL_DEFINE0(sched_yield)
6623 { 6623 {
6624 struct rq *rq = this_rq_lock(); 6624 struct rq *rq = this_rq_lock();
6625 6625
6626 schedstat_inc(rq, yld_count); 6626 schedstat_inc(rq, yld_count);
6627 current->sched_class->yield_task(rq); 6627 current->sched_class->yield_task(rq);
6628 6628
6629 /* 6629 /*
6630 * Since we are going to call schedule() anyway, there's 6630 * Since we are going to call schedule() anyway, there's
6631 * no need to preempt or enable interrupts: 6631 * no need to preempt or enable interrupts:
6632 */ 6632 */
6633 __release(rq->lock); 6633 __release(rq->lock);
6634 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 6634 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
6635 _raw_spin_unlock(&rq->lock); 6635 _raw_spin_unlock(&rq->lock);
6636 preempt_enable_no_resched(); 6636 preempt_enable_no_resched();
6637 6637
6638 schedule(); 6638 schedule();
6639 6639
6640 return 0; 6640 return 0;
6641 } 6641 }
6642 6642
6643 static inline int should_resched(void) 6643 static inline int should_resched(void)
6644 { 6644 {
6645 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); 6645 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
6646 } 6646 }
6647 6647
6648 static void __cond_resched(void) 6648 static void __cond_resched(void)
6649 { 6649 {
6650 add_preempt_count(PREEMPT_ACTIVE); 6650 add_preempt_count(PREEMPT_ACTIVE);
6651 schedule(); 6651 schedule();
6652 sub_preempt_count(PREEMPT_ACTIVE); 6652 sub_preempt_count(PREEMPT_ACTIVE);
6653 } 6653 }
6654 6654
6655 int __sched _cond_resched(void) 6655 int __sched _cond_resched(void)
6656 { 6656 {
6657 if (should_resched()) { 6657 if (should_resched()) {
6658 __cond_resched(); 6658 __cond_resched();
6659 return 1; 6659 return 1;
6660 } 6660 }
6661 return 0; 6661 return 0;
6662 } 6662 }
6663 EXPORT_SYMBOL(_cond_resched); 6663 EXPORT_SYMBOL(_cond_resched);
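For context, a typical call site uses the cond_resched() wrapper around _cond_resched(); a minimal sketch (function and parameter names are hypothetical):

/* Illustrative only; not part of this commit. */
static void example_long_loop(unsigned long nr)
{
        unsigned long i;

        for (i = 0; i < nr; i++) {
                /* ... some non-atomic per-item work ... */
                cond_resched();         /* reschedules only if need_resched() */
        }
}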
6664 6664
6665 /* 6665 /*
6666 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 6666 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
6667 * call schedule, and on return reacquire the lock. 6667 * call schedule, and on return reacquire the lock.
6668 * 6668 *
6669 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 6669 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
6670 * operations here to prevent schedule() from being called twice (once via 6670 * operations here to prevent schedule() from being called twice (once via
6671 * spin_unlock(), once by hand). 6671 * spin_unlock(), once by hand).
6672 */ 6672 */
6673 int __cond_resched_lock(spinlock_t *lock) 6673 int __cond_resched_lock(spinlock_t *lock)
6674 { 6674 {
6675 int resched = should_resched(); 6675 int resched = should_resched();
6676 int ret = 0; 6676 int ret = 0;
6677 6677
6678 lockdep_assert_held(lock); 6678 lockdep_assert_held(lock);
6679 6679
6680 if (spin_needbreak(lock) || resched) { 6680 if (spin_needbreak(lock) || resched) {
6681 spin_unlock(lock); 6681 spin_unlock(lock);
6682 if (resched) 6682 if (resched)
6683 __cond_resched(); 6683 __cond_resched();
6684 else 6684 else
6685 cpu_relax(); 6685 cpu_relax();
6686 ret = 1; 6686 ret = 1;
6687 spin_lock(lock); 6687 spin_lock(lock);
6688 } 6688 }
6689 return ret; 6689 return ret;
6690 } 6690 }
6691 EXPORT_SYMBOL(__cond_resched_lock); 6691 EXPORT_SYMBOL(__cond_resched_lock);
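Similarly, a hedged sketch of a caller of the cond_resched_lock() wrapper (illustrative; names are hypothetical), showing that the lock may be dropped and retaken inside the loop:

/* Illustrative only; not part of this commit. */
static void example_scan_under_lock(spinlock_t *lock, unsigned long nr)
{
        unsigned long i;

        spin_lock(lock);
        for (i = 0; i < nr; i++) {
                /* ... work that needs 'lock' ... */
                if (cond_resched_lock(lock)) {
                        /* lock was released and re-acquired; revalidate state */
                }
        }
        spin_unlock(lock);
}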
6692 6692
6693 int __sched __cond_resched_softirq(void) 6693 int __sched __cond_resched_softirq(void)
6694 { 6694 {
6695 BUG_ON(!in_softirq()); 6695 BUG_ON(!in_softirq());
6696 6696
6697 if (should_resched()) { 6697 if (should_resched()) {
6698 local_bh_enable(); 6698 local_bh_enable();
6699 __cond_resched(); 6699 __cond_resched();
6700 local_bh_disable(); 6700 local_bh_disable();
6701 return 1; 6701 return 1;
6702 } 6702 }
6703 return 0; 6703 return 0;
6704 } 6704 }
6705 EXPORT_SYMBOL(__cond_resched_softirq); 6705 EXPORT_SYMBOL(__cond_resched_softirq);
6706 6706
6707 /** 6707 /**
6708 * yield - yield the current processor to other threads. 6708 * yield - yield the current processor to other threads.
6709 * 6709 *
6710 * This is a shortcut for kernel-space yielding - it marks the 6710 * This is a shortcut for kernel-space yielding - it marks the
6711 * thread runnable and calls sys_sched_yield(). 6711 * thread runnable and calls sys_sched_yield().
6712 */ 6712 */
6713 void __sched yield(void) 6713 void __sched yield(void)
6714 { 6714 {
6715 set_current_state(TASK_RUNNING); 6715 set_current_state(TASK_RUNNING);
6716 sys_sched_yield(); 6716 sys_sched_yield();
6717 } 6717 }
6718 EXPORT_SYMBOL(yield); 6718 EXPORT_SYMBOL(yield);
6719 6719
6720 /* 6720 /*
6721 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 6721 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
6722 * that process accounting knows that this is a task in IO wait state. 6722 * that process accounting knows that this is a task in IO wait state.
6723 */ 6723 */
6724 void __sched io_schedule(void) 6724 void __sched io_schedule(void)
6725 { 6725 {
6726 struct rq *rq = raw_rq(); 6726 struct rq *rq = raw_rq();
6727 6727
6728 delayacct_blkio_start(); 6728 delayacct_blkio_start();
6729 atomic_inc(&rq->nr_iowait); 6729 atomic_inc(&rq->nr_iowait);
6730 current->in_iowait = 1; 6730 current->in_iowait = 1;
6731 schedule(); 6731 schedule();
6732 current->in_iowait = 0; 6732 current->in_iowait = 0;
6733 atomic_dec(&rq->nr_iowait); 6733 atomic_dec(&rq->nr_iowait);
6734 delayacct_blkio_end(); 6734 delayacct_blkio_end();
6735 } 6735 }
6736 EXPORT_SYMBOL(io_schedule); 6736 EXPORT_SYMBOL(io_schedule);
6737 6737
6738 long __sched io_schedule_timeout(long timeout) 6738 long __sched io_schedule_timeout(long timeout)
6739 { 6739 {
6740 struct rq *rq = raw_rq(); 6740 struct rq *rq = raw_rq();
6741 long ret; 6741 long ret;
6742 6742
6743 delayacct_blkio_start(); 6743 delayacct_blkio_start();
6744 atomic_inc(&rq->nr_iowait); 6744 atomic_inc(&rq->nr_iowait);
6745 current->in_iowait = 1; 6745 current->in_iowait = 1;
6746 ret = schedule_timeout(timeout); 6746 ret = schedule_timeout(timeout);
6747 current->in_iowait = 0; 6747 current->in_iowait = 0;
6748 atomic_dec(&rq->nr_iowait); 6748 atomic_dec(&rq->nr_iowait);
6749 delayacct_blkio_end(); 6749 delayacct_blkio_end();
6750 return ret; 6750 return ret;
6751 } 6751 }
6752 6752
6753 /** 6753 /**
6754 * sys_sched_get_priority_max - return maximum RT priority. 6754 * sys_sched_get_priority_max - return maximum RT priority.
6755 * @policy: scheduling class. 6755 * @policy: scheduling class.
6756 * 6756 *
6757 * this syscall returns the maximum rt_priority that can be used 6757 * this syscall returns the maximum rt_priority that can be used
6758 * by a given scheduling class. 6758 * by a given scheduling class.
6759 */ 6759 */
6760 SYSCALL_DEFINE1(sched_get_priority_max, int, policy) 6760 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
6761 { 6761 {
6762 int ret = -EINVAL; 6762 int ret = -EINVAL;
6763 6763
6764 switch (policy) { 6764 switch (policy) {
6765 case SCHED_FIFO: 6765 case SCHED_FIFO:
6766 case SCHED_RR: 6766 case SCHED_RR:
6767 ret = MAX_USER_RT_PRIO-1; 6767 ret = MAX_USER_RT_PRIO-1;
6768 break; 6768 break;
6769 case SCHED_NORMAL: 6769 case SCHED_NORMAL:
6770 case SCHED_BATCH: 6770 case SCHED_BATCH:
6771 case SCHED_IDLE: 6771 case SCHED_IDLE:
6772 ret = 0; 6772 ret = 0;
6773 break; 6773 break;
6774 } 6774 }
6775 return ret; 6775 return ret;
6776 } 6776 }
6777 6777
6778 /** 6778 /**
6779 * sys_sched_get_priority_min - return minimum RT priority. 6779 * sys_sched_get_priority_min - return minimum RT priority.
6780 * @policy: scheduling class. 6780 * @policy: scheduling class.
6781 * 6781 *
6782 * this syscall returns the minimum rt_priority that can be used 6782 * this syscall returns the minimum rt_priority that can be used
6783 * by a given scheduling class. 6783 * by a given scheduling class.
6784 */ 6784 */
6785 SYSCALL_DEFINE1(sched_get_priority_min, int, policy) 6785 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
6786 { 6786 {
6787 int ret = -EINVAL; 6787 int ret = -EINVAL;
6788 6788
6789 switch (policy) { 6789 switch (policy) {
6790 case SCHED_FIFO: 6790 case SCHED_FIFO:
6791 case SCHED_RR: 6791 case SCHED_RR:
6792 ret = 1; 6792 ret = 1;
6793 break; 6793 break;
6794 case SCHED_NORMAL: 6794 case SCHED_NORMAL:
6795 case SCHED_BATCH: 6795 case SCHED_BATCH:
6796 case SCHED_IDLE: 6796 case SCHED_IDLE:
6797 ret = 0; 6797 ret = 0;
6798 } 6798 }
6799 return ret; 6799 return ret;
6800 } 6800 }
6801 6801
6802 /** 6802 /**
6803 * sys_sched_rr_get_interval - return the default timeslice of a process. 6803 * sys_sched_rr_get_interval - return the default timeslice of a process.
6804 * @pid: pid of the process. 6804 * @pid: pid of the process.
6805 * @interval: userspace pointer to the timeslice value. 6805 * @interval: userspace pointer to the timeslice value.
6806 * 6806 *
6807 * this syscall writes the default timeslice value of a given process 6807 * this syscall writes the default timeslice value of a given process
6808 * into the user-space timespec buffer. A value of '0' means infinity. 6808 * into the user-space timespec buffer. A value of '0' means infinity.
6809 */ 6809 */
6810 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, 6810 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6811 struct timespec __user *, interval) 6811 struct timespec __user *, interval)
6812 { 6812 {
6813 struct task_struct *p; 6813 struct task_struct *p;
6814 unsigned int time_slice; 6814 unsigned int time_slice;
6815 int retval; 6815 int retval;
6816 struct timespec t; 6816 struct timespec t;
6817 6817
6818 if (pid < 0) 6818 if (pid < 0)
6819 return -EINVAL; 6819 return -EINVAL;
6820 6820
6821 retval = -ESRCH; 6821 retval = -ESRCH;
6822 read_lock(&tasklist_lock); 6822 read_lock(&tasklist_lock);
6823 p = find_process_by_pid(pid); 6823 p = find_process_by_pid(pid);
6824 if (!p) 6824 if (!p)
6825 goto out_unlock; 6825 goto out_unlock;
6826 6826
6827 retval = security_task_getscheduler(p); 6827 retval = security_task_getscheduler(p);
6828 if (retval) 6828 if (retval)
6829 goto out_unlock; 6829 goto out_unlock;
6830 6830
6831 time_slice = p->sched_class->get_rr_interval(p); 6831 time_slice = p->sched_class->get_rr_interval(p);
6832 6832
6833 read_unlock(&tasklist_lock); 6833 read_unlock(&tasklist_lock);
6834 jiffies_to_timespec(time_slice, &t); 6834 jiffies_to_timespec(time_slice, &t);
6835 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 6835 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
6836 return retval; 6836 return retval;
6837 6837
6838 out_unlock: 6838 out_unlock:
6839 read_unlock(&tasklist_lock); 6839 read_unlock(&tasklist_lock);
6840 return retval; 6840 return retval;
6841 } 6841 }
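A user-space sketch of the counterpart call (illustrative, not kernel code) showing how the timespec is consumed; a 0/0 result means "infinite" per the kernel-doc above:

/* Illustrative user-space example; not part of this commit. */
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>
#include <time.h>

int example_print_timeslice(pid_t pid)
{
        struct timespec ts;

        if (sched_rr_get_interval(pid, &ts) != 0)
                return -1;
        printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
        return 0;
}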
6842 6842
6843 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; 6843 static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
6844 6844
6845 void sched_show_task(struct task_struct *p) 6845 void sched_show_task(struct task_struct *p)
6846 { 6846 {
6847 unsigned long free = 0; 6847 unsigned long free = 0;
6848 unsigned state; 6848 unsigned state;
6849 6849
6850 state = p->state ? __ffs(p->state) + 1 : 0; 6850 state = p->state ? __ffs(p->state) + 1 : 0;
6851 printk(KERN_INFO "%-13.13s %c", p->comm, 6851 printk(KERN_INFO "%-13.13s %c", p->comm,
6852 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 6852 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
6853 #if BITS_PER_LONG == 32 6853 #if BITS_PER_LONG == 32
6854 if (state == TASK_RUNNING) 6854 if (state == TASK_RUNNING)
6855 printk(KERN_CONT " running "); 6855 printk(KERN_CONT " running ");
6856 else 6856 else
6857 printk(KERN_CONT " %08lx ", thread_saved_pc(p)); 6857 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
6858 #else 6858 #else
6859 if (state == TASK_RUNNING) 6859 if (state == TASK_RUNNING)
6860 printk(KERN_CONT " running task "); 6860 printk(KERN_CONT " running task ");
6861 else 6861 else
6862 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 6862 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
6863 #endif 6863 #endif
6864 #ifdef CONFIG_DEBUG_STACK_USAGE 6864 #ifdef CONFIG_DEBUG_STACK_USAGE
6865 free = stack_not_used(p); 6865 free = stack_not_used(p);
6866 #endif 6866 #endif
6867 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 6867 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
6868 task_pid_nr(p), task_pid_nr(p->real_parent), 6868 task_pid_nr(p), task_pid_nr(p->real_parent),
6869 (unsigned long)task_thread_info(p)->flags); 6869 (unsigned long)task_thread_info(p)->flags);
6870 6870
6871 show_stack(p, NULL); 6871 show_stack(p, NULL);
6872 } 6872 }
6873 6873
6874 void show_state_filter(unsigned long state_filter) 6874 void show_state_filter(unsigned long state_filter)
6875 { 6875 {
6876 struct task_struct *g, *p; 6876 struct task_struct *g, *p;
6877 6877
6878 #if BITS_PER_LONG == 32 6878 #if BITS_PER_LONG == 32
6879 printk(KERN_INFO 6879 printk(KERN_INFO
6880 " task PC stack pid father\n"); 6880 " task PC stack pid father\n");
6881 #else 6881 #else
6882 printk(KERN_INFO 6882 printk(KERN_INFO
6883 " task PC stack pid father\n"); 6883 " task PC stack pid father\n");
6884 #endif 6884 #endif
6885 read_lock(&tasklist_lock); 6885 read_lock(&tasklist_lock);
6886 do_each_thread(g, p) { 6886 do_each_thread(g, p) {
6887 /* 6887 /*
6888 * reset the NMI-timeout, listing all files on a slow 6888 * reset the NMI-timeout, listing all files on a slow
6889 * console might take a lot of time: 6889 * console might take a lot of time:
6890 */ 6890 */
6891 touch_nmi_watchdog(); 6891 touch_nmi_watchdog();
6892 if (!state_filter || (p->state & state_filter)) 6892 if (!state_filter || (p->state & state_filter))
6893 sched_show_task(p); 6893 sched_show_task(p);
6894 } while_each_thread(g, p); 6894 } while_each_thread(g, p);
6895 6895
6896 touch_all_softlockup_watchdogs(); 6896 touch_all_softlockup_watchdogs();
6897 6897
6898 #ifdef CONFIG_SCHED_DEBUG 6898 #ifdef CONFIG_SCHED_DEBUG
6899 sysrq_sched_debug_show(); 6899 sysrq_sched_debug_show();
6900 #endif 6900 #endif
6901 read_unlock(&tasklist_lock); 6901 read_unlock(&tasklist_lock);
6902 /* 6902 /*
6903 * Only show locks if all tasks are dumped: 6903 * Only show locks if all tasks are dumped:
6904 */ 6904 */
6905 if (state_filter == -1) 6905 if (state_filter == -1)
6906 debug_show_all_locks(); 6906 debug_show_all_locks();
6907 } 6907 }
6908 6908
6909 void __cpuinit init_idle_bootup_task(struct task_struct *idle) 6909 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
6910 { 6910 {
6911 idle->sched_class = &idle_sched_class; 6911 idle->sched_class = &idle_sched_class;
6912 } 6912 }
6913 6913
6914 /** 6914 /**
6915 * init_idle - set up an idle thread for a given CPU 6915 * init_idle - set up an idle thread for a given CPU
6916 * @idle: task in question 6916 * @idle: task in question
6917 * @cpu: cpu the idle task belongs to 6917 * @cpu: cpu the idle task belongs to
6918 * 6918 *
6919 * NOTE: this function does not set the idle thread's NEED_RESCHED 6919 * NOTE: this function does not set the idle thread's NEED_RESCHED
6920 * flag, to make booting more robust. 6920 * flag, to make booting more robust.
6921 */ 6921 */
6922 void __cpuinit init_idle(struct task_struct *idle, int cpu) 6922 void __cpuinit init_idle(struct task_struct *idle, int cpu)
6923 { 6923 {
6924 struct rq *rq = cpu_rq(cpu); 6924 struct rq *rq = cpu_rq(cpu);
6925 unsigned long flags; 6925 unsigned long flags;
6926 6926
6927 spin_lock_irqsave(&rq->lock, flags); 6927 spin_lock_irqsave(&rq->lock, flags);
6928 6928
6929 __sched_fork(idle); 6929 __sched_fork(idle);
6930 idle->se.exec_start = sched_clock(); 6930 idle->se.exec_start = sched_clock();
6931 6931
6932 idle->prio = idle->normal_prio = MAX_PRIO; 6932 idle->prio = idle->normal_prio = MAX_PRIO;
6933 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 6933 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
6934 __set_task_cpu(idle, cpu); 6934 __set_task_cpu(idle, cpu);
6935 6935
6936 rq->curr = rq->idle = idle; 6936 rq->curr = rq->idle = idle;
6937 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 6937 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
6938 idle->oncpu = 1; 6938 idle->oncpu = 1;
6939 #endif 6939 #endif
6940 spin_unlock_irqrestore(&rq->lock, flags); 6940 spin_unlock_irqrestore(&rq->lock, flags);
6941 6941
6942 /* Set the preempt count _outside_ the spinlocks! */ 6942 /* Set the preempt count _outside_ the spinlocks! */
6943 #if defined(CONFIG_PREEMPT) 6943 #if defined(CONFIG_PREEMPT)
6944 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); 6944 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
6945 #else 6945 #else
6946 task_thread_info(idle)->preempt_count = 0; 6946 task_thread_info(idle)->preempt_count = 0;
6947 #endif 6947 #endif
6948 /* 6948 /*
6949 * The idle tasks have their own, simple scheduling class: 6949 * The idle tasks have their own, simple scheduling class:
6950 */ 6950 */
6951 idle->sched_class = &idle_sched_class; 6951 idle->sched_class = &idle_sched_class;
6952 ftrace_graph_init_task(idle); 6952 ftrace_graph_init_task(idle);
6953 } 6953 }
6954 6954
6955 /* 6955 /*
6956 * In a system that switches off the HZ timer nohz_cpu_mask 6956 * In a system that switches off the HZ timer nohz_cpu_mask
6957 * indicates which cpus entered this state. This is used 6957 * indicates which cpus entered this state. This is used
6958 * in the rcu update to wait only for active cpus. For systems 6958 * in the rcu update to wait only for active cpus. For systems
6959 * which do not switch off the HZ timer nohz_cpu_mask should 6959 * which do not switch off the HZ timer nohz_cpu_mask should
6960 * always be CPU_BITS_NONE. 6960 * always be CPU_BITS_NONE.
6961 */ 6961 */
6962 cpumask_var_t nohz_cpu_mask; 6962 cpumask_var_t nohz_cpu_mask;
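
/*
 * A minimal sketch of the cpumask_var_t pattern used for masks like the one
 * declared above (kernel-internal context assumed; the mask name and call
 * sites are illustrative, not part of this diff).  The same helpers appear
 * throughout the hunks below.
 */
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>

static cpumask_var_t demo_mask;         /* hypothetical mask, for illustration */

static int demo_mask_init(void)
{
        if (!alloc_cpumask_var(&demo_mask, GFP_KERNEL))
                return -ENOMEM;         /* may allocate with CONFIG_CPUMASK_OFFSTACK */
        cpumask_clear(demo_mask);
        cpumask_set_cpu(0, demo_mask);  /* mark CPU0 as having entered the state */
        if (cpumask_test_cpu(0, demo_mask))
                ;                       /* a reader (e.g. RCU) could skip waiting on it */
        return 0;
}

static void demo_mask_exit(void)
{
        free_cpumask_var(demo_mask);
}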
6963 6963
6964 /* 6964 /*
6965 * Increase the granularity value when there are more CPUs, 6965 * Increase the granularity value when there are more CPUs,
6966 * because with more CPUs the 'effective latency' as visible 6966 * because with more CPUs the 'effective latency' as visible
6967 * to users decreases. But the relationship is not linear, 6967 * to users decreases. But the relationship is not linear,
6968 * so pick a second-best guess by going with the log2 of the 6968 * so pick a second-best guess by going with the log2 of the
6969 * number of CPUs. 6969 * number of CPUs.
6970 * 6970 *
6971 * This idea comes from the SD scheduler of Con Kolivas: 6971 * This idea comes from the SD scheduler of Con Kolivas:
6972 */ 6972 */
6973 static inline void sched_init_granularity(void) 6973 static inline void sched_init_granularity(void)
6974 { 6974 {
6975 unsigned int factor = 1 + ilog2(num_online_cpus()); 6975 unsigned int factor = 1 + ilog2(num_online_cpus());
6976 const unsigned long limit = 200000000; 6976 const unsigned long limit = 200000000;
6977 6977
6978 sysctl_sched_min_granularity *= factor; 6978 sysctl_sched_min_granularity *= factor;
6979 if (sysctl_sched_min_granularity > limit) 6979 if (sysctl_sched_min_granularity > limit)
6980 sysctl_sched_min_granularity = limit; 6980 sysctl_sched_min_granularity = limit;
6981 6981
6982 sysctl_sched_latency *= factor; 6982 sysctl_sched_latency *= factor;
6983 if (sysctl_sched_latency > limit) 6983 if (sysctl_sched_latency > limit)
6984 sysctl_sched_latency = limit; 6984 sysctl_sched_latency = limit;
6985 6985
6986 sysctl_sched_wakeup_granularity *= factor; 6986 sysctl_sched_wakeup_granularity *= factor;
6987 6987
6988 sysctl_sched_shares_ratelimit *= factor; 6988 sysctl_sched_shares_ratelimit *= factor;
6989 } 6989 }
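
/*
 * Worked numbers for the scaling above: factor = 1 + ilog2(ncpus), each
 * tunable is multiplied by factor and clamped at 200 ms (200000000 ns).
 * The 20 ms base value below is an illustrative assumption, not read from
 * this file, and ilog2() is open-coded so the sketch compiles standalone.
 */
#include <stdio.h>

static unsigned int ilog2_demo(unsigned int n)  /* floor(log2(n)), n >= 1 */
{
        unsigned int r = 0;

        while (n >>= 1)
                r++;
        return r;
}

int main(void)
{
        const unsigned long limit = 200000000UL;  /* 200 ms cap, as above */
        unsigned long base = 20000000UL;          /* assumed 20 ms default */
        unsigned int ncpus;

        for (ncpus = 1; ncpus <= 4096; ncpus *= 4) {
                unsigned int factor = 1 + ilog2_demo(ncpus);
                unsigned long scaled = base * factor;

                if (scaled > limit)
                        scaled = limit;
                printf("%4u CPUs -> factor %2u -> %9lu ns\n", ncpus, factor, scaled);
        }
        return 0;
}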
6990 6990
6991 #ifdef CONFIG_SMP 6991 #ifdef CONFIG_SMP
6992 /* 6992 /*
6993 * This is how migration works: 6993 * This is how migration works:
6994 * 6994 *
6995 * 1) we queue a struct migration_req structure in the source CPU's 6995 * 1) we queue a struct migration_req structure in the source CPU's
6996 * runqueue and wake up that CPU's migration thread. 6996 * runqueue and wake up that CPU's migration thread.
6997 * 2) we down() the locked semaphore => thread blocks. 6997 * 2) we down() the locked semaphore => thread blocks.
6998 * 3) migration thread wakes up (implicitly it forces the migrated 6998 * 3) migration thread wakes up (implicitly it forces the migrated
6999 * thread off the CPU) 6999 * thread off the CPU)
7000 * 4) it gets the migration request and checks whether the migrated 7000 * 4) it gets the migration request and checks whether the migrated
7001 * task is still in the wrong runqueue. 7001 * task is still in the wrong runqueue.
7002 * 5) if it's in the wrong runqueue then the migration thread removes 7002 * 5) if it's in the wrong runqueue then the migration thread removes
7003 * it and puts it into the right queue. 7003 * it and puts it into the right queue.
7004 * 6) migration thread up()s the semaphore. 7004 * 6) migration thread up()s the semaphore.
7005 * 7) we wake up and the migration is done. 7005 * 7) we wake up and the migration is done.
7006 */ 7006 */
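
/*
 * A minimal sketch of the request/completion handshake the steps above
 * describe, using the generic kernel completion API.  'struct demo_req'
 * and the helper names are illustrative, not the scheduler's own types;
 * the real caller below also juggles the runqueue lock.
 */
#include <linux/completion.h>
#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

struct demo_req {
        struct list_head list;
        struct completion done;
};

/* caller side: queue the request, kick the worker thread, block until done */
static void demo_submit(struct list_head *queue, struct task_struct *worker,
                        spinlock_t *lock)
{
        struct demo_req req;

        init_completion(&req.done);
        spin_lock_irq(lock);
        list_add_tail(&req.list, queue);        /* step 1: queue the request   */
        spin_unlock_irq(lock);
        wake_up_process(worker);                /* step 1: wake the worker     */
        wait_for_completion(&req.done);         /* steps 2 and 7: block, return */
}

/* worker side would list_del() the request, act on it, then complete(&req->done) */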
7007 7007
7008 /* 7008 /*
7009 * Change a given task's CPU affinity. Migrate the thread to a 7009 * Change a given task's CPU affinity. Migrate the thread to a
7010 * proper CPU and schedule it away if the CPU it's executing on 7010 * proper CPU and schedule it away if the CPU it's executing on
7011 * is removed from the allowed bitmask. 7011 * is removed from the allowed bitmask.
7012 * 7012 *
7013 * NOTE: the caller must have a valid reference to the task, the 7013 * NOTE: the caller must have a valid reference to the task, the
7014 * task must not exit() & deallocate itself prematurely. The 7014 * task must not exit() & deallocate itself prematurely. The
7015 * call is not atomic; no spinlocks may be held. 7015 * call is not atomic; no spinlocks may be held.
7016 */ 7016 */
7017 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 7017 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7018 { 7018 {
7019 struct migration_req req; 7019 struct migration_req req;
7020 unsigned long flags; 7020 unsigned long flags;
7021 struct rq *rq; 7021 struct rq *rq;
7022 int ret = 0; 7022 int ret = 0;
7023 7023
7024 rq = task_rq_lock(p, &flags); 7024 rq = task_rq_lock(p, &flags);
7025 if (!cpumask_intersects(new_mask, cpu_online_mask)) { 7025 if (!cpumask_intersects(new_mask, cpu_online_mask)) {
7026 ret = -EINVAL; 7026 ret = -EINVAL;
7027 goto out; 7027 goto out;
7028 } 7028 }
7029 7029
7030 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 7030 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
7031 !cpumask_equal(&p->cpus_allowed, new_mask))) { 7031 !cpumask_equal(&p->cpus_allowed, new_mask))) {
7032 ret = -EINVAL; 7032 ret = -EINVAL;
7033 goto out; 7033 goto out;
7034 } 7034 }
7035 7035
7036 if (p->sched_class->set_cpus_allowed) 7036 if (p->sched_class->set_cpus_allowed)
7037 p->sched_class->set_cpus_allowed(p, new_mask); 7037 p->sched_class->set_cpus_allowed(p, new_mask);
7038 else { 7038 else {
7039 cpumask_copy(&p->cpus_allowed, new_mask); 7039 cpumask_copy(&p->cpus_allowed, new_mask);
7040 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 7040 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
7041 } 7041 }
7042 7042
7043 /* Can the task run on the task's current CPU? If so, we're done */ 7043 /* Can the task run on the task's current CPU? If so, we're done */
7044 if (cpumask_test_cpu(task_cpu(p), new_mask)) 7044 if (cpumask_test_cpu(task_cpu(p), new_mask))
7045 goto out; 7045 goto out;
7046 7046
7047 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { 7047 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
7048 /* Need help from migration thread: drop lock and wait. */ 7048 /* Need help from migration thread: drop lock and wait. */
7049 struct task_struct *mt = rq->migration_thread; 7049 struct task_struct *mt = rq->migration_thread;
7050 7050
7051 get_task_struct(mt); 7051 get_task_struct(mt);
7052 task_rq_unlock(rq, &flags); 7052 task_rq_unlock(rq, &flags);
7053 wake_up_process(rq->migration_thread); 7053 wake_up_process(rq->migration_thread);
7054 put_task_struct(mt); 7054 put_task_struct(mt);
7055 wait_for_completion(&req.done); 7055 wait_for_completion(&req.done);
7056 tlb_migrate_finish(p->mm); 7056 tlb_migrate_finish(p->mm);
7057 return 0; 7057 return 0;
7058 } 7058 }
7059 out: 7059 out:
7060 task_rq_unlock(rq, &flags); 7060 task_rq_unlock(rq, &flags);
7061 7061
7062 return ret; 7062 return ret;
7063 } 7063 }
7064 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 7064 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
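
/*
 * A sketch of how a caller might use the export above to pin a task to a
 * single CPU.  The task pointer and target CPU are placeholders, and error
 * handling is reduced to the return code (-EINVAL if 'cpu' is offline, per
 * the checks in the function above).
 */
#include <linux/cpumask.h>
#include <linux/sched.h>

static int demo_pin_task(struct task_struct *p, int cpu)
{
        return set_cpus_allowed_ptr(p, cpumask_of(cpu));
}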
7065 7065
7066 /* 7066 /*
7067 * Move (not current) task off this cpu, onto dest cpu. We're doing 7067 * Move (not current) task off this cpu, onto dest cpu. We're doing
7068 * this because either it can't run here any more (set_cpus_allowed() 7068 * this because either it can't run here any more (set_cpus_allowed()
7069 * away from this CPU, or CPU going down), or because we're 7069 * away from this CPU, or CPU going down), or because we're
7070 * attempting to rebalance this task on exec (sched_exec). 7070 * attempting to rebalance this task on exec (sched_exec).
7071 * 7071 *
7072 * So we race with normal scheduler movements, but that's OK, as long 7072 * So we race with normal scheduler movements, but that's OK, as long
7073 * as the task is no longer on this CPU. 7073 * as the task is no longer on this CPU.
7074 * 7074 *
7075 * Returns non-zero if task was successfully migrated. 7075 * Returns non-zero if task was successfully migrated.
7076 */ 7076 */
7077 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 7077 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7078 { 7078 {
7079 struct rq *rq_dest, *rq_src; 7079 struct rq *rq_dest, *rq_src;
7080 int ret = 0, on_rq; 7080 int ret = 0, on_rq;
7081 7081
7082 if (unlikely(!cpu_active(dest_cpu))) 7082 if (unlikely(!cpu_active(dest_cpu)))
7083 return ret; 7083 return ret;
7084 7084
7085 rq_src = cpu_rq(src_cpu); 7085 rq_src = cpu_rq(src_cpu);
7086 rq_dest = cpu_rq(dest_cpu); 7086 rq_dest = cpu_rq(dest_cpu);
7087 7087
7088 double_rq_lock(rq_src, rq_dest); 7088 double_rq_lock(rq_src, rq_dest);
7089 /* Already moved. */ 7089 /* Already moved. */
7090 if (task_cpu(p) != src_cpu) 7090 if (task_cpu(p) != src_cpu)
7091 goto done; 7091 goto done;
7092 /* Affinity changed (again). */ 7092 /* Affinity changed (again). */
7093 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 7093 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7094 goto fail; 7094 goto fail;
7095 7095
7096 on_rq = p->se.on_rq; 7096 on_rq = p->se.on_rq;
7097 if (on_rq) 7097 if (on_rq)
7098 deactivate_task(rq_src, p, 0); 7098 deactivate_task(rq_src, p, 0);
7099 7099
7100 set_task_cpu(p, dest_cpu); 7100 set_task_cpu(p, dest_cpu);
7101 if (on_rq) { 7101 if (on_rq) {
7102 activate_task(rq_dest, p, 0); 7102 activate_task(rq_dest, p, 0);
7103 check_preempt_curr(rq_dest, p, 0); 7103 check_preempt_curr(rq_dest, p, 0);
7104 } 7104 }
7105 done: 7105 done:
7106 ret = 1; 7106 ret = 1;
7107 fail: 7107 fail:
7108 double_rq_unlock(rq_src, rq_dest); 7108 double_rq_unlock(rq_src, rq_dest);
7109 return ret; 7109 return ret;
7110 } 7110 }
7111 7111
7112 #define RCU_MIGRATION_IDLE 0 7112 #define RCU_MIGRATION_IDLE 0
7113 #define RCU_MIGRATION_NEED_QS 1 7113 #define RCU_MIGRATION_NEED_QS 1
7114 #define RCU_MIGRATION_GOT_QS 2 7114 #define RCU_MIGRATION_GOT_QS 2
7115 #define RCU_MIGRATION_MUST_SYNC 3 7115 #define RCU_MIGRATION_MUST_SYNC 3
7116 7116
7117 /* 7117 /*
7118 * migration_thread - this is a highprio system thread that performs 7118 * migration_thread - this is a highprio system thread that performs
7119 * thread migration by bumping thread off CPU then 'pushing' onto 7119 * thread migration by bumping thread off CPU then 'pushing' onto
7120 * another runqueue. 7120 * another runqueue.
7121 */ 7121 */
7122 static int migration_thread(void *data) 7122 static int migration_thread(void *data)
7123 { 7123 {
7124 int badcpu; 7124 int badcpu;
7125 int cpu = (long)data; 7125 int cpu = (long)data;
7126 struct rq *rq; 7126 struct rq *rq;
7127 7127
7128 rq = cpu_rq(cpu); 7128 rq = cpu_rq(cpu);
7129 BUG_ON(rq->migration_thread != current); 7129 BUG_ON(rq->migration_thread != current);
7130 7130
7131 set_current_state(TASK_INTERRUPTIBLE); 7131 set_current_state(TASK_INTERRUPTIBLE);
7132 while (!kthread_should_stop()) { 7132 while (!kthread_should_stop()) {
7133 struct migration_req *req; 7133 struct migration_req *req;
7134 struct list_head *head; 7134 struct list_head *head;
7135 7135
7136 spin_lock_irq(&rq->lock); 7136 spin_lock_irq(&rq->lock);
7137 7137
7138 if (cpu_is_offline(cpu)) { 7138 if (cpu_is_offline(cpu)) {
7139 spin_unlock_irq(&rq->lock); 7139 spin_unlock_irq(&rq->lock);
7140 break; 7140 break;
7141 } 7141 }
7142 7142
7143 if (rq->active_balance) { 7143 if (rq->active_balance) {
7144 active_load_balance(rq, cpu); 7144 active_load_balance(rq, cpu);
7145 rq->active_balance = 0; 7145 rq->active_balance = 0;
7146 } 7146 }
7147 7147
7148 head = &rq->migration_queue; 7148 head = &rq->migration_queue;
7149 7149
7150 if (list_empty(head)) { 7150 if (list_empty(head)) {
7151 spin_unlock_irq(&rq->lock); 7151 spin_unlock_irq(&rq->lock);
7152 schedule(); 7152 schedule();
7153 set_current_state(TASK_INTERRUPTIBLE); 7153 set_current_state(TASK_INTERRUPTIBLE);
7154 continue; 7154 continue;
7155 } 7155 }
7156 req = list_entry(head->next, struct migration_req, list); 7156 req = list_entry(head->next, struct migration_req, list);
7157 list_del_init(head->next); 7157 list_del_init(head->next);
7158 7158
7159 if (req->task != NULL) { 7159 if (req->task != NULL) {
7160 spin_unlock(&rq->lock); 7160 spin_unlock(&rq->lock);
7161 __migrate_task(req->task, cpu, req->dest_cpu); 7161 __migrate_task(req->task, cpu, req->dest_cpu);
7162 } else if (likely(cpu == (badcpu = smp_processor_id()))) { 7162 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
7163 req->dest_cpu = RCU_MIGRATION_GOT_QS; 7163 req->dest_cpu = RCU_MIGRATION_GOT_QS;
7164 spin_unlock(&rq->lock); 7164 spin_unlock(&rq->lock);
7165 } else { 7165 } else {
7166 req->dest_cpu = RCU_MIGRATION_MUST_SYNC; 7166 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
7167 spin_unlock(&rq->lock); 7167 spin_unlock(&rq->lock);
7168 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); 7168 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
7169 } 7169 }
7170 local_irq_enable(); 7170 local_irq_enable();
7171 7171
7172 complete(&req->done); 7172 complete(&req->done);
7173 } 7173 }
7174 __set_current_state(TASK_RUNNING); 7174 __set_current_state(TASK_RUNNING);
7175 7175
7176 return 0; 7176 return 0;
7177 } 7177 }
7178 7178
7179 #ifdef CONFIG_HOTPLUG_CPU 7179 #ifdef CONFIG_HOTPLUG_CPU
7180 7180
7181 static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu) 7181 static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
7182 { 7182 {
7183 int ret; 7183 int ret;
7184 7184
7185 local_irq_disable(); 7185 local_irq_disable();
7186 ret = __migrate_task(p, src_cpu, dest_cpu); 7186 ret = __migrate_task(p, src_cpu, dest_cpu);
7187 local_irq_enable(); 7187 local_irq_enable();
7188 return ret; 7188 return ret;
7189 } 7189 }
7190 7190
7191 /* 7191 /*
7192 * Figure out where task on dead CPU should go, use force if necessary. 7192 * Figure out where task on dead CPU should go, use force if necessary.
7193 */ 7193 */
7194 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 7194 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
7195 { 7195 {
7196 int dest_cpu; 7196 int dest_cpu;
7197 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu)); 7197 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
7198 7198
7199 again: 7199 again:
7200 /* Look for allowed, online CPU in same node. */ 7200 /* Look for allowed, online CPU in same node. */
7201 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask) 7201 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
7202 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 7202 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7203 goto move; 7203 goto move;
7204 7204
7205 /* Any allowed, online CPU? */ 7205 /* Any allowed, online CPU? */
7206 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask); 7206 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
7207 if (dest_cpu < nr_cpu_ids) 7207 if (dest_cpu < nr_cpu_ids)
7208 goto move; 7208 goto move;
7209 7209
7210 /* No more Mr. Nice Guy. */ 7210 /* No more Mr. Nice Guy. */
7211 if (dest_cpu >= nr_cpu_ids) { 7211 if (dest_cpu >= nr_cpu_ids) {
7212 cpuset_cpus_allowed_locked(p, &p->cpus_allowed); 7212 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
7213 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed); 7213 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
7214 7214
7215 /* 7215 /*
7216 * Don't tell them about moving exiting tasks or 7216 * Don't tell them about moving exiting tasks or
7217 * kernel threads (both mm NULL), since they never 7217 * kernel threads (both mm NULL), since they never
7218 * leave kernel. 7218 * leave kernel.
7219 */ 7219 */
7220 if (p->mm && printk_ratelimit()) { 7220 if (p->mm && printk_ratelimit()) {
7221 printk(KERN_INFO "process %d (%s) no " 7221 printk(KERN_INFO "process %d (%s) no "
7222 "longer affine to cpu%d\n", 7222 "longer affine to cpu%d\n",
7223 task_pid_nr(p), p->comm, dead_cpu); 7223 task_pid_nr(p), p->comm, dead_cpu);
7224 } 7224 }
7225 } 7225 }
7226 7226
7227 move: 7227 move:
7228 /* It can have affinity changed while we were choosing. */ 7228 /* It can have affinity changed while we were choosing. */
7229 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 7229 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
7230 goto again; 7230 goto again;
7231 } 7231 }
7232 7232
7233 /* 7233 /*
7234 * While a dead CPU has no uninterruptible tasks queued at this point, 7234 * While a dead CPU has no uninterruptible tasks queued at this point,
7235 * it might still have a nonzero ->nr_uninterruptible counter, because 7235 * it might still have a nonzero ->nr_uninterruptible counter, because
7236 * for performance reasons the counter is not strictly tracking tasks to 7236 * for performance reasons the counter is not strictly tracking tasks to
7237 * their home CPUs. So we just add the counter to another CPU's counter, 7237 * their home CPUs. So we just add the counter to another CPU's counter,
7238 * to keep the global sum constant after CPU-down: 7238 * to keep the global sum constant after CPU-down:
7239 */ 7239 */
7240 static void migrate_nr_uninterruptible(struct rq *rq_src) 7240 static void migrate_nr_uninterruptible(struct rq *rq_src)
7241 { 7241 {
7242 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); 7242 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
7243 unsigned long flags; 7243 unsigned long flags;
7244 7244
7245 local_irq_save(flags); 7245 local_irq_save(flags);
7246 double_rq_lock(rq_src, rq_dest); 7246 double_rq_lock(rq_src, rq_dest);
7247 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 7247 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
7248 rq_src->nr_uninterruptible = 0; 7248 rq_src->nr_uninterruptible = 0;
7249 double_rq_unlock(rq_src, rq_dest); 7249 double_rq_unlock(rq_src, rq_dest);
7250 local_irq_restore(flags); 7250 local_irq_restore(flags);
7251 } 7251 }
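
/*
 * Worked numbers for the comment above: if the dead CPU's runqueue had
 * nr_uninterruptible == 3 and the chosen online CPU had 5, the transfer
 * leaves 0 and 8, so the sum across runqueues (what the load-average code
 * consumes) is still 8.  The values are illustrative only.
 */
#include <stdio.h>

int main(void)
{
        long src = 3, dest = 5;         /* assumed per-runqueue counters */
        long before = src + dest;

        dest += src;                    /* rq_dest->nr_uninterruptible += ... */
        src = 0;                        /* rq_src->nr_uninterruptible = 0     */
        printf("sum before=%ld after=%ld\n", before, src + dest);
        return 0;
}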
7252 7252
7253 /* Run through task list and migrate tasks from the dead cpu. */ 7253 /* Run through task list and migrate tasks from the dead cpu. */
7254 static void migrate_live_tasks(int src_cpu) 7254 static void migrate_live_tasks(int src_cpu)
7255 { 7255 {
7256 struct task_struct *p, *t; 7256 struct task_struct *p, *t;
7257 7257
7258 read_lock(&tasklist_lock); 7258 read_lock(&tasklist_lock);
7259 7259
7260 do_each_thread(t, p) { 7260 do_each_thread(t, p) {
7261 if (p == current) 7261 if (p == current)
7262 continue; 7262 continue;
7263 7263
7264 if (task_cpu(p) == src_cpu) 7264 if (task_cpu(p) == src_cpu)
7265 move_task_off_dead_cpu(src_cpu, p); 7265 move_task_off_dead_cpu(src_cpu, p);
7266 } while_each_thread(t, p); 7266 } while_each_thread(t, p);
7267 7267
7268 read_unlock(&tasklist_lock); 7268 read_unlock(&tasklist_lock);
7269 } 7269 }
7270 7270
7271 /* 7271 /*
7272 * Schedules idle task to be the next runnable task on current CPU. 7272 * Schedules idle task to be the next runnable task on current CPU.
7273 * It does so by boosting its priority to highest possible. 7273 * It does so by boosting its priority to highest possible.
7274 * Used by CPU offline code. 7274 * Used by CPU offline code.
7275 */ 7275 */
7276 void sched_idle_next(void) 7276 void sched_idle_next(void)
7277 { 7277 {
7278 int this_cpu = smp_processor_id(); 7278 int this_cpu = smp_processor_id();
7279 struct rq *rq = cpu_rq(this_cpu); 7279 struct rq *rq = cpu_rq(this_cpu);
7280 struct task_struct *p = rq->idle; 7280 struct task_struct *p = rq->idle;
7281 unsigned long flags; 7281 unsigned long flags;
7282 7282
7283 /* cpu has to be offline */ 7283 /* cpu has to be offline */
7284 BUG_ON(cpu_online(this_cpu)); 7284 BUG_ON(cpu_online(this_cpu));
7285 7285
7286 /* 7286 /*
7287 * Strictly not necessary since rest of the CPUs are stopped by now 7287 * Strictly not necessary since rest of the CPUs are stopped by now
7288 * and interrupts disabled on the current cpu. 7288 * and interrupts disabled on the current cpu.
7289 */ 7289 */
7290 spin_lock_irqsave(&rq->lock, flags); 7290 spin_lock_irqsave(&rq->lock, flags);
7291 7291
7292 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 7292 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7293 7293
7294 update_rq_clock(rq); 7294 update_rq_clock(rq);
7295 activate_task(rq, p, 0); 7295 activate_task(rq, p, 0);
7296 7296
7297 spin_unlock_irqrestore(&rq->lock, flags); 7297 spin_unlock_irqrestore(&rq->lock, flags);
7298 } 7298 }
7299 7299
7300 /* 7300 /*
7301 * Ensures that the idle task is using init_mm right before its cpu goes 7301 * Ensures that the idle task is using init_mm right before its cpu goes
7302 * offline. 7302 * offline.
7303 */ 7303 */
7304 void idle_task_exit(void) 7304 void idle_task_exit(void)
7305 { 7305 {
7306 struct mm_struct *mm = current->active_mm; 7306 struct mm_struct *mm = current->active_mm;
7307 7307
7308 BUG_ON(cpu_online(smp_processor_id())); 7308 BUG_ON(cpu_online(smp_processor_id()));
7309 7309
7310 if (mm != &init_mm) 7310 if (mm != &init_mm)
7311 switch_mm(mm, &init_mm, current); 7311 switch_mm(mm, &init_mm, current);
7312 mmdrop(mm); 7312 mmdrop(mm);
7313 } 7313 }
7314 7314
7315 /* called under rq->lock with disabled interrupts */ 7315 /* called under rq->lock with disabled interrupts */
7316 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) 7316 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
7317 { 7317 {
7318 struct rq *rq = cpu_rq(dead_cpu); 7318 struct rq *rq = cpu_rq(dead_cpu);
7319 7319
7320 /* Must be exiting, otherwise would be on tasklist. */ 7320 /* Must be exiting, otherwise would be on tasklist. */
7321 BUG_ON(!p->exit_state); 7321 BUG_ON(!p->exit_state);
7322 7322
7323 /* Cannot have done final schedule yet: would have vanished. */ 7323 /* Cannot have done final schedule yet: would have vanished. */
7324 BUG_ON(p->state == TASK_DEAD); 7324 BUG_ON(p->state == TASK_DEAD);
7325 7325
7326 get_task_struct(p); 7326 get_task_struct(p);
7327 7327
7328 /* 7328 /*
7329 * Drop lock around migration; if someone else moves it, 7329 * Drop lock around migration; if someone else moves it,
7330 * that's OK. No task can be added to this CPU, so iteration is 7330 * that's OK. No task can be added to this CPU, so iteration is
7331 * fine. 7331 * fine.
7332 */ 7332 */
7333 spin_unlock_irq(&rq->lock); 7333 spin_unlock_irq(&rq->lock);
7334 move_task_off_dead_cpu(dead_cpu, p); 7334 move_task_off_dead_cpu(dead_cpu, p);
7335 spin_lock_irq(&rq->lock); 7335 spin_lock_irq(&rq->lock);
7336 7336
7337 put_task_struct(p); 7337 put_task_struct(p);
7338 } 7338 }
7339 7339
7340 /* release_task() removes task from tasklist, so we won't find dead tasks. */ 7340 /* release_task() removes task from tasklist, so we won't find dead tasks. */
7341 static void migrate_dead_tasks(unsigned int dead_cpu) 7341 static void migrate_dead_tasks(unsigned int dead_cpu)
7342 { 7342 {
7343 struct rq *rq = cpu_rq(dead_cpu); 7343 struct rq *rq = cpu_rq(dead_cpu);
7344 struct task_struct *next; 7344 struct task_struct *next;
7345 7345
7346 for ( ; ; ) { 7346 for ( ; ; ) {
7347 if (!rq->nr_running) 7347 if (!rq->nr_running)
7348 break; 7348 break;
7349 update_rq_clock(rq); 7349 update_rq_clock(rq);
7350 next = pick_next_task(rq); 7350 next = pick_next_task(rq);
7351 if (!next) 7351 if (!next)
7352 break; 7352 break;
7353 next->sched_class->put_prev_task(rq, next); 7353 next->sched_class->put_prev_task(rq, next);
7354 migrate_dead(dead_cpu, next); 7354 migrate_dead(dead_cpu, next);
7355 7355
7356 } 7356 }
7357 } 7357 }
7358 7358
7359 /* 7359 /*
7360 * remove the tasks which were accounted by rq from calc_load_tasks. 7360 * remove the tasks which were accounted by rq from calc_load_tasks.
7361 */ 7361 */
7362 static void calc_global_load_remove(struct rq *rq) 7362 static void calc_global_load_remove(struct rq *rq)
7363 { 7363 {
7364 atomic_long_sub(rq->calc_load_active, &calc_load_tasks); 7364 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
7365 rq->calc_load_active = 0; 7365 rq->calc_load_active = 0;
7366 } 7366 }
7367 #endif /* CONFIG_HOTPLUG_CPU */ 7367 #endif /* CONFIG_HOTPLUG_CPU */
7368 7368
7369 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 7369 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
7370 7370
7371 static struct ctl_table sd_ctl_dir[] = { 7371 static struct ctl_table sd_ctl_dir[] = {
7372 { 7372 {
7373 .procname = "sched_domain", 7373 .procname = "sched_domain",
7374 .mode = 0555, 7374 .mode = 0555,
7375 }, 7375 },
7376 {0, }, 7376 {}
7377 }; 7377 };
7378 7378
7379 static struct ctl_table sd_ctl_root[] = { 7379 static struct ctl_table sd_ctl_root[] = {
7380 { 7380 {
7381 .ctl_name = CTL_KERN,
7382 .procname = "kernel", 7381 .procname = "kernel",
7383 .mode = 0555, 7382 .mode = 0555,
7384 .child = sd_ctl_dir, 7383 .child = sd_ctl_dir,
7385 }, 7384 },
7386 {0, }, 7385 {}
7387 }; 7386 };
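
/*
 * A minimal sketch of the post-binary-sysctl table style the hunk above
 * switches to: only .procname/.mode/.child (or .data/.maxlen/.proc_handler
 * for leaf entries) are set, the terminator is an empty entry, and no
 * .ctl_name or .strategy fields remain.  'demo_value' and the "demo"
 * directory are illustrative, not part of this commit.
 */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/sysctl.h>

static int demo_value;

static struct ctl_table demo_table[] = {
        {
                .procname       = "demo_value",
                .data           = &demo_value,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {}
};

static struct ctl_table demo_root[] = {
        {
                .procname       = "demo",
                .mode           = 0555,
                .child          = demo_table,
        },
        {}
};

static struct ctl_table_header *demo_header;

static int __init demo_sysctl_init(void)
{
        demo_header = register_sysctl_table(demo_root);
        return demo_header ? 0 : -ENOMEM;
}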
7388 7387
7389 static struct ctl_table *sd_alloc_ctl_entry(int n) 7388 static struct ctl_table *sd_alloc_ctl_entry(int n)
7390 { 7389 {
7391 struct ctl_table *entry = 7390 struct ctl_table *entry =
7392 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); 7391 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
7393 7392
7394 return entry; 7393 return entry;
7395 } 7394 }
7396 7395
7397 static void sd_free_ctl_entry(struct ctl_table **tablep) 7396 static void sd_free_ctl_entry(struct ctl_table **tablep)
7398 { 7397 {
7399 struct ctl_table *entry; 7398 struct ctl_table *entry;
7400 7399
7401 /* 7400 /*
7402 * In the intermediate directories, both the child directory and 7401 * In the intermediate directories, both the child directory and
7403 * procname are dynamically allocated and could fail but the mode 7402 * procname are dynamically allocated and could fail but the mode
7404 * will always be set. In the lowest directory the names are 7403 * will always be set. In the lowest directory the names are
7405 * static strings and all have proc handlers. 7404 * static strings and all have proc handlers.
7406 */ 7405 */
7407 for (entry = *tablep; entry->mode; entry++) { 7406 for (entry = *tablep; entry->mode; entry++) {
7408 if (entry->child) 7407 if (entry->child)
7409 sd_free_ctl_entry(&entry->child); 7408 sd_free_ctl_entry(&entry->child);
7410 if (entry->proc_handler == NULL) 7409 if (entry->proc_handler == NULL)
7411 kfree(entry->procname); 7410 kfree(entry->procname);
7412 } 7411 }
7413 7412
7414 kfree(*tablep); 7413 kfree(*tablep);
7415 *tablep = NULL; 7414 *tablep = NULL;
7416 } 7415 }
7417 7416
7418 static void 7417 static void
7419 set_table_entry(struct ctl_table *entry, 7418 set_table_entry(struct ctl_table *entry,
7420 const char *procname, void *data, int maxlen, 7419 const char *procname, void *data, int maxlen,
7421 mode_t mode, proc_handler *proc_handler) 7420 mode_t mode, proc_handler *proc_handler)
7422 { 7421 {
7423 entry->procname = procname; 7422 entry->procname = procname;
7424 entry->data = data; 7423 entry->data = data;
7425 entry->maxlen = maxlen; 7424 entry->maxlen = maxlen;
7426 entry->mode = mode; 7425 entry->mode = mode;
7427 entry->proc_handler = proc_handler; 7426 entry->proc_handler = proc_handler;
7428 } 7427 }
7429 7428
7430 static struct ctl_table * 7429 static struct ctl_table *
7431 sd_alloc_ctl_domain_table(struct sched_domain *sd) 7430 sd_alloc_ctl_domain_table(struct sched_domain *sd)
7432 { 7431 {
7433 struct ctl_table *table = sd_alloc_ctl_entry(13); 7432 struct ctl_table *table = sd_alloc_ctl_entry(13);
7434 7433
7435 if (table == NULL) 7434 if (table == NULL)
7436 return NULL; 7435 return NULL;
7437 7436
7438 set_table_entry(&table[0], "min_interval", &sd->min_interval, 7437 set_table_entry(&table[0], "min_interval", &sd->min_interval,
7439 sizeof(long), 0644, proc_doulongvec_minmax); 7438 sizeof(long), 0644, proc_doulongvec_minmax);
7440 set_table_entry(&table[1], "max_interval", &sd->max_interval, 7439 set_table_entry(&table[1], "max_interval", &sd->max_interval,
7441 sizeof(long), 0644, proc_doulongvec_minmax); 7440 sizeof(long), 0644, proc_doulongvec_minmax);
7442 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 7441 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
7443 sizeof(int), 0644, proc_dointvec_minmax); 7442 sizeof(int), 0644, proc_dointvec_minmax);
7444 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 7443 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
7445 sizeof(int), 0644, proc_dointvec_minmax); 7444 sizeof(int), 0644, proc_dointvec_minmax);
7446 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 7445 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
7447 sizeof(int), 0644, proc_dointvec_minmax); 7446 sizeof(int), 0644, proc_dointvec_minmax);
7448 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 7447 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
7449 sizeof(int), 0644, proc_dointvec_minmax); 7448 sizeof(int), 0644, proc_dointvec_minmax);
7450 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 7449 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
7451 sizeof(int), 0644, proc_dointvec_minmax); 7450 sizeof(int), 0644, proc_dointvec_minmax);
7452 set_table_entry(&table[7], "busy_factor", &sd->busy_factor, 7451 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
7453 sizeof(int), 0644, proc_dointvec_minmax); 7452 sizeof(int), 0644, proc_dointvec_minmax);
7454 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, 7453 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
7455 sizeof(int), 0644, proc_dointvec_minmax); 7454 sizeof(int), 0644, proc_dointvec_minmax);
7456 set_table_entry(&table[9], "cache_nice_tries", 7455 set_table_entry(&table[9], "cache_nice_tries",
7457 &sd->cache_nice_tries, 7456 &sd->cache_nice_tries,
7458 sizeof(int), 0644, proc_dointvec_minmax); 7457 sizeof(int), 0644, proc_dointvec_minmax);
7459 set_table_entry(&table[10], "flags", &sd->flags, 7458 set_table_entry(&table[10], "flags", &sd->flags,
7460 sizeof(int), 0644, proc_dointvec_minmax); 7459 sizeof(int), 0644, proc_dointvec_minmax);
7461 set_table_entry(&table[11], "name", sd->name, 7460 set_table_entry(&table[11], "name", sd->name,
7462 CORENAME_MAX_SIZE, 0444, proc_dostring); 7461 CORENAME_MAX_SIZE, 0444, proc_dostring);
7463 /* &table[12] is terminator */ 7462 /* &table[12] is terminator */
7464 7463
7465 return table; 7464 return table;
7466 } 7465 }
7467 7466
7468 static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 7467 static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
7469 { 7468 {
7470 struct ctl_table *entry, *table; 7469 struct ctl_table *entry, *table;
7471 struct sched_domain *sd; 7470 struct sched_domain *sd;
7472 int domain_num = 0, i; 7471 int domain_num = 0, i;
7473 char buf[32]; 7472 char buf[32];
7474 7473
7475 for_each_domain(cpu, sd) 7474 for_each_domain(cpu, sd)
7476 domain_num++; 7475 domain_num++;
7477 entry = table = sd_alloc_ctl_entry(domain_num + 1); 7476 entry = table = sd_alloc_ctl_entry(domain_num + 1);
7478 if (table == NULL) 7477 if (table == NULL)
7479 return NULL; 7478 return NULL;
7480 7479
7481 i = 0; 7480 i = 0;
7482 for_each_domain(cpu, sd) { 7481 for_each_domain(cpu, sd) {
7483 snprintf(buf, 32, "domain%d", i); 7482 snprintf(buf, 32, "domain%d", i);
7484 entry->procname = kstrdup(buf, GFP_KERNEL); 7483 entry->procname = kstrdup(buf, GFP_KERNEL);
7485 entry->mode = 0555; 7484 entry->mode = 0555;
7486 entry->child = sd_alloc_ctl_domain_table(sd); 7485 entry->child = sd_alloc_ctl_domain_table(sd);
7487 entry++; 7486 entry++;
7488 i++; 7487 i++;
7489 } 7488 }
7490 return table; 7489 return table;
7491 } 7490 }
7492 7491
7493 static struct ctl_table_header *sd_sysctl_header; 7492 static struct ctl_table_header *sd_sysctl_header;
7494 static void register_sched_domain_sysctl(void) 7493 static void register_sched_domain_sysctl(void)
7495 { 7494 {
7496 int i, cpu_num = num_online_cpus(); 7495 int i, cpu_num = num_online_cpus();
7497 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 7496 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
7498 char buf[32]; 7497 char buf[32];
7499 7498
7500 WARN_ON(sd_ctl_dir[0].child); 7499 WARN_ON(sd_ctl_dir[0].child);
7501 sd_ctl_dir[0].child = entry; 7500 sd_ctl_dir[0].child = entry;
7502 7501
7503 if (entry == NULL) 7502 if (entry == NULL)
7504 return; 7503 return;
7505 7504
7506 for_each_online_cpu(i) { 7505 for_each_online_cpu(i) {
7507 snprintf(buf, 32, "cpu%d", i); 7506 snprintf(buf, 32, "cpu%d", i);
7508 entry->procname = kstrdup(buf, GFP_KERNEL); 7507 entry->procname = kstrdup(buf, GFP_KERNEL);
7509 entry->mode = 0555; 7508 entry->mode = 0555;
7510 entry->child = sd_alloc_ctl_cpu_table(i); 7509 entry->child = sd_alloc_ctl_cpu_table(i);
7511 entry++; 7510 entry++;
7512 } 7511 }
7513 7512
7514 WARN_ON(sd_sysctl_header); 7513 WARN_ON(sd_sysctl_header);
7515 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 7514 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
7516 } 7515 }
7517 7516
7518 /* may be called multiple times per register */ 7517 /* may be called multiple times per register */
7519 static void unregister_sched_domain_sysctl(void) 7518 static void unregister_sched_domain_sysctl(void)
7520 { 7519 {
7521 if (sd_sysctl_header) 7520 if (sd_sysctl_header)
7522 unregister_sysctl_table(sd_sysctl_header); 7521 unregister_sysctl_table(sd_sysctl_header);
7523 sd_sysctl_header = NULL; 7522 sd_sysctl_header = NULL;
7524 if (sd_ctl_dir[0].child) 7523 if (sd_ctl_dir[0].child)
7525 sd_free_ctl_entry(&sd_ctl_dir[0].child); 7524 sd_free_ctl_entry(&sd_ctl_dir[0].child);
7526 } 7525 }
7527 #else 7526 #else
7528 static void register_sched_domain_sysctl(void) 7527 static void register_sched_domain_sysctl(void)
7529 { 7528 {
7530 } 7529 }
7531 static void unregister_sched_domain_sysctl(void) 7530 static void unregister_sched_domain_sysctl(void)
7532 { 7531 {
7533 } 7532 }
7534 #endif 7533 #endif
7535 7534
7536 static void set_rq_online(struct rq *rq) 7535 static void set_rq_online(struct rq *rq)
7537 { 7536 {
7538 if (!rq->online) { 7537 if (!rq->online) {
7539 const struct sched_class *class; 7538 const struct sched_class *class;
7540 7539
7541 cpumask_set_cpu(rq->cpu, rq->rd->online); 7540 cpumask_set_cpu(rq->cpu, rq->rd->online);
7542 rq->online = 1; 7541 rq->online = 1;
7543 7542
7544 for_each_class(class) { 7543 for_each_class(class) {
7545 if (class->rq_online) 7544 if (class->rq_online)
7546 class->rq_online(rq); 7545 class->rq_online(rq);
7547 } 7546 }
7548 } 7547 }
7549 } 7548 }
7550 7549
7551 static void set_rq_offline(struct rq *rq) 7550 static void set_rq_offline(struct rq *rq)
7552 { 7551 {
7553 if (rq->online) { 7552 if (rq->online) {
7554 const struct sched_class *class; 7553 const struct sched_class *class;
7555 7554
7556 for_each_class(class) { 7555 for_each_class(class) {
7557 if (class->rq_offline) 7556 if (class->rq_offline)
7558 class->rq_offline(rq); 7557 class->rq_offline(rq);
7559 } 7558 }
7560 7559
7561 cpumask_clear_cpu(rq->cpu, rq->rd->online); 7560 cpumask_clear_cpu(rq->cpu, rq->rd->online);
7562 rq->online = 0; 7561 rq->online = 0;
7563 } 7562 }
7564 } 7563 }
7565 7564
7566 /* 7565 /*
7567 * migration_call - callback that gets triggered when a CPU is added. 7566 * migration_call - callback that gets triggered when a CPU is added.
7568 * Here we can start up the necessary migration thread for the new CPU. 7567 * Here we can start up the necessary migration thread for the new CPU.
7569 */ 7568 */
7570 static int __cpuinit 7569 static int __cpuinit
7571 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 7570 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7572 { 7571 {
7573 struct task_struct *p; 7572 struct task_struct *p;
7574 int cpu = (long)hcpu; 7573 int cpu = (long)hcpu;
7575 unsigned long flags; 7574 unsigned long flags;
7576 struct rq *rq; 7575 struct rq *rq;
7577 7576
7578 switch (action) { 7577 switch (action) {
7579 7578
7580 case CPU_UP_PREPARE: 7579 case CPU_UP_PREPARE:
7581 case CPU_UP_PREPARE_FROZEN: 7580 case CPU_UP_PREPARE_FROZEN:
7582 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); 7581 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
7583 if (IS_ERR(p)) 7582 if (IS_ERR(p))
7584 return NOTIFY_BAD; 7583 return NOTIFY_BAD;
7585 kthread_bind(p, cpu); 7584 kthread_bind(p, cpu);
7586 /* Must be high prio: stop_machine expects to yield to it. */ 7585 /* Must be high prio: stop_machine expects to yield to it. */
7587 rq = task_rq_lock(p, &flags); 7586 rq = task_rq_lock(p, &flags);
7588 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 7587 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7589 task_rq_unlock(rq, &flags); 7588 task_rq_unlock(rq, &flags);
7590 get_task_struct(p); 7589 get_task_struct(p);
7591 cpu_rq(cpu)->migration_thread = p; 7590 cpu_rq(cpu)->migration_thread = p;
7592 rq->calc_load_update = calc_load_update; 7591 rq->calc_load_update = calc_load_update;
7593 break; 7592 break;
7594 7593
7595 case CPU_ONLINE: 7594 case CPU_ONLINE:
7596 case CPU_ONLINE_FROZEN: 7595 case CPU_ONLINE_FROZEN:
7597 /* Strictly unnecessary, as first user will wake it. */ 7596 /* Strictly unnecessary, as first user will wake it. */
7598 wake_up_process(cpu_rq(cpu)->migration_thread); 7597 wake_up_process(cpu_rq(cpu)->migration_thread);
7599 7598
7600 /* Update our root-domain */ 7599 /* Update our root-domain */
7601 rq = cpu_rq(cpu); 7600 rq = cpu_rq(cpu);
7602 spin_lock_irqsave(&rq->lock, flags); 7601 spin_lock_irqsave(&rq->lock, flags);
7603 if (rq->rd) { 7602 if (rq->rd) {
7604 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7603 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7605 7604
7606 set_rq_online(rq); 7605 set_rq_online(rq);
7607 } 7606 }
7608 spin_unlock_irqrestore(&rq->lock, flags); 7607 spin_unlock_irqrestore(&rq->lock, flags);
7609 break; 7608 break;
7610 7609
7611 #ifdef CONFIG_HOTPLUG_CPU 7610 #ifdef CONFIG_HOTPLUG_CPU
7612 case CPU_UP_CANCELED: 7611 case CPU_UP_CANCELED:
7613 case CPU_UP_CANCELED_FROZEN: 7612 case CPU_UP_CANCELED_FROZEN:
7614 if (!cpu_rq(cpu)->migration_thread) 7613 if (!cpu_rq(cpu)->migration_thread)
7615 break; 7614 break;
7616 /* Unbind it from offline cpu so it can run. Fall thru. */ 7615 /* Unbind it from offline cpu so it can run. Fall thru. */
7617 kthread_bind(cpu_rq(cpu)->migration_thread, 7616 kthread_bind(cpu_rq(cpu)->migration_thread,
7618 cpumask_any(cpu_online_mask)); 7617 cpumask_any(cpu_online_mask));
7619 kthread_stop(cpu_rq(cpu)->migration_thread); 7618 kthread_stop(cpu_rq(cpu)->migration_thread);
7620 put_task_struct(cpu_rq(cpu)->migration_thread); 7619 put_task_struct(cpu_rq(cpu)->migration_thread);
7621 cpu_rq(cpu)->migration_thread = NULL; 7620 cpu_rq(cpu)->migration_thread = NULL;
7622 break; 7621 break;
7623 7622
7624 case CPU_DEAD: 7623 case CPU_DEAD:
7625 case CPU_DEAD_FROZEN: 7624 case CPU_DEAD_FROZEN:
7626 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */ 7625 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
7627 migrate_live_tasks(cpu); 7626 migrate_live_tasks(cpu);
7628 rq = cpu_rq(cpu); 7627 rq = cpu_rq(cpu);
7629 kthread_stop(rq->migration_thread); 7628 kthread_stop(rq->migration_thread);
7630 put_task_struct(rq->migration_thread); 7629 put_task_struct(rq->migration_thread);
7631 rq->migration_thread = NULL; 7630 rq->migration_thread = NULL;
7632 /* Idle task back to normal (off runqueue, low prio) */ 7631 /* Idle task back to normal (off runqueue, low prio) */
7633 spin_lock_irq(&rq->lock); 7632 spin_lock_irq(&rq->lock);
7634 update_rq_clock(rq); 7633 update_rq_clock(rq);
7635 deactivate_task(rq, rq->idle, 0); 7634 deactivate_task(rq, rq->idle, 0);
7636 rq->idle->static_prio = MAX_PRIO; 7635 rq->idle->static_prio = MAX_PRIO;
7637 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 7636 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
7638 rq->idle->sched_class = &idle_sched_class; 7637 rq->idle->sched_class = &idle_sched_class;
7639 migrate_dead_tasks(cpu); 7638 migrate_dead_tasks(cpu);
7640 spin_unlock_irq(&rq->lock); 7639 spin_unlock_irq(&rq->lock);
7641 cpuset_unlock(); 7640 cpuset_unlock();
7642 migrate_nr_uninterruptible(rq); 7641 migrate_nr_uninterruptible(rq);
7643 BUG_ON(rq->nr_running != 0); 7642 BUG_ON(rq->nr_running != 0);
7644 calc_global_load_remove(rq); 7643 calc_global_load_remove(rq);
7645 /* 7644 /*
7646 * No need to migrate the tasks: it was best-effort if 7645 * No need to migrate the tasks: it was best-effort if
7647 * they didn't take sched_hotcpu_mutex. Just wake up 7646 * they didn't take sched_hotcpu_mutex. Just wake up
7648 * the requestors. 7647 * the requestors.
7649 */ 7648 */
7650 spin_lock_irq(&rq->lock); 7649 spin_lock_irq(&rq->lock);
7651 while (!list_empty(&rq->migration_queue)) { 7650 while (!list_empty(&rq->migration_queue)) {
7652 struct migration_req *req; 7651 struct migration_req *req;
7653 7652
7654 req = list_entry(rq->migration_queue.next, 7653 req = list_entry(rq->migration_queue.next,
7655 struct migration_req, list); 7654 struct migration_req, list);
7656 list_del_init(&req->list); 7655 list_del_init(&req->list);
7657 spin_unlock_irq(&rq->lock); 7656 spin_unlock_irq(&rq->lock);
7658 complete(&req->done); 7657 complete(&req->done);
7659 spin_lock_irq(&rq->lock); 7658 spin_lock_irq(&rq->lock);
7660 } 7659 }
7661 spin_unlock_irq(&rq->lock); 7660 spin_unlock_irq(&rq->lock);
7662 break; 7661 break;
7663 7662
7664 case CPU_DYING: 7663 case CPU_DYING:
7665 case CPU_DYING_FROZEN: 7664 case CPU_DYING_FROZEN:
7666 /* Update our root-domain */ 7665 /* Update our root-domain */
7667 rq = cpu_rq(cpu); 7666 rq = cpu_rq(cpu);
7668 spin_lock_irqsave(&rq->lock, flags); 7667 spin_lock_irqsave(&rq->lock, flags);
7669 if (rq->rd) { 7668 if (rq->rd) {
7670 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7669 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7671 set_rq_offline(rq); 7670 set_rq_offline(rq);
7672 } 7671 }
7673 spin_unlock_irqrestore(&rq->lock, flags); 7672 spin_unlock_irqrestore(&rq->lock, flags);
7674 break; 7673 break;
7675 #endif 7674 #endif
7676 } 7675 }
7677 return NOTIFY_OK; 7676 return NOTIFY_OK;
7678 } 7677 }
7679 7678
7680 /* 7679 /*
7681 * Register at high priority so that task migration (migrate_all_tasks) 7680 * Register at high priority so that task migration (migrate_all_tasks)
7682 * happens before everything else. This has to be lower priority than 7681 * happens before everything else. This has to be lower priority than
7683 * the notifier in the perf_event subsystem, though. 7682 * the notifier in the perf_event subsystem, though.
7684 */ 7683 */
7685 static struct notifier_block __cpuinitdata migration_notifier = { 7684 static struct notifier_block __cpuinitdata migration_notifier = {
7686 .notifier_call = migration_call, 7685 .notifier_call = migration_call,
7687 .priority = 10 7686 .priority = 10
7688 }; 7687 };
7689 7688
7690 static int __init migration_init(void) 7689 static int __init migration_init(void)
7691 { 7690 {
7692 void *cpu = (void *)(long)smp_processor_id(); 7691 void *cpu = (void *)(long)smp_processor_id();
7693 int err; 7692 int err;
7694 7693
7695 /* Start one for the boot CPU: */ 7694 /* Start one for the boot CPU: */
7696 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 7695 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
7697 BUG_ON(err == NOTIFY_BAD); 7696 BUG_ON(err == NOTIFY_BAD);
7698 migration_call(&migration_notifier, CPU_ONLINE, cpu); 7697 migration_call(&migration_notifier, CPU_ONLINE, cpu);
7699 register_cpu_notifier(&migration_notifier); 7698 register_cpu_notifier(&migration_notifier);
7700 7699
7701 return 0; 7700 return 0;
7702 } 7701 }
7703 early_initcall(migration_init); 7702 early_initcall(migration_init);
7704 #endif 7703 #endif
7705 7704
7706 #ifdef CONFIG_SMP 7705 #ifdef CONFIG_SMP
7707 7706
7708 #ifdef CONFIG_SCHED_DEBUG 7707 #ifdef CONFIG_SCHED_DEBUG
7709 7708
7710 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 7709 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7711 struct cpumask *groupmask) 7710 struct cpumask *groupmask)
7712 { 7711 {
7713 struct sched_group *group = sd->groups; 7712 struct sched_group *group = sd->groups;
7714 char str[256]; 7713 char str[256];
7715 7714
7716 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); 7715 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
7717 cpumask_clear(groupmask); 7716 cpumask_clear(groupmask);
7718 7717
7719 printk(KERN_DEBUG "%*s domain %d: ", level, "", level); 7718 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
7720 7719
7721 if (!(sd->flags & SD_LOAD_BALANCE)) { 7720 if (!(sd->flags & SD_LOAD_BALANCE)) {
7722 printk("does not load-balance\n"); 7721 printk("does not load-balance\n");
7723 if (sd->parent) 7722 if (sd->parent)
7724 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 7723 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
7725 " has parent"); 7724 " has parent");
7726 return -1; 7725 return -1;
7727 } 7726 }
7728 7727
7729 printk(KERN_CONT "span %s level %s\n", str, sd->name); 7728 printk(KERN_CONT "span %s level %s\n", str, sd->name);
7730 7729
7731 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 7730 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
7732 printk(KERN_ERR "ERROR: domain->span does not contain " 7731 printk(KERN_ERR "ERROR: domain->span does not contain "
7733 "CPU%d\n", cpu); 7732 "CPU%d\n", cpu);
7734 } 7733 }
7735 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { 7734 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
7736 printk(KERN_ERR "ERROR: domain->groups does not contain" 7735 printk(KERN_ERR "ERROR: domain->groups does not contain"
7737 " CPU%d\n", cpu); 7736 " CPU%d\n", cpu);
7738 } 7737 }
7739 7738
7740 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 7739 printk(KERN_DEBUG "%*s groups:", level + 1, "");
7741 do { 7740 do {
7742 if (!group) { 7741 if (!group) {
7743 printk("\n"); 7742 printk("\n");
7744 printk(KERN_ERR "ERROR: group is NULL\n"); 7743 printk(KERN_ERR "ERROR: group is NULL\n");
7745 break; 7744 break;
7746 } 7745 }
7747 7746
7748 if (!group->cpu_power) { 7747 if (!group->cpu_power) {
7749 printk(KERN_CONT "\n"); 7748 printk(KERN_CONT "\n");
7750 printk(KERN_ERR "ERROR: domain->cpu_power not " 7749 printk(KERN_ERR "ERROR: domain->cpu_power not "
7751 "set\n"); 7750 "set\n");
7752 break; 7751 break;
7753 } 7752 }
7754 7753
7755 if (!cpumask_weight(sched_group_cpus(group))) { 7754 if (!cpumask_weight(sched_group_cpus(group))) {
7756 printk(KERN_CONT "\n"); 7755 printk(KERN_CONT "\n");
7757 printk(KERN_ERR "ERROR: empty group\n"); 7756 printk(KERN_ERR "ERROR: empty group\n");
7758 break; 7757 break;
7759 } 7758 }
7760 7759
7761 if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 7760 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
7762 printk(KERN_CONT "\n"); 7761 printk(KERN_CONT "\n");
7763 printk(KERN_ERR "ERROR: repeated CPUs\n"); 7762 printk(KERN_ERR "ERROR: repeated CPUs\n");
7764 break; 7763 break;
7765 } 7764 }
7766 7765
7767 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 7766 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
7768 7767
7769 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 7768 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
7770 7769
7771 printk(KERN_CONT " %s", str); 7770 printk(KERN_CONT " %s", str);
7772 if (group->cpu_power != SCHED_LOAD_SCALE) { 7771 if (group->cpu_power != SCHED_LOAD_SCALE) {
7773 printk(KERN_CONT " (cpu_power = %d)", 7772 printk(KERN_CONT " (cpu_power = %d)",
7774 group->cpu_power); 7773 group->cpu_power);
7775 } 7774 }
7776 7775
7777 group = group->next; 7776 group = group->next;
7778 } while (group != sd->groups); 7777 } while (group != sd->groups);
7779 printk(KERN_CONT "\n"); 7778 printk(KERN_CONT "\n");
7780 7779
7781 if (!cpumask_equal(sched_domain_span(sd), groupmask)) 7780 if (!cpumask_equal(sched_domain_span(sd), groupmask))
7782 printk(KERN_ERR "ERROR: groups don't span domain->span\n"); 7781 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
7783 7782
7784 if (sd->parent && 7783 if (sd->parent &&
7785 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 7784 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
7786 printk(KERN_ERR "ERROR: parent span is not a superset " 7785 printk(KERN_ERR "ERROR: parent span is not a superset "
7787 "of domain->span\n"); 7786 "of domain->span\n");
7788 return 0; 7787 return 0;
7789 } 7788 }
7790 7789
7791 static void sched_domain_debug(struct sched_domain *sd, int cpu) 7790 static void sched_domain_debug(struct sched_domain *sd, int cpu)
7792 { 7791 {
7793 cpumask_var_t groupmask; 7792 cpumask_var_t groupmask;
7794 int level = 0; 7793 int level = 0;
7795 7794
7796 if (!sd) { 7795 if (!sd) {
7797 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 7796 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
7798 return; 7797 return;
7799 } 7798 }
7800 7799
7801 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 7800 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
7802 7801
7803 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) { 7802 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
7804 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n"); 7803 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
7805 return; 7804 return;
7806 } 7805 }
7807 7806
7808 for (;;) { 7807 for (;;) {
7809 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 7808 if (sched_domain_debug_one(sd, cpu, level, groupmask))
7810 break; 7809 break;
7811 level++; 7810 level++;
7812 sd = sd->parent; 7811 sd = sd->parent;
7813 if (!sd) 7812 if (!sd)
7814 break; 7813 break;
7815 } 7814 }
7816 free_cpumask_var(groupmask); 7815 free_cpumask_var(groupmask);
7817 } 7816 }
7818 #else /* !CONFIG_SCHED_DEBUG */ 7817 #else /* !CONFIG_SCHED_DEBUG */
7819 # define sched_domain_debug(sd, cpu) do { } while (0) 7818 # define sched_domain_debug(sd, cpu) do { } while (0)
7820 #endif /* CONFIG_SCHED_DEBUG */ 7819 #endif /* CONFIG_SCHED_DEBUG */
7821 7820
7822 static int sd_degenerate(struct sched_domain *sd) 7821 static int sd_degenerate(struct sched_domain *sd)
7823 { 7822 {
7824 if (cpumask_weight(sched_domain_span(sd)) == 1) 7823 if (cpumask_weight(sched_domain_span(sd)) == 1)
7825 return 1; 7824 return 1;
7826 7825
7827 /* Following flags need at least 2 groups */ 7826 /* Following flags need at least 2 groups */
7828 if (sd->flags & (SD_LOAD_BALANCE | 7827 if (sd->flags & (SD_LOAD_BALANCE |
7829 SD_BALANCE_NEWIDLE | 7828 SD_BALANCE_NEWIDLE |
7830 SD_BALANCE_FORK | 7829 SD_BALANCE_FORK |
7831 SD_BALANCE_EXEC | 7830 SD_BALANCE_EXEC |
7832 SD_SHARE_CPUPOWER | 7831 SD_SHARE_CPUPOWER |
7833 SD_SHARE_PKG_RESOURCES)) { 7832 SD_SHARE_PKG_RESOURCES)) {
7834 if (sd->groups != sd->groups->next) 7833 if (sd->groups != sd->groups->next)
7835 return 0; 7834 return 0;
7836 } 7835 }
7837 7836
7838 /* Following flags don't use groups */ 7837 /* Following flags don't use groups */
7839 if (sd->flags & (SD_WAKE_AFFINE)) 7838 if (sd->flags & (SD_WAKE_AFFINE))
7840 return 0; 7839 return 0;
7841 7840
7842 return 1; 7841 return 1;
7843 } 7842 }
7844 7843
7845 static int 7844 static int
7846 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 7845 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7847 { 7846 {
7848 unsigned long cflags = sd->flags, pflags = parent->flags; 7847 unsigned long cflags = sd->flags, pflags = parent->flags;
7849 7848
7850 if (sd_degenerate(parent)) 7849 if (sd_degenerate(parent))
7851 return 1; 7850 return 1;
7852 7851
7853 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 7852 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
7854 return 0; 7853 return 0;
7855 7854
7856 /* Flags needing groups don't count if only 1 group in parent */ 7855 /* Flags needing groups don't count if only 1 group in parent */
7857 if (parent->groups == parent->groups->next) { 7856 if (parent->groups == parent->groups->next) {
7858 pflags &= ~(SD_LOAD_BALANCE | 7857 pflags &= ~(SD_LOAD_BALANCE |
7859 SD_BALANCE_NEWIDLE | 7858 SD_BALANCE_NEWIDLE |
7860 SD_BALANCE_FORK | 7859 SD_BALANCE_FORK |
7861 SD_BALANCE_EXEC | 7860 SD_BALANCE_EXEC |
7862 SD_SHARE_CPUPOWER | 7861 SD_SHARE_CPUPOWER |
7863 SD_SHARE_PKG_RESOURCES); 7862 SD_SHARE_PKG_RESOURCES);
7864 if (nr_node_ids == 1) 7863 if (nr_node_ids == 1)
7865 pflags &= ~SD_SERIALIZE; 7864 pflags &= ~SD_SERIALIZE;
7866 } 7865 }
7867 if (~cflags & pflags) 7866 if (~cflags & pflags)
7868 return 0; 7867 return 0;
7869 7868
7870 return 1; 7869 return 1;
7871 } 7870 }
7872 7871
7873 static void free_rootdomain(struct root_domain *rd) 7872 static void free_rootdomain(struct root_domain *rd)
7874 { 7873 {
7875 cpupri_cleanup(&rd->cpupri); 7874 cpupri_cleanup(&rd->cpupri);
7876 7875
7877 free_cpumask_var(rd->rto_mask); 7876 free_cpumask_var(rd->rto_mask);
7878 free_cpumask_var(rd->online); 7877 free_cpumask_var(rd->online);
7879 free_cpumask_var(rd->span); 7878 free_cpumask_var(rd->span);
7880 kfree(rd); 7879 kfree(rd);
7881 } 7880 }
7882 7881
7883 static void rq_attach_root(struct rq *rq, struct root_domain *rd) 7882 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7884 { 7883 {
7885 struct root_domain *old_rd = NULL; 7884 struct root_domain *old_rd = NULL;
7886 unsigned long flags; 7885 unsigned long flags;
7887 7886
7888 spin_lock_irqsave(&rq->lock, flags); 7887 spin_lock_irqsave(&rq->lock, flags);
7889 7888
7890 if (rq->rd) { 7889 if (rq->rd) {
7891 old_rd = rq->rd; 7890 old_rd = rq->rd;
7892 7891
7893 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 7892 if (cpumask_test_cpu(rq->cpu, old_rd->online))
7894 set_rq_offline(rq); 7893 set_rq_offline(rq);
7895 7894
7896 cpumask_clear_cpu(rq->cpu, old_rd->span); 7895 cpumask_clear_cpu(rq->cpu, old_rd->span);
7897 7896
7898 /* 7897 /*
7899 * If we don't want to free the old_rd yet then 7898 * If we don't want to free the old_rd yet then
7900 * set old_rd to NULL to skip the freeing later 7899 * set old_rd to NULL to skip the freeing later
7901 * in this function: 7900 * in this function:
7902 */ 7901 */
7903 if (!atomic_dec_and_test(&old_rd->refcount)) 7902 if (!atomic_dec_and_test(&old_rd->refcount))
7904 old_rd = NULL; 7903 old_rd = NULL;
7905 } 7904 }
7906 7905
7907 atomic_inc(&rd->refcount); 7906 atomic_inc(&rd->refcount);
7908 rq->rd = rd; 7907 rq->rd = rd;
7909 7908
7910 cpumask_set_cpu(rq->cpu, rd->span); 7909 cpumask_set_cpu(rq->cpu, rd->span);
7911 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 7910 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
7912 set_rq_online(rq); 7911 set_rq_online(rq);
7913 7912
7914 spin_unlock_irqrestore(&rq->lock, flags); 7913 spin_unlock_irqrestore(&rq->lock, flags);
7915 7914
7916 if (old_rd) 7915 if (old_rd)
7917 free_rootdomain(old_rd); 7916 free_rootdomain(old_rd);
7918 } 7917 }
7919 7918
7920 static int init_rootdomain(struct root_domain *rd, bool bootmem) 7919 static int init_rootdomain(struct root_domain *rd, bool bootmem)
7921 { 7920 {
7922 gfp_t gfp = GFP_KERNEL; 7921 gfp_t gfp = GFP_KERNEL;
7923 7922
7924 memset(rd, 0, sizeof(*rd)); 7923 memset(rd, 0, sizeof(*rd));
7925 7924
7926 if (bootmem) 7925 if (bootmem)
7927 gfp = GFP_NOWAIT; 7926 gfp = GFP_NOWAIT;
7928 7927
7929 if (!alloc_cpumask_var(&rd->span, gfp)) 7928 if (!alloc_cpumask_var(&rd->span, gfp))
7930 goto out; 7929 goto out;
7931 if (!alloc_cpumask_var(&rd->online, gfp)) 7930 if (!alloc_cpumask_var(&rd->online, gfp))
7932 goto free_span; 7931 goto free_span;
7933 if (!alloc_cpumask_var(&rd->rto_mask, gfp)) 7932 if (!alloc_cpumask_var(&rd->rto_mask, gfp))
7934 goto free_online; 7933 goto free_online;
7935 7934
7936 if (cpupri_init(&rd->cpupri, bootmem) != 0) 7935 if (cpupri_init(&rd->cpupri, bootmem) != 0)
7937 goto free_rto_mask; 7936 goto free_rto_mask;
7938 return 0; 7937 return 0;
7939 7938
7940 free_rto_mask: 7939 free_rto_mask:
7941 free_cpumask_var(rd->rto_mask); 7940 free_cpumask_var(rd->rto_mask);
7942 free_online: 7941 free_online:
7943 free_cpumask_var(rd->online); 7942 free_cpumask_var(rd->online);
7944 free_span: 7943 free_span:
7945 free_cpumask_var(rd->span); 7944 free_cpumask_var(rd->span);
7946 out: 7945 out:
7947 return -ENOMEM; 7946 return -ENOMEM;
7948 } 7947 }
7949 7948
7950 static void init_defrootdomain(void) 7949 static void init_defrootdomain(void)
7951 { 7950 {
7952 init_rootdomain(&def_root_domain, true); 7951 init_rootdomain(&def_root_domain, true);
7953 7952
7954 atomic_set(&def_root_domain.refcount, 1); 7953 atomic_set(&def_root_domain.refcount, 1);
7955 } 7954 }
7956 7955
7957 static struct root_domain *alloc_rootdomain(void) 7956 static struct root_domain *alloc_rootdomain(void)
7958 { 7957 {
7959 struct root_domain *rd; 7958 struct root_domain *rd;
7960 7959
7961 rd = kmalloc(sizeof(*rd), GFP_KERNEL); 7960 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
7962 if (!rd) 7961 if (!rd)
7963 return NULL; 7962 return NULL;
7964 7963
7965 if (init_rootdomain(rd, false) != 0) { 7964 if (init_rootdomain(rd, false) != 0) {
7966 kfree(rd); 7965 kfree(rd);
7967 return NULL; 7966 return NULL;
7968 } 7967 }
7969 7968
7970 return rd; 7969 return rd;
7971 } 7970 }
7972 7971
7973 /* 7972 /*
7974 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 7973 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
7975 * hold the hotplug lock. 7974 * hold the hotplug lock.
7976 */ 7975 */
7977 static void 7976 static void
7978 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) 7977 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
7979 { 7978 {
7980 struct rq *rq = cpu_rq(cpu); 7979 struct rq *rq = cpu_rq(cpu);
7981 struct sched_domain *tmp; 7980 struct sched_domain *tmp;
7982 7981
7983 /* Remove the sched domains which do not contribute to scheduling. */ 7982 /* Remove the sched domains which do not contribute to scheduling. */
7984 for (tmp = sd; tmp; ) { 7983 for (tmp = sd; tmp; ) {
7985 struct sched_domain *parent = tmp->parent; 7984 struct sched_domain *parent = tmp->parent;
7986 if (!parent) 7985 if (!parent)
7987 break; 7986 break;
7988 7987
7989 if (sd_parent_degenerate(tmp, parent)) { 7988 if (sd_parent_degenerate(tmp, parent)) {
7990 tmp->parent = parent->parent; 7989 tmp->parent = parent->parent;
7991 if (parent->parent) 7990 if (parent->parent)
7992 parent->parent->child = tmp; 7991 parent->parent->child = tmp;
7993 } else 7992 } else
7994 tmp = tmp->parent; 7993 tmp = tmp->parent;
7995 } 7994 }
7996 7995
7997 if (sd && sd_degenerate(sd)) { 7996 if (sd && sd_degenerate(sd)) {
7998 sd = sd->parent; 7997 sd = sd->parent;
7999 if (sd) 7998 if (sd)
8000 sd->child = NULL; 7999 sd->child = NULL;
8001 } 8000 }
8002 8001
8003 sched_domain_debug(sd, cpu); 8002 sched_domain_debug(sd, cpu);
8004 8003
8005 rq_attach_root(rq, rd); 8004 rq_attach_root(rq, rd);
8006 rcu_assign_pointer(rq->sd, sd); 8005 rcu_assign_pointer(rq->sd, sd);
8007 } 8006 }
8008 8007
8009 /* cpus with isolated domains */ 8008 /* cpus with isolated domains */
8010 static cpumask_var_t cpu_isolated_map; 8009 static cpumask_var_t cpu_isolated_map;
8011 8010
8012 /* Set up the mask of cpus configured for isolated domains */ 8011 /* Set up the mask of cpus configured for isolated domains */
8013 static int __init isolated_cpu_setup(char *str) 8012 static int __init isolated_cpu_setup(char *str)
8014 { 8013 {
8015 cpulist_parse(str, cpu_isolated_map); 8014 cpulist_parse(str, cpu_isolated_map);
8016 return 1; 8015 return 1;
8017 } 8016 }
8018 8017
8019 __setup("isolcpus=", isolated_cpu_setup); 8018 __setup("isolcpus=", isolated_cpu_setup);
8020 8019
8021 /* 8020 /*
8022 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 8021 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
8023 * to a function which identifies what group (along with the sched group) a CPU 8022 * to a function which identifies what group (along with the sched group) a CPU
8024 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids 8023 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
8025 * (because we keep track of groups covered with a struct cpumask). 8024 * (because we keep track of groups covered with a struct cpumask).
8026 * 8025 *
8027 * init_sched_build_groups will build a circular linked list of the groups 8026 * init_sched_build_groups will build a circular linked list of the groups
8028 * covered by the given span, and will set each group's ->cpumask correctly, 8027 * covered by the given span, and will set each group's ->cpumask correctly,
8029 * and ->cpu_power to 0. 8028 * and ->cpu_power to 0.
8030 */ 8029 */
8031 static void 8030 static void
8032 init_sched_build_groups(const struct cpumask *span, 8031 init_sched_build_groups(const struct cpumask *span,
8033 const struct cpumask *cpu_map, 8032 const struct cpumask *cpu_map,
8034 int (*group_fn)(int cpu, const struct cpumask *cpu_map, 8033 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
8035 struct sched_group **sg, 8034 struct sched_group **sg,
8036 struct cpumask *tmpmask), 8035 struct cpumask *tmpmask),
8037 struct cpumask *covered, struct cpumask *tmpmask) 8036 struct cpumask *covered, struct cpumask *tmpmask)
8038 { 8037 {
8039 struct sched_group *first = NULL, *last = NULL; 8038 struct sched_group *first = NULL, *last = NULL;
8040 int i; 8039 int i;
8041 8040
8042 cpumask_clear(covered); 8041 cpumask_clear(covered);
8043 8042
8044 for_each_cpu(i, span) { 8043 for_each_cpu(i, span) {
8045 struct sched_group *sg; 8044 struct sched_group *sg;
8046 int group = group_fn(i, cpu_map, &sg, tmpmask); 8045 int group = group_fn(i, cpu_map, &sg, tmpmask);
8047 int j; 8046 int j;
8048 8047
8049 if (cpumask_test_cpu(i, covered)) 8048 if (cpumask_test_cpu(i, covered))
8050 continue; 8049 continue;
8051 8050
8052 cpumask_clear(sched_group_cpus(sg)); 8051 cpumask_clear(sched_group_cpus(sg));
8053 sg->cpu_power = 0; 8052 sg->cpu_power = 0;
8054 8053
8055 for_each_cpu(j, span) { 8054 for_each_cpu(j, span) {
8056 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 8055 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
8057 continue; 8056 continue;
8058 8057
8059 cpumask_set_cpu(j, covered); 8058 cpumask_set_cpu(j, covered);
8060 cpumask_set_cpu(j, sched_group_cpus(sg)); 8059 cpumask_set_cpu(j, sched_group_cpus(sg));
8061 } 8060 }
8062 if (!first) 8061 if (!first)
8063 first = sg; 8062 first = sg;
8064 if (last) 8063 if (last)
8065 last->next = sg; 8064 last->next = sg;
8066 last = sg; 8065 last = sg;
8067 } 8066 }
8068 last->next = first; 8067 last->next = first;
8069 } 8068 }
8070 8069
8071 #define SD_NODES_PER_DOMAIN 16 8070 #define SD_NODES_PER_DOMAIN 16
8072 8071
8073 #ifdef CONFIG_NUMA 8072 #ifdef CONFIG_NUMA
8074 8073
8075 /** 8074 /**
8076 * find_next_best_node - find the next node to include in a sched_domain 8075 * find_next_best_node - find the next node to include in a sched_domain
8077 * @node: node whose sched_domain we're building 8076 * @node: node whose sched_domain we're building
8078 * @used_nodes: nodes already in the sched_domain 8077 * @used_nodes: nodes already in the sched_domain
8079 * 8078 *
8080 * Find the next node to include in a given scheduling domain. Simply 8079 * Find the next node to include in a given scheduling domain. Simply
8081 * finds the closest node not already in the @used_nodes map. 8080 * finds the closest node not already in the @used_nodes map.
8082 * 8081 *
8083 * Should use nodemask_t. 8082 * Should use nodemask_t.
8084 */ 8083 */
8085 static int find_next_best_node(int node, nodemask_t *used_nodes) 8084 static int find_next_best_node(int node, nodemask_t *used_nodes)
8086 { 8085 {
8087 int i, n, val, min_val, best_node = 0; 8086 int i, n, val, min_val, best_node = 0;
8088 8087
8089 min_val = INT_MAX; 8088 min_val = INT_MAX;
8090 8089
8091 for (i = 0; i < nr_node_ids; i++) { 8090 for (i = 0; i < nr_node_ids; i++) {
8092 /* Start at @node */ 8091 /* Start at @node */
8093 n = (node + i) % nr_node_ids; 8092 n = (node + i) % nr_node_ids;
8094 8093
8095 if (!nr_cpus_node(n)) 8094 if (!nr_cpus_node(n))
8096 continue; 8095 continue;
8097 8096
8098 /* Skip already used nodes */ 8097 /* Skip already used nodes */
8099 if (node_isset(n, *used_nodes)) 8098 if (node_isset(n, *used_nodes))
8100 continue; 8099 continue;
8101 8100
8102 /* Simple min distance search */ 8101 /* Simple min distance search */
8103 val = node_distance(node, n); 8102 val = node_distance(node, n);
8104 8103
8105 if (val < min_val) { 8104 if (val < min_val) {
8106 min_val = val; 8105 min_val = val;
8107 best_node = n; 8106 best_node = n;
8108 } 8107 }
8109 } 8108 }
8110 8109
8111 node_set(best_node, *used_nodes); 8110 node_set(best_node, *used_nodes);
8112 return best_node; 8111 return best_node;
8113 } 8112 }
8114 8113
8115 /** 8114 /**
8116 * sched_domain_node_span - get a cpumask for a node's sched_domain 8115 * sched_domain_node_span - get a cpumask for a node's sched_domain
8117 * @node: node whose cpumask we're constructing 8116 * @node: node whose cpumask we're constructing
8118 * @span: resulting cpumask 8117 * @span: resulting cpumask
8119 * 8118 *
8120 * Given a node, construct a good cpumask for its sched_domain to span. It 8119 * Given a node, construct a good cpumask for its sched_domain to span. It
8121 * should be one that prevents unnecessary balancing, but also spreads tasks 8120 * should be one that prevents unnecessary balancing, but also spreads tasks
8122 * out optimally. 8121 * out optimally.
8123 */ 8122 */
8124 static void sched_domain_node_span(int node, struct cpumask *span) 8123 static void sched_domain_node_span(int node, struct cpumask *span)
8125 { 8124 {
8126 nodemask_t used_nodes; 8125 nodemask_t used_nodes;
8127 int i; 8126 int i;
8128 8127
8129 cpumask_clear(span); 8128 cpumask_clear(span);
8130 nodes_clear(used_nodes); 8129 nodes_clear(used_nodes);
8131 8130
8132 cpumask_or(span, span, cpumask_of_node(node)); 8131 cpumask_or(span, span, cpumask_of_node(node));
8133 node_set(node, used_nodes); 8132 node_set(node, used_nodes);
8134 8133
8135 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 8134 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
8136 int next_node = find_next_best_node(node, &used_nodes); 8135 int next_node = find_next_best_node(node, &used_nodes);
8137 8136
8138 cpumask_or(span, span, cpumask_of_node(next_node)); 8137 cpumask_or(span, span, cpumask_of_node(next_node));
8139 } 8138 }
8140 } 8139 }
8141 #endif /* CONFIG_NUMA */ 8140 #endif /* CONFIG_NUMA */
8142 8141
8143 int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 8142 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
8144 8143
8145 /* 8144 /*
8146 * The cpus mask in sched_group and sched_domain hangs off the end. 8145 * The cpus mask in sched_group and sched_domain hangs off the end.
8147 * 8146 *
8148 * ( See the comments in include/linux/sched.h:struct sched_group 8147 * ( See the comments in include/linux/sched.h:struct sched_group
8149 * and struct sched_domain. ) 8148 * and struct sched_domain. )
8150 */ 8149 */
8151 struct static_sched_group { 8150 struct static_sched_group {
8152 struct sched_group sg; 8151 struct sched_group sg;
8153 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); 8152 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
8154 }; 8153 };
8155 8154
8156 struct static_sched_domain { 8155 struct static_sched_domain {
8157 struct sched_domain sd; 8156 struct sched_domain sd;
8158 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 8157 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
8159 }; 8158 };
8160 8159
8161 struct s_data { 8160 struct s_data {
8162 #ifdef CONFIG_NUMA 8161 #ifdef CONFIG_NUMA
8163 int sd_allnodes; 8162 int sd_allnodes;
8164 cpumask_var_t domainspan; 8163 cpumask_var_t domainspan;
8165 cpumask_var_t covered; 8164 cpumask_var_t covered;
8166 cpumask_var_t notcovered; 8165 cpumask_var_t notcovered;
8167 #endif 8166 #endif
8168 cpumask_var_t nodemask; 8167 cpumask_var_t nodemask;
8169 cpumask_var_t this_sibling_map; 8168 cpumask_var_t this_sibling_map;
8170 cpumask_var_t this_core_map; 8169 cpumask_var_t this_core_map;
8171 cpumask_var_t send_covered; 8170 cpumask_var_t send_covered;
8172 cpumask_var_t tmpmask; 8171 cpumask_var_t tmpmask;
8173 struct sched_group **sched_group_nodes; 8172 struct sched_group **sched_group_nodes;
8174 struct root_domain *rd; 8173 struct root_domain *rd;
8175 }; 8174 };
8176 8175
8177 enum s_alloc { 8176 enum s_alloc {
8178 sa_sched_groups = 0, 8177 sa_sched_groups = 0,
8179 sa_rootdomain, 8178 sa_rootdomain,
8180 sa_tmpmask, 8179 sa_tmpmask,
8181 sa_send_covered, 8180 sa_send_covered,
8182 sa_this_core_map, 8181 sa_this_core_map,
8183 sa_this_sibling_map, 8182 sa_this_sibling_map,
8184 sa_nodemask, 8183 sa_nodemask,
8185 sa_sched_group_nodes, 8184 sa_sched_group_nodes,
8186 #ifdef CONFIG_NUMA 8185 #ifdef CONFIG_NUMA
8187 sa_notcovered, 8186 sa_notcovered,
8188 sa_covered, 8187 sa_covered,
8189 sa_domainspan, 8188 sa_domainspan,
8190 #endif 8189 #endif
8191 sa_none, 8190 sa_none,
8192 }; 8191 };
8193 8192
8194 /* 8193 /*
8195 * SMT sched-domains: 8194 * SMT sched-domains:
8196 */ 8195 */
8197 #ifdef CONFIG_SCHED_SMT 8196 #ifdef CONFIG_SCHED_SMT
8198 static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); 8197 static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
8199 static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); 8198 static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
8200 8199
8201 static int 8200 static int
8202 cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 8201 cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
8203 struct sched_group **sg, struct cpumask *unused) 8202 struct sched_group **sg, struct cpumask *unused)
8204 { 8203 {
8205 if (sg) 8204 if (sg)
8206 *sg = &per_cpu(sched_group_cpus, cpu).sg; 8205 *sg = &per_cpu(sched_group_cpus, cpu).sg;
8207 return cpu; 8206 return cpu;
8208 } 8207 }
8209 #endif /* CONFIG_SCHED_SMT */ 8208 #endif /* CONFIG_SCHED_SMT */
8210 8209
8211 /* 8210 /*
8212 * multi-core sched-domains: 8211 * multi-core sched-domains:
8213 */ 8212 */
8214 #ifdef CONFIG_SCHED_MC 8213 #ifdef CONFIG_SCHED_MC
8215 static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 8214 static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
8216 static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); 8215 static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
8217 #endif /* CONFIG_SCHED_MC */ 8216 #endif /* CONFIG_SCHED_MC */
8218 8217
8219 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 8218 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
8220 static int 8219 static int
8221 cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 8220 cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
8222 struct sched_group **sg, struct cpumask *mask) 8221 struct sched_group **sg, struct cpumask *mask)
8223 { 8222 {
8224 int group; 8223 int group;
8225 8224
8226 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 8225 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
8227 group = cpumask_first(mask); 8226 group = cpumask_first(mask);
8228 if (sg) 8227 if (sg)
8229 *sg = &per_cpu(sched_group_core, group).sg; 8228 *sg = &per_cpu(sched_group_core, group).sg;
8230 return group; 8229 return group;
8231 } 8230 }
8232 #elif defined(CONFIG_SCHED_MC) 8231 #elif defined(CONFIG_SCHED_MC)
8233 static int 8232 static int
8234 cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 8233 cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
8235 struct sched_group **sg, struct cpumask *unused) 8234 struct sched_group **sg, struct cpumask *unused)
8236 { 8235 {
8237 if (sg) 8236 if (sg)
8238 *sg = &per_cpu(sched_group_core, cpu).sg; 8237 *sg = &per_cpu(sched_group_core, cpu).sg;
8239 return cpu; 8238 return cpu;
8240 } 8239 }
8241 #endif 8240 #endif
8242 8241
8243 static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 8242 static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
8244 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 8243 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
8245 8244
8246 static int 8245 static int
8247 cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, 8246 cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
8248 struct sched_group **sg, struct cpumask *mask) 8247 struct sched_group **sg, struct cpumask *mask)
8249 { 8248 {
8250 int group; 8249 int group;
8251 #ifdef CONFIG_SCHED_MC 8250 #ifdef CONFIG_SCHED_MC
8252 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 8251 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
8253 group = cpumask_first(mask); 8252 group = cpumask_first(mask);
8254 #elif defined(CONFIG_SCHED_SMT) 8253 #elif defined(CONFIG_SCHED_SMT)
8255 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 8254 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
8256 group = cpumask_first(mask); 8255 group = cpumask_first(mask);
8257 #else 8256 #else
8258 group = cpu; 8257 group = cpu;
8259 #endif 8258 #endif
8260 if (sg) 8259 if (sg)
8261 *sg = &per_cpu(sched_group_phys, group).sg; 8260 *sg = &per_cpu(sched_group_phys, group).sg;
8262 return group; 8261 return group;
8263 } 8262 }
8264 8263
8265 #ifdef CONFIG_NUMA 8264 #ifdef CONFIG_NUMA
8266 /* 8265 /*
8267 * init_sched_build_groups() can't handle what we want to do with node 8266 * init_sched_build_groups() can't handle what we want to do with node
8268 * groups, so roll our own. Now each node has its own list of groups which 8267 * groups, so roll our own. Now each node has its own list of groups which
8269 * gets dynamically allocated. 8268 * gets dynamically allocated.
8270 */ 8269 */
8271 static DEFINE_PER_CPU(struct static_sched_domain, node_domains); 8270 static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
8272 static struct sched_group ***sched_group_nodes_bycpu; 8271 static struct sched_group ***sched_group_nodes_bycpu;
8273 8272
8274 static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains); 8273 static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
8275 static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes); 8274 static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
8276 8275
8277 static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map, 8276 static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
8278 struct sched_group **sg, 8277 struct sched_group **sg,
8279 struct cpumask *nodemask) 8278 struct cpumask *nodemask)
8280 { 8279 {
8281 int group; 8280 int group;
8282 8281
8283 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map); 8282 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
8284 group = cpumask_first(nodemask); 8283 group = cpumask_first(nodemask);
8285 8284
8286 if (sg) 8285 if (sg)
8287 *sg = &per_cpu(sched_group_allnodes, group).sg; 8286 *sg = &per_cpu(sched_group_allnodes, group).sg;
8288 return group; 8287 return group;
8289 } 8288 }
8290 8289
8291 static void init_numa_sched_groups_power(struct sched_group *group_head) 8290 static void init_numa_sched_groups_power(struct sched_group *group_head)
8292 { 8291 {
8293 struct sched_group *sg = group_head; 8292 struct sched_group *sg = group_head;
8294 int j; 8293 int j;
8295 8294
8296 if (!sg) 8295 if (!sg)
8297 return; 8296 return;
8298 do { 8297 do {
8299 for_each_cpu(j, sched_group_cpus(sg)) { 8298 for_each_cpu(j, sched_group_cpus(sg)) {
8300 struct sched_domain *sd; 8299 struct sched_domain *sd;
8301 8300
8302 sd = &per_cpu(phys_domains, j).sd; 8301 sd = &per_cpu(phys_domains, j).sd;
8303 if (j != group_first_cpu(sd->groups)) { 8302 if (j != group_first_cpu(sd->groups)) {
8304 /* 8303 /*
8305 * Only add "power" once for each 8304 * Only add "power" once for each
8306 * physical package. 8305 * physical package.
8307 */ 8306 */
8308 continue; 8307 continue;
8309 } 8308 }
8310 8309
8311 sg->cpu_power += sd->groups->cpu_power; 8310 sg->cpu_power += sd->groups->cpu_power;
8312 } 8311 }
8313 sg = sg->next; 8312 sg = sg->next;
8314 } while (sg != group_head); 8313 } while (sg != group_head);
8315 } 8314 }
8316 8315
8317 static int build_numa_sched_groups(struct s_data *d, 8316 static int build_numa_sched_groups(struct s_data *d,
8318 const struct cpumask *cpu_map, int num) 8317 const struct cpumask *cpu_map, int num)
8319 { 8318 {
8320 struct sched_domain *sd; 8319 struct sched_domain *sd;
8321 struct sched_group *sg, *prev; 8320 struct sched_group *sg, *prev;
8322 int n, j; 8321 int n, j;
8323 8322
8324 cpumask_clear(d->covered); 8323 cpumask_clear(d->covered);
8325 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); 8324 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
8326 if (cpumask_empty(d->nodemask)) { 8325 if (cpumask_empty(d->nodemask)) {
8327 d->sched_group_nodes[num] = NULL; 8326 d->sched_group_nodes[num] = NULL;
8328 goto out; 8327 goto out;
8329 } 8328 }
8330 8329
8331 sched_domain_node_span(num, d->domainspan); 8330 sched_domain_node_span(num, d->domainspan);
8332 cpumask_and(d->domainspan, d->domainspan, cpu_map); 8331 cpumask_and(d->domainspan, d->domainspan, cpu_map);
8333 8332
8334 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 8333 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8335 GFP_KERNEL, num); 8334 GFP_KERNEL, num);
8336 if (!sg) { 8335 if (!sg) {
8337 printk(KERN_WARNING "Can not alloc domain group for node %d\n", 8336 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
8338 num); 8337 num);
8339 return -ENOMEM; 8338 return -ENOMEM;
8340 } 8339 }
8341 d->sched_group_nodes[num] = sg; 8340 d->sched_group_nodes[num] = sg;
8342 8341
8343 for_each_cpu(j, d->nodemask) { 8342 for_each_cpu(j, d->nodemask) {
8344 sd = &per_cpu(node_domains, j).sd; 8343 sd = &per_cpu(node_domains, j).sd;
8345 sd->groups = sg; 8344 sd->groups = sg;
8346 } 8345 }
8347 8346
8348 sg->cpu_power = 0; 8347 sg->cpu_power = 0;
8349 cpumask_copy(sched_group_cpus(sg), d->nodemask); 8348 cpumask_copy(sched_group_cpus(sg), d->nodemask);
8350 sg->next = sg; 8349 sg->next = sg;
8351 cpumask_or(d->covered, d->covered, d->nodemask); 8350 cpumask_or(d->covered, d->covered, d->nodemask);
8352 8351
8353 prev = sg; 8352 prev = sg;
8354 for (j = 0; j < nr_node_ids; j++) { 8353 for (j = 0; j < nr_node_ids; j++) {
8355 n = (num + j) % nr_node_ids; 8354 n = (num + j) % nr_node_ids;
8356 cpumask_complement(d->notcovered, d->covered); 8355 cpumask_complement(d->notcovered, d->covered);
8357 cpumask_and(d->tmpmask, d->notcovered, cpu_map); 8356 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
8358 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); 8357 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
8359 if (cpumask_empty(d->tmpmask)) 8358 if (cpumask_empty(d->tmpmask))
8360 break; 8359 break;
8361 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); 8360 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
8362 if (cpumask_empty(d->tmpmask)) 8361 if (cpumask_empty(d->tmpmask))
8363 continue; 8362 continue;
8364 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 8363 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8365 GFP_KERNEL, num); 8364 GFP_KERNEL, num);
8366 if (!sg) { 8365 if (!sg) {
8367 printk(KERN_WARNING 8366 printk(KERN_WARNING
8368 "Can not alloc domain group for node %d\n", j); 8367 "Can not alloc domain group for node %d\n", j);
8369 return -ENOMEM; 8368 return -ENOMEM;
8370 } 8369 }
8371 sg->cpu_power = 0; 8370 sg->cpu_power = 0;
8372 cpumask_copy(sched_group_cpus(sg), d->tmpmask); 8371 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
8373 sg->next = prev->next; 8372 sg->next = prev->next;
8374 cpumask_or(d->covered, d->covered, d->tmpmask); 8373 cpumask_or(d->covered, d->covered, d->tmpmask);
8375 prev->next = sg; 8374 prev->next = sg;
8376 prev = sg; 8375 prev = sg;
8377 } 8376 }
8378 out: 8377 out:
8379 return 0; 8378 return 0;
8380 } 8379 }
8381 #endif /* CONFIG_NUMA */ 8380 #endif /* CONFIG_NUMA */
8382 8381
8383 #ifdef CONFIG_NUMA 8382 #ifdef CONFIG_NUMA
8384 /* Free memory allocated for various sched_group structures */ 8383 /* Free memory allocated for various sched_group structures */
8385 static void free_sched_groups(const struct cpumask *cpu_map, 8384 static void free_sched_groups(const struct cpumask *cpu_map,
8386 struct cpumask *nodemask) 8385 struct cpumask *nodemask)
8387 { 8386 {
8388 int cpu, i; 8387 int cpu, i;
8389 8388
8390 for_each_cpu(cpu, cpu_map) { 8389 for_each_cpu(cpu, cpu_map) {
8391 struct sched_group **sched_group_nodes 8390 struct sched_group **sched_group_nodes
8392 = sched_group_nodes_bycpu[cpu]; 8391 = sched_group_nodes_bycpu[cpu];
8393 8392
8394 if (!sched_group_nodes) 8393 if (!sched_group_nodes)
8395 continue; 8394 continue;
8396 8395
8397 for (i = 0; i < nr_node_ids; i++) { 8396 for (i = 0; i < nr_node_ids; i++) {
8398 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 8397 struct sched_group *oldsg, *sg = sched_group_nodes[i];
8399 8398
8400 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 8399 cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
8401 if (cpumask_empty(nodemask)) 8400 if (cpumask_empty(nodemask))
8402 continue; 8401 continue;
8403 8402
8404 if (sg == NULL) 8403 if (sg == NULL)
8405 continue; 8404 continue;
8406 sg = sg->next; 8405 sg = sg->next;
8407 next_sg: 8406 next_sg:
8408 oldsg = sg; 8407 oldsg = sg;
8409 sg = sg->next; 8408 sg = sg->next;
8410 kfree(oldsg); 8409 kfree(oldsg);
8411 if (oldsg != sched_group_nodes[i]) 8410 if (oldsg != sched_group_nodes[i])
8412 goto next_sg; 8411 goto next_sg;
8413 } 8412 }
8414 kfree(sched_group_nodes); 8413 kfree(sched_group_nodes);
8415 sched_group_nodes_bycpu[cpu] = NULL; 8414 sched_group_nodes_bycpu[cpu] = NULL;
8416 } 8415 }
8417 } 8416 }
8418 #else /* !CONFIG_NUMA */ 8417 #else /* !CONFIG_NUMA */
8419 static void free_sched_groups(const struct cpumask *cpu_map, 8418 static void free_sched_groups(const struct cpumask *cpu_map,
8420 struct cpumask *nodemask) 8419 struct cpumask *nodemask)
8421 { 8420 {
8422 } 8421 }
8423 #endif /* CONFIG_NUMA */ 8422 #endif /* CONFIG_NUMA */
8424 8423
8425 /* 8424 /*
8426 * Initialize sched groups cpu_power. 8425 * Initialize sched groups cpu_power.
8427 * 8426 *
8428 * cpu_power indicates the capacity of a sched group, which is used while 8427 * cpu_power indicates the capacity of a sched group, which is used while
8429 * distributing the load between different sched groups in a sched domain. 8428 * distributing the load between different sched groups in a sched domain.
8430 * Typically cpu_power for all the groups in a sched domain will be the same unless 8429 * Typically cpu_power for all the groups in a sched domain will be the same unless
8431 * there are asymmetries in the topology. If there are asymmetries, the group 8430 * there are asymmetries in the topology. If there are asymmetries, the group
8432 * having more cpu_power will pick up more load compared to the group having 8431 * having more cpu_power will pick up more load compared to the group having
8433 * less cpu_power. 8432 * less cpu_power.
8434 */ 8433 */
8435 static void init_sched_groups_power(int cpu, struct sched_domain *sd) 8434 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8436 { 8435 {
8437 struct sched_domain *child; 8436 struct sched_domain *child;
8438 struct sched_group *group; 8437 struct sched_group *group;
8439 long power; 8438 long power;
8440 int weight; 8439 int weight;
8441 8440
8442 WARN_ON(!sd || !sd->groups); 8441 WARN_ON(!sd || !sd->groups);
8443 8442
8444 if (cpu != group_first_cpu(sd->groups)) 8443 if (cpu != group_first_cpu(sd->groups))
8445 return; 8444 return;
8446 8445
8447 child = sd->child; 8446 child = sd->child;
8448 8447
8449 sd->groups->cpu_power = 0; 8448 sd->groups->cpu_power = 0;
8450 8449
8451 if (!child) { 8450 if (!child) {
8452 power = SCHED_LOAD_SCALE; 8451 power = SCHED_LOAD_SCALE;
8453 weight = cpumask_weight(sched_domain_span(sd)); 8452 weight = cpumask_weight(sched_domain_span(sd));
8454 /* 8453 /*
8455 * SMT siblings share the power of a single core. 8454 * SMT siblings share the power of a single core.
8456 * Usually multiple threads get a better yield out of 8455 * Usually multiple threads get a better yield out of
8457 * that one core than a single thread would have; 8456 * that one core than a single thread would have;
8458 * reflect that in sd->smt_gain. 8457 * reflect that in sd->smt_gain.
8459 */ 8458 */
8460 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 8459 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
8461 power *= sd->smt_gain; 8460 power *= sd->smt_gain;
8462 power /= weight; 8461 power /= weight;
8463 power >>= SCHED_LOAD_SHIFT; 8462 power >>= SCHED_LOAD_SHIFT;
8464 } 8463 }
8465 sd->groups->cpu_power += power; 8464 sd->groups->cpu_power += power;
8466 return; 8465 return;
8467 } 8466 }
8468 8467
8469 /* 8468 /*
8470 * Add cpu_power of each child group to this group's cpu_power. 8469 * Add cpu_power of each child group to this group's cpu_power.
8471 */ 8470 */
8472 group = child->groups; 8471 group = child->groups;
8473 do { 8472 do {
8474 sd->groups->cpu_power += group->cpu_power; 8473 sd->groups->cpu_power += group->cpu_power;
8475 group = group->next; 8474 group = group->next;
8476 } while (group != child->groups); 8475 } while (group != child->groups);
8477 } 8476 }
8478 8477
8479 /* 8478 /*
8480 * Initializers for sched domains 8479 * Initializers for sched domains
8481 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 8480 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
8482 */ 8481 */
8483 8482
8484 #ifdef CONFIG_SCHED_DEBUG 8483 #ifdef CONFIG_SCHED_DEBUG
8485 # define SD_INIT_NAME(sd, type) sd->name = #type 8484 # define SD_INIT_NAME(sd, type) sd->name = #type
8486 #else 8485 #else
8487 # define SD_INIT_NAME(sd, type) do { } while (0) 8486 # define SD_INIT_NAME(sd, type) do { } while (0)
8488 #endif 8487 #endif
8489 8488
8490 #define SD_INIT(sd, type) sd_init_##type(sd) 8489 #define SD_INIT(sd, type) sd_init_##type(sd)
8491 8490
8492 #define SD_INIT_FUNC(type) \ 8491 #define SD_INIT_FUNC(type) \
8493 static noinline void sd_init_##type(struct sched_domain *sd) \ 8492 static noinline void sd_init_##type(struct sched_domain *sd) \
8494 { \ 8493 { \
8495 memset(sd, 0, sizeof(*sd)); \ 8494 memset(sd, 0, sizeof(*sd)); \
8496 *sd = SD_##type##_INIT; \ 8495 *sd = SD_##type##_INIT; \
8497 sd->level = SD_LV_##type; \ 8496 sd->level = SD_LV_##type; \
8498 SD_INIT_NAME(sd, type); \ 8497 SD_INIT_NAME(sd, type); \
8499 } 8498 }
8500 8499
8501 SD_INIT_FUNC(CPU) 8500 SD_INIT_FUNC(CPU)
8502 #ifdef CONFIG_NUMA 8501 #ifdef CONFIG_NUMA
8503 SD_INIT_FUNC(ALLNODES) 8502 SD_INIT_FUNC(ALLNODES)
8504 SD_INIT_FUNC(NODE) 8503 SD_INIT_FUNC(NODE)
8505 #endif 8504 #endif
8506 #ifdef CONFIG_SCHED_SMT 8505 #ifdef CONFIG_SCHED_SMT
8507 SD_INIT_FUNC(SIBLING) 8506 SD_INIT_FUNC(SIBLING)
8508 #endif 8507 #endif
8509 #ifdef CONFIG_SCHED_MC 8508 #ifdef CONFIG_SCHED_MC
8510 SD_INIT_FUNC(MC) 8509 SD_INIT_FUNC(MC)
8511 #endif 8510 #endif
8512 8511
8513 static int default_relax_domain_level = -1; 8512 static int default_relax_domain_level = -1;
8514 8513
8515 static int __init setup_relax_domain_level(char *str) 8514 static int __init setup_relax_domain_level(char *str)
8516 { 8515 {
8517 unsigned long val; 8516 unsigned long val;
8518 8517
8519 val = simple_strtoul(str, NULL, 0); 8518 val = simple_strtoul(str, NULL, 0);
8520 if (val < SD_LV_MAX) 8519 if (val < SD_LV_MAX)
8521 default_relax_domain_level = val; 8520 default_relax_domain_level = val;
8522 8521
8523 return 1; 8522 return 1;
8524 } 8523 }
8525 __setup("relax_domain_level=", setup_relax_domain_level); 8524 __setup("relax_domain_level=", setup_relax_domain_level);
8526 8525
8527 static void set_domain_attribute(struct sched_domain *sd, 8526 static void set_domain_attribute(struct sched_domain *sd,
8528 struct sched_domain_attr *attr) 8527 struct sched_domain_attr *attr)
8529 { 8528 {
8530 int request; 8529 int request;
8531 8530
8532 if (!attr || attr->relax_domain_level < 0) { 8531 if (!attr || attr->relax_domain_level < 0) {
8533 if (default_relax_domain_level < 0) 8532 if (default_relax_domain_level < 0)
8534 return; 8533 return;
8535 else 8534 else
8536 request = default_relax_domain_level; 8535 request = default_relax_domain_level;
8537 } else 8536 } else
8538 request = attr->relax_domain_level; 8537 request = attr->relax_domain_level;
8539 if (request < sd->level) { 8538 if (request < sd->level) {
8540 /* turn off idle balance on this domain */ 8539 /* turn off idle balance on this domain */
8541 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 8540 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8542 } else { 8541 } else {
8543 /* turn on idle balance on this domain */ 8542 /* turn on idle balance on this domain */
8544 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); 8543 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8545 } 8544 }
8546 } 8545 }
8547 8546
8548 static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 8547 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
8549 const struct cpumask *cpu_map) 8548 const struct cpumask *cpu_map)
8550 { 8549 {
8551 switch (what) { 8550 switch (what) {
8552 case sa_sched_groups: 8551 case sa_sched_groups:
8553 free_sched_groups(cpu_map, d->tmpmask); /* fall through */ 8552 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
8554 d->sched_group_nodes = NULL; 8553 d->sched_group_nodes = NULL;
8555 case sa_rootdomain: 8554 case sa_rootdomain:
8556 free_rootdomain(d->rd); /* fall through */ 8555 free_rootdomain(d->rd); /* fall through */
8557 case sa_tmpmask: 8556 case sa_tmpmask:
8558 free_cpumask_var(d->tmpmask); /* fall through */ 8557 free_cpumask_var(d->tmpmask); /* fall through */
8559 case sa_send_covered: 8558 case sa_send_covered:
8560 free_cpumask_var(d->send_covered); /* fall through */ 8559 free_cpumask_var(d->send_covered); /* fall through */
8561 case sa_this_core_map: 8560 case sa_this_core_map:
8562 free_cpumask_var(d->this_core_map); /* fall through */ 8561 free_cpumask_var(d->this_core_map); /* fall through */
8563 case sa_this_sibling_map: 8562 case sa_this_sibling_map:
8564 free_cpumask_var(d->this_sibling_map); /* fall through */ 8563 free_cpumask_var(d->this_sibling_map); /* fall through */
8565 case sa_nodemask: 8564 case sa_nodemask:
8566 free_cpumask_var(d->nodemask); /* fall through */ 8565 free_cpumask_var(d->nodemask); /* fall through */
8567 case sa_sched_group_nodes: 8566 case sa_sched_group_nodes:
8568 #ifdef CONFIG_NUMA 8567 #ifdef CONFIG_NUMA
8569 kfree(d->sched_group_nodes); /* fall through */ 8568 kfree(d->sched_group_nodes); /* fall through */
8570 case sa_notcovered: 8569 case sa_notcovered:
8571 free_cpumask_var(d->notcovered); /* fall through */ 8570 free_cpumask_var(d->notcovered); /* fall through */
8572 case sa_covered: 8571 case sa_covered:
8573 free_cpumask_var(d->covered); /* fall through */ 8572 free_cpumask_var(d->covered); /* fall through */
8574 case sa_domainspan: 8573 case sa_domainspan:
8575 free_cpumask_var(d->domainspan); /* fall through */ 8574 free_cpumask_var(d->domainspan); /* fall through */
8576 #endif 8575 #endif
8577 case sa_none: 8576 case sa_none:
8578 break; 8577 break;
8579 } 8578 }
8580 } 8579 }
8581 8580
8582 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 8581 static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
8583 const struct cpumask *cpu_map) 8582 const struct cpumask *cpu_map)
8584 { 8583 {
8585 #ifdef CONFIG_NUMA 8584 #ifdef CONFIG_NUMA
8586 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 8585 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
8587 return sa_none; 8586 return sa_none;
8588 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 8587 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
8589 return sa_domainspan; 8588 return sa_domainspan;
8590 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 8589 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
8591 return sa_covered; 8590 return sa_covered;
8592 /* Allocate the per-node list of sched groups */ 8591 /* Allocate the per-node list of sched groups */
8593 d->sched_group_nodes = kcalloc(nr_node_ids, 8592 d->sched_group_nodes = kcalloc(nr_node_ids,
8594 sizeof(struct sched_group *), GFP_KERNEL); 8593 sizeof(struct sched_group *), GFP_KERNEL);
8595 if (!d->sched_group_nodes) { 8594 if (!d->sched_group_nodes) {
8596 printk(KERN_WARNING "Can not alloc sched group node list\n"); 8595 printk(KERN_WARNING "Can not alloc sched group node list\n");
8597 return sa_notcovered; 8596 return sa_notcovered;
8598 } 8597 }
8599 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; 8598 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
8600 #endif 8599 #endif
8601 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) 8600 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
8602 return sa_sched_group_nodes; 8601 return sa_sched_group_nodes;
8603 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) 8602 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
8604 return sa_nodemask; 8603 return sa_nodemask;
8605 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) 8604 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
8606 return sa_this_sibling_map; 8605 return sa_this_sibling_map;
8607 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) 8606 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
8608 return sa_this_core_map; 8607 return sa_this_core_map;
8609 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) 8608 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
8610 return sa_send_covered; 8609 return sa_send_covered;
8611 d->rd = alloc_rootdomain(); 8610 d->rd = alloc_rootdomain();
8612 if (!d->rd) { 8611 if (!d->rd) {
8613 printk(KERN_WARNING "Cannot alloc root domain\n"); 8612 printk(KERN_WARNING "Cannot alloc root domain\n");
8614 return sa_tmpmask; 8613 return sa_tmpmask;
8615 } 8614 }
8616 return sa_rootdomain; 8615 return sa_rootdomain;
8617 } 8616 }
8618 8617
8619 static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 8618 static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
8620 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 8619 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
8621 { 8620 {
8622 struct sched_domain *sd = NULL; 8621 struct sched_domain *sd = NULL;
8623 #ifdef CONFIG_NUMA 8622 #ifdef CONFIG_NUMA
8624 struct sched_domain *parent; 8623 struct sched_domain *parent;
8625 8624
8626 d->sd_allnodes = 0; 8625 d->sd_allnodes = 0;
8627 if (cpumask_weight(cpu_map) > 8626 if (cpumask_weight(cpu_map) >
8628 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { 8627 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
8629 sd = &per_cpu(allnodes_domains, i).sd; 8628 sd = &per_cpu(allnodes_domains, i).sd;
8630 SD_INIT(sd, ALLNODES); 8629 SD_INIT(sd, ALLNODES);
8631 set_domain_attribute(sd, attr); 8630 set_domain_attribute(sd, attr);
8632 cpumask_copy(sched_domain_span(sd), cpu_map); 8631 cpumask_copy(sched_domain_span(sd), cpu_map);
8633 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); 8632 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
8634 d->sd_allnodes = 1; 8633 d->sd_allnodes = 1;
8635 } 8634 }
8636 parent = sd; 8635 parent = sd;
8637 8636
8638 sd = &per_cpu(node_domains, i).sd; 8637 sd = &per_cpu(node_domains, i).sd;
8639 SD_INIT(sd, NODE); 8638 SD_INIT(sd, NODE);
8640 set_domain_attribute(sd, attr); 8639 set_domain_attribute(sd, attr);
8641 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); 8640 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
8642 sd->parent = parent; 8641 sd->parent = parent;
8643 if (parent) 8642 if (parent)
8644 parent->child = sd; 8643 parent->child = sd;
8645 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); 8644 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
8646 #endif 8645 #endif
8647 return sd; 8646 return sd;
8648 } 8647 }
8649 8648
8650 static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 8649 static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
8651 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 8650 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8652 struct sched_domain *parent, int i) 8651 struct sched_domain *parent, int i)
8653 { 8652 {
8654 struct sched_domain *sd; 8653 struct sched_domain *sd;
8655 sd = &per_cpu(phys_domains, i).sd; 8654 sd = &per_cpu(phys_domains, i).sd;
8656 SD_INIT(sd, CPU); 8655 SD_INIT(sd, CPU);
8657 set_domain_attribute(sd, attr); 8656 set_domain_attribute(sd, attr);
8658 cpumask_copy(sched_domain_span(sd), d->nodemask); 8657 cpumask_copy(sched_domain_span(sd), d->nodemask);
8659 sd->parent = parent; 8658 sd->parent = parent;
8660 if (parent) 8659 if (parent)
8661 parent->child = sd; 8660 parent->child = sd;
8662 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); 8661 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
8663 return sd; 8662 return sd;
8664 } 8663 }
8665 8664
8666 static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 8665 static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
8667 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 8666 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8668 struct sched_domain *parent, int i) 8667 struct sched_domain *parent, int i)
8669 { 8668 {
8670 struct sched_domain *sd = parent; 8669 struct sched_domain *sd = parent;
8671 #ifdef CONFIG_SCHED_MC 8670 #ifdef CONFIG_SCHED_MC
8672 sd = &per_cpu(core_domains, i).sd; 8671 sd = &per_cpu(core_domains, i).sd;
8673 SD_INIT(sd, MC); 8672 SD_INIT(sd, MC);
8674 set_domain_attribute(sd, attr); 8673 set_domain_attribute(sd, attr);
8675 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); 8674 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
8676 sd->parent = parent; 8675 sd->parent = parent;
8677 parent->child = sd; 8676 parent->child = sd;
8678 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); 8677 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
8679 #endif 8678 #endif
8680 return sd; 8679 return sd;
8681 } 8680 }
8682 8681
8683 static struct sched_domain *__build_smt_sched_domain(struct s_data *d, 8682 static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
8684 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 8683 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8685 struct sched_domain *parent, int i) 8684 struct sched_domain *parent, int i)
8686 { 8685 {
8687 struct sched_domain *sd = parent; 8686 struct sched_domain *sd = parent;
8688 #ifdef CONFIG_SCHED_SMT 8687 #ifdef CONFIG_SCHED_SMT
8689 sd = &per_cpu(cpu_domains, i).sd; 8688 sd = &per_cpu(cpu_domains, i).sd;
8690 SD_INIT(sd, SIBLING); 8689 SD_INIT(sd, SIBLING);
8691 set_domain_attribute(sd, attr); 8690 set_domain_attribute(sd, attr);
8692 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); 8691 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
8693 sd->parent = parent; 8692 sd->parent = parent;
8694 parent->child = sd; 8693 parent->child = sd;
8695 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); 8694 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
8696 #endif 8695 #endif
8697 return sd; 8696 return sd;
8698 } 8697 }
8699 8698
8700 static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 8699 static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
8701 const struct cpumask *cpu_map, int cpu) 8700 const struct cpumask *cpu_map, int cpu)
8702 { 8701 {
8703 switch (l) { 8702 switch (l) {
8704 #ifdef CONFIG_SCHED_SMT 8703 #ifdef CONFIG_SCHED_SMT
8705 case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 8704 case SD_LV_SIBLING: /* set up CPU (sibling) groups */
8706 cpumask_and(d->this_sibling_map, cpu_map, 8705 cpumask_and(d->this_sibling_map, cpu_map,
8707 topology_thread_cpumask(cpu)); 8706 topology_thread_cpumask(cpu));
8708 if (cpu == cpumask_first(d->this_sibling_map)) 8707 if (cpu == cpumask_first(d->this_sibling_map))
8709 init_sched_build_groups(d->this_sibling_map, cpu_map, 8708 init_sched_build_groups(d->this_sibling_map, cpu_map,
8710 &cpu_to_cpu_group, 8709 &cpu_to_cpu_group,
8711 d->send_covered, d->tmpmask); 8710 d->send_covered, d->tmpmask);
8712 break; 8711 break;
8713 #endif 8712 #endif
8714 #ifdef CONFIG_SCHED_MC 8713 #ifdef CONFIG_SCHED_MC
8715 case SD_LV_MC: /* set up multi-core groups */ 8714 case SD_LV_MC: /* set up multi-core groups */
8716 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); 8715 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
8717 if (cpu == cpumask_first(d->this_core_map)) 8716 if (cpu == cpumask_first(d->this_core_map))
8718 init_sched_build_groups(d->this_core_map, cpu_map, 8717 init_sched_build_groups(d->this_core_map, cpu_map,
8719 &cpu_to_core_group, 8718 &cpu_to_core_group,
8720 d->send_covered, d->tmpmask); 8719 d->send_covered, d->tmpmask);
8721 break; 8720 break;
8722 #endif 8721 #endif
8723 case SD_LV_CPU: /* set up physical groups */ 8722 case SD_LV_CPU: /* set up physical groups */
8724 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); 8723 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
8725 if (!cpumask_empty(d->nodemask)) 8724 if (!cpumask_empty(d->nodemask))
8726 init_sched_build_groups(d->nodemask, cpu_map, 8725 init_sched_build_groups(d->nodemask, cpu_map,
8727 &cpu_to_phys_group, 8726 &cpu_to_phys_group,
8728 d->send_covered, d->tmpmask); 8727 d->send_covered, d->tmpmask);
8729 break; 8728 break;
8730 #ifdef CONFIG_NUMA 8729 #ifdef CONFIG_NUMA
8731 case SD_LV_ALLNODES: 8730 case SD_LV_ALLNODES:
8732 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 8731 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
8733 d->send_covered, d->tmpmask); 8732 d->send_covered, d->tmpmask);
8734 break; 8733 break;
8735 #endif 8734 #endif
8736 default: 8735 default:
8737 break; 8736 break;
8738 } 8737 }
8739 } 8738 }
8740 8739
8741 /* 8740 /*
8742 * Build sched domains for a given set of cpus and attach the sched domains 8741 * Build sched domains for a given set of cpus and attach the sched domains
8743 * to the individual cpus 8742 * to the individual cpus
8744 */ 8743 */
8745 static int __build_sched_domains(const struct cpumask *cpu_map, 8744 static int __build_sched_domains(const struct cpumask *cpu_map,
8746 struct sched_domain_attr *attr) 8745 struct sched_domain_attr *attr)
8747 { 8746 {
8748 enum s_alloc alloc_state = sa_none; 8747 enum s_alloc alloc_state = sa_none;
8749 struct s_data d; 8748 struct s_data d;
8750 struct sched_domain *sd; 8749 struct sched_domain *sd;
8751 int i; 8750 int i;
8752 #ifdef CONFIG_NUMA 8751 #ifdef CONFIG_NUMA
8753 d.sd_allnodes = 0; 8752 d.sd_allnodes = 0;
8754 #endif 8753 #endif
8755 8754
8756 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 8755 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
8757 if (alloc_state != sa_rootdomain) 8756 if (alloc_state != sa_rootdomain)
8758 goto error; 8757 goto error;
8759 alloc_state = sa_sched_groups; 8758 alloc_state = sa_sched_groups;
8760 8759
8761 /* 8760 /*
8762 * Set up domains for cpus specified by the cpu_map. 8761 * Set up domains for cpus specified by the cpu_map.
8763 */ 8762 */
8764 for_each_cpu(i, cpu_map) { 8763 for_each_cpu(i, cpu_map) {
8765 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), 8764 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
8766 cpu_map); 8765 cpu_map);
8767 8766
8768 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 8767 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
8769 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 8768 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
8770 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); 8769 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
8771 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); 8770 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
8772 } 8771 }
8773 8772
8774 for_each_cpu(i, cpu_map) { 8773 for_each_cpu(i, cpu_map) {
8775 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 8774 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
8776 build_sched_groups(&d, SD_LV_MC, cpu_map, i); 8775 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
8777 } 8776 }
8778 8777
8779 /* Set up physical groups */ 8778 /* Set up physical groups */
8780 for (i = 0; i < nr_node_ids; i++) 8779 for (i = 0; i < nr_node_ids; i++)
8781 build_sched_groups(&d, SD_LV_CPU, cpu_map, i); 8780 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
8782 8781
8783 #ifdef CONFIG_NUMA 8782 #ifdef CONFIG_NUMA
8784 /* Set up node groups */ 8783 /* Set up node groups */
8785 if (d.sd_allnodes) 8784 if (d.sd_allnodes)
8786 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); 8785 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
8787 8786
8788 for (i = 0; i < nr_node_ids; i++) 8787 for (i = 0; i < nr_node_ids; i++)
8789 if (build_numa_sched_groups(&d, cpu_map, i)) 8788 if (build_numa_sched_groups(&d, cpu_map, i))
8790 goto error; 8789 goto error;
8791 #endif 8790 #endif
8792 8791
8793 /* Calculate CPU power for physical packages and nodes */ 8792 /* Calculate CPU power for physical packages and nodes */
8794 #ifdef CONFIG_SCHED_SMT 8793 #ifdef CONFIG_SCHED_SMT
8795 for_each_cpu(i, cpu_map) { 8794 for_each_cpu(i, cpu_map) {
8796 sd = &per_cpu(cpu_domains, i).sd; 8795 sd = &per_cpu(cpu_domains, i).sd;
8797 init_sched_groups_power(i, sd); 8796 init_sched_groups_power(i, sd);
8798 } 8797 }
8799 #endif 8798 #endif
8800 #ifdef CONFIG_SCHED_MC 8799 #ifdef CONFIG_SCHED_MC
8801 for_each_cpu(i, cpu_map) { 8800 for_each_cpu(i, cpu_map) {
8802 sd = &per_cpu(core_domains, i).sd; 8801 sd = &per_cpu(core_domains, i).sd;
8803 init_sched_groups_power(i, sd); 8802 init_sched_groups_power(i, sd);
8804 } 8803 }
8805 #endif 8804 #endif
8806 8805
8807 for_each_cpu(i, cpu_map) { 8806 for_each_cpu(i, cpu_map) {
8808 sd = &per_cpu(phys_domains, i).sd; 8807 sd = &per_cpu(phys_domains, i).sd;
8809 init_sched_groups_power(i, sd); 8808 init_sched_groups_power(i, sd);
8810 } 8809 }
8811 8810
8812 #ifdef CONFIG_NUMA 8811 #ifdef CONFIG_NUMA
8813 for (i = 0; i < nr_node_ids; i++) 8812 for (i = 0; i < nr_node_ids; i++)
8814 init_numa_sched_groups_power(d.sched_group_nodes[i]); 8813 init_numa_sched_groups_power(d.sched_group_nodes[i]);
8815 8814
8816 if (d.sd_allnodes) { 8815 if (d.sd_allnodes) {
8817 struct sched_group *sg; 8816 struct sched_group *sg;
8818 8817
8819 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 8818 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
8820 d.tmpmask); 8819 d.tmpmask);
8821 init_numa_sched_groups_power(sg); 8820 init_numa_sched_groups_power(sg);
8822 } 8821 }
8823 #endif 8822 #endif
8824 8823
8825 /* Attach the domains */ 8824 /* Attach the domains */
8826 for_each_cpu(i, cpu_map) { 8825 for_each_cpu(i, cpu_map) {
8827 #ifdef CONFIG_SCHED_SMT 8826 #ifdef CONFIG_SCHED_SMT
8828 sd = &per_cpu(cpu_domains, i).sd; 8827 sd = &per_cpu(cpu_domains, i).sd;
8829 #elif defined(CONFIG_SCHED_MC) 8828 #elif defined(CONFIG_SCHED_MC)
8830 sd = &per_cpu(core_domains, i).sd; 8829 sd = &per_cpu(core_domains, i).sd;
8831 #else 8830 #else
8832 sd = &per_cpu(phys_domains, i).sd; 8831 sd = &per_cpu(phys_domains, i).sd;
8833 #endif 8832 #endif
8834 cpu_attach_domain(sd, d.rd, i); 8833 cpu_attach_domain(sd, d.rd, i);
8835 } 8834 }
8836 8835
8837 d.sched_group_nodes = NULL; /* don't free this we still need it */ 8836 d.sched_group_nodes = NULL; /* don't free this we still need it */
8838 __free_domain_allocs(&d, sa_tmpmask, cpu_map); 8837 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
8839 return 0; 8838 return 0;
8840 8839
8841 error: 8840 error:
8842 __free_domain_allocs(&d, alloc_state, cpu_map); 8841 __free_domain_allocs(&d, alloc_state, cpu_map);
8843 return -ENOMEM; 8842 return -ENOMEM;
8844 } 8843 }
8845 8844
8846 static int build_sched_domains(const struct cpumask *cpu_map) 8845 static int build_sched_domains(const struct cpumask *cpu_map)
8847 { 8846 {
8848 return __build_sched_domains(cpu_map, NULL); 8847 return __build_sched_domains(cpu_map, NULL);
8849 } 8848 }
8850 8849
8851 static struct cpumask *doms_cur; /* current sched domains */ 8850 static struct cpumask *doms_cur; /* current sched domains */
8852 static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 8851 static int ndoms_cur; /* number of sched domains in 'doms_cur' */
8853 static struct sched_domain_attr *dattr_cur; 8852 static struct sched_domain_attr *dattr_cur;
8854 /* attributes of custom domains in 'doms_cur' */ 8853 /* attributes of custom domains in 'doms_cur' */
8855 8854
8856 /* 8855 /*
8857 * Special case: If a kmalloc of a doms_cur partition (array of 8856 * Special case: If a kmalloc of a doms_cur partition (array of
8858 * cpumask) fails, then fallback to a single sched domain, 8857 * cpumask) fails, then fallback to a single sched domain,
8859 * as determined by the single cpumask fallback_doms. 8858 * as determined by the single cpumask fallback_doms.
8860 */ 8859 */
8861 static cpumask_var_t fallback_doms; 8860 static cpumask_var_t fallback_doms;
8862 8861
8863 /* 8862 /*
8864 * arch_update_cpu_topology lets virtualized architectures update the 8863 * arch_update_cpu_topology lets virtualized architectures update the
8865 * cpu core maps. It is supposed to return 1 if the topology changed 8864 * cpu core maps. It is supposed to return 1 if the topology changed
8866 * or 0 if it stayed the same. 8865 * or 0 if it stayed the same.
8867 */ 8866 */
8868 int __attribute__((weak)) arch_update_cpu_topology(void) 8867 int __attribute__((weak)) arch_update_cpu_topology(void)
8869 { 8868 {
8870 return 0; 8869 return 0;
8871 } 8870 }
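The weak default above always reports an unchanged topology. As a minimal sketch of what an architecture override might look like (hypothetical; a real port would query firmware or a hypervisor for the current core maps), the only contract is to return 1 when the maps changed since the last call and 0 otherwise, so that the repartitioning code further down knows whether to rebuild every domain:

/*
 * Hypothetical override of the weak arch_update_cpu_topology() above.
 * topology_generation stands in for whatever mechanism the (imaginary)
 * arch code uses to record that the core maps changed.
 */
static int topology_generation;
static int last_seen_generation;

int arch_update_cpu_topology(void)
{
	if (topology_generation == last_seen_generation)
		return 0;		/* core maps unchanged */

	last_seen_generation = topology_generation;
	return 1;		/* core maps changed: rebuild all domains */
}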
8872 8871
8873 /* 8872 /*
8874 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 8873 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
8875 * For now this just excludes isolated cpus, but could be used to 8874 * For now this just excludes isolated cpus, but could be used to
8876 * exclude other special cases in the future. 8875 * exclude other special cases in the future.
8877 */ 8876 */
8878 static int arch_init_sched_domains(const struct cpumask *cpu_map) 8877 static int arch_init_sched_domains(const struct cpumask *cpu_map)
8879 { 8878 {
8880 int err; 8879 int err;
8881 8880
8882 arch_update_cpu_topology(); 8881 arch_update_cpu_topology();
8883 ndoms_cur = 1; 8882 ndoms_cur = 1;
8884 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); 8883 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
8885 if (!doms_cur) 8884 if (!doms_cur)
8886 doms_cur = fallback_doms; 8885 doms_cur = fallback_doms;
8887 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); 8886 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
8888 dattr_cur = NULL; 8887 dattr_cur = NULL;
8889 err = build_sched_domains(doms_cur); 8888 err = build_sched_domains(doms_cur);
8890 register_sched_domain_sysctl(); 8889 register_sched_domain_sysctl();
8891 8890
8892 return err; 8891 return err;
8893 } 8892 }
8894 8893
8895 static void arch_destroy_sched_domains(const struct cpumask *cpu_map, 8894 static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
8896 struct cpumask *tmpmask) 8895 struct cpumask *tmpmask)
8897 { 8896 {
8898 free_sched_groups(cpu_map, tmpmask); 8897 free_sched_groups(cpu_map, tmpmask);
8899 } 8898 }
8900 8899
8901 /* 8900 /*
8902 * Detach sched domains from a group of cpus specified in cpu_map 8901 * Detach sched domains from a group of cpus specified in cpu_map
8903 * These cpus will now be attached to the NULL domain 8902 * These cpus will now be attached to the NULL domain
8904 */ 8903 */
8905 static void detach_destroy_domains(const struct cpumask *cpu_map) 8904 static void detach_destroy_domains(const struct cpumask *cpu_map)
8906 { 8905 {
8907 /* Static to save stack; safe because the hotplug lock is held. */ 8906 /* Static to save stack; safe because the hotplug lock is held. */
8908 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS); 8907 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
8909 int i; 8908 int i;
8910 8909
8911 for_each_cpu(i, cpu_map) 8910 for_each_cpu(i, cpu_map)
8912 cpu_attach_domain(NULL, &def_root_domain, i); 8911 cpu_attach_domain(NULL, &def_root_domain, i);
8913 synchronize_sched(); 8912 synchronize_sched();
8914 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask)); 8913 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
8915 } 8914 }
8916 8915
8917 /* handle null as "default" */ 8916 /* handle null as "default" */
8918 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, 8917 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8919 struct sched_domain_attr *new, int idx_new) 8918 struct sched_domain_attr *new, int idx_new)
8920 { 8919 {
8921 struct sched_domain_attr tmp; 8920 struct sched_domain_attr tmp;
8922 8921
8923 /* fast path */ 8922 /* fast path */
8924 if (!new && !cur) 8923 if (!new && !cur)
8925 return 1; 8924 return 1;
8926 8925
8927 tmp = SD_ATTR_INIT; 8926 tmp = SD_ATTR_INIT;
8928 return !memcmp(cur ? (cur + idx_cur) : &tmp, 8927 return !memcmp(cur ? (cur + idx_cur) : &tmp,
8929 new ? (new + idx_new) : &tmp, 8928 new ? (new + idx_new) : &tmp,
8930 sizeof(struct sched_domain_attr)); 8929 sizeof(struct sched_domain_attr));
8931 } 8930 }
8932 8931
8933 /* 8932 /*
8934 * Partition sched domains as specified by the 'ndoms_new' 8933 * Partition sched domains as specified by the 'ndoms_new'
8935 * cpumasks in the array doms_new[] of cpumasks. This compares 8934 * cpumasks in the array doms_new[] of cpumasks. This compares
8936 * doms_new[] to the current sched domain partitioning, doms_cur[]. 8935 * doms_new[] to the current sched domain partitioning, doms_cur[].
8937 * It destroys each deleted domain and builds each new domain. 8936 * It destroys each deleted domain and builds each new domain.
8938 * 8937 *
8939 * 'doms_new' is an array of cpumask's of length 'ndoms_new'. 8938 * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
8940 * The masks don't intersect (don't overlap). We should set up one 8939 * The masks don't intersect (don't overlap). We should set up one
8941 * sched domain for each mask. CPUs not in any of the cpumasks will 8940 * sched domain for each mask. CPUs not in any of the cpumasks will
8942 * not be load balanced. If the same cpumask appears both in the 8941 * not be load balanced. If the same cpumask appears both in the
8943 * current 'doms_cur' domains and in the new 'doms_new', we can leave 8942 * current 'doms_cur' domains and in the new 'doms_new', we can leave
8944 * it as it is. 8943 * it as it is.
8945 * 8944 *
8946 * The passed in 'doms_new' should be kmalloc'd. This routine takes 8945 * The passed in 'doms_new' should be kmalloc'd. This routine takes
8947 * ownership of it and will kfree it when done with it. If the caller 8946 * ownership of it and will kfree it when done with it. If the caller
8948 * failed the kmalloc call, then it can pass in doms_new == NULL && 8947 * failed the kmalloc call, then it can pass in doms_new == NULL &&
8949 * ndoms_new == 1, and partition_sched_domains() will fall back to 8948 * ndoms_new == 1, and partition_sched_domains() will fall back to
8950 * the single partition 'fallback_doms'; this also forces the domains 8949 * the single partition 'fallback_doms'; this also forces the domains
8951 * to be rebuilt. 8950 * to be rebuilt.
8952 * 8951 *
8953 * If doms_new == NULL it will be replaced with cpu_online_mask. 8952 * If doms_new == NULL it will be replaced with cpu_online_mask.
8954 * ndoms_new == 0 is a special case for destroying existing domains, 8953 * ndoms_new == 0 is a special case for destroying existing domains,
8955 * and it will not create the default domain. 8954 * and it will not create the default domain.
8956 * 8955 *
8957 * Call with hotplug lock held 8956 * Call with hotplug lock held
8958 */ 8957 */
8959 /* FIXME: Change to struct cpumask *doms_new[] */ 8958 /* FIXME: Change to struct cpumask *doms_new[] */
8960 void partition_sched_domains(int ndoms_new, struct cpumask *doms_new, 8959 void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
8961 struct sched_domain_attr *dattr_new) 8960 struct sched_domain_attr *dattr_new)
8962 { 8961 {
8963 int i, j, n; 8962 int i, j, n;
8964 int new_topology; 8963 int new_topology;
8965 8964
8966 mutex_lock(&sched_domains_mutex); 8965 mutex_lock(&sched_domains_mutex);
8967 8966
8968 /* always unregister in case we don't destroy any domains */ 8967 /* always unregister in case we don't destroy any domains */
8969 unregister_sched_domain_sysctl(); 8968 unregister_sched_domain_sysctl();
8970 8969
8971 /* Let architecture update cpu core mappings. */ 8970 /* Let architecture update cpu core mappings. */
8972 new_topology = arch_update_cpu_topology(); 8971 new_topology = arch_update_cpu_topology();
8973 8972
8974 n = doms_new ? ndoms_new : 0; 8973 n = doms_new ? ndoms_new : 0;
8975 8974
8976 /* Destroy deleted domains */ 8975 /* Destroy deleted domains */
8977 for (i = 0; i < ndoms_cur; i++) { 8976 for (i = 0; i < ndoms_cur; i++) {
8978 for (j = 0; j < n && !new_topology; j++) { 8977 for (j = 0; j < n && !new_topology; j++) {
8979 if (cpumask_equal(&doms_cur[i], &doms_new[j]) 8978 if (cpumask_equal(&doms_cur[i], &doms_new[j])
8980 && dattrs_equal(dattr_cur, i, dattr_new, j)) 8979 && dattrs_equal(dattr_cur, i, dattr_new, j))
8981 goto match1; 8980 goto match1;
8982 } 8981 }
8983 /* no match - a current sched domain not in new doms_new[] */ 8982 /* no match - a current sched domain not in new doms_new[] */
8984 detach_destroy_domains(doms_cur + i); 8983 detach_destroy_domains(doms_cur + i);
8985 match1: 8984 match1:
8986 ; 8985 ;
8987 } 8986 }
8988 8987
8989 if (doms_new == NULL) { 8988 if (doms_new == NULL) {
8990 ndoms_cur = 0; 8989 ndoms_cur = 0;
8991 doms_new = fallback_doms; 8990 doms_new = fallback_doms;
8992 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); 8991 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
8993 WARN_ON_ONCE(dattr_new); 8992 WARN_ON_ONCE(dattr_new);
8994 } 8993 }
8995 8994
8996 /* Build new domains */ 8995 /* Build new domains */
8997 for (i = 0; i < ndoms_new; i++) { 8996 for (i = 0; i < ndoms_new; i++) {
8998 for (j = 0; j < ndoms_cur && !new_topology; j++) { 8997 for (j = 0; j < ndoms_cur && !new_topology; j++) {
8999 if (cpumask_equal(&doms_new[i], &doms_cur[j]) 8998 if (cpumask_equal(&doms_new[i], &doms_cur[j])
9000 && dattrs_equal(dattr_new, i, dattr_cur, j)) 8999 && dattrs_equal(dattr_new, i, dattr_cur, j))
9001 goto match2; 9000 goto match2;
9002 } 9001 }
9003 /* no match - add a new doms_new */ 9002 /* no match - add a new doms_new */
9004 __build_sched_domains(doms_new + i, 9003 __build_sched_domains(doms_new + i,
9005 dattr_new ? dattr_new + i : NULL); 9004 dattr_new ? dattr_new + i : NULL);
9006 match2: 9005 match2:
9007 ; 9006 ;
9008 } 9007 }
9009 9008
9010 /* Remember the new sched domains */ 9009 /* Remember the new sched domains */
9011 if (doms_cur != fallback_doms) 9010 if (doms_cur != fallback_doms)
9012 kfree(doms_cur); 9011 kfree(doms_cur);
9013 kfree(dattr_cur); /* kfree(NULL) is safe */ 9012 kfree(dattr_cur); /* kfree(NULL) is safe */
9014 doms_cur = doms_new; 9013 doms_cur = doms_new;
9015 dattr_cur = dattr_new; 9014 dattr_cur = dattr_new;
9016 ndoms_cur = ndoms_new; 9015 ndoms_cur = ndoms_new;
9017 9016
9018 register_sched_domain_sysctl(); 9017 register_sched_domain_sysctl();
9019 9018
9020 mutex_unlock(&sched_domains_mutex); 9019 mutex_unlock(&sched_domains_mutex);
9021 } 9020 }
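To make the ownership and fallback rules described above concrete, here is a usage sketch; the caller is hypothetical and follows the file's current flat-array convention flagged in the FIXME. The array is kmalloc'd because partition_sched_domains() keeps the pointer as the new doms_cur and kfrees it on the next repartition, while passing NULL with ndoms_new == 1 requests the single fallback partition:

/* Hypothetical caller: split the online CPUs into two balance partitions. */
static void example_repartition(void)
{
	struct cpumask *doms = kmalloc(2 * cpumask_size(), GFP_KERNEL);

	if (!doms) {
		/* allocation failed: fall back to one default partition */
		partition_sched_domains(1, NULL, NULL);
		return;
	}

	cpumask_clear(&doms[0]);
	cpumask_set_cpu(0, &doms[0]);			/* CPU 0 balanced alone */
	cpumask_andnot(&doms[1], cpu_online_mask, &doms[0]);

	get_online_cpus();				/* hotplug lock must be held */
	partition_sched_domains(2, doms, NULL);		/* takes ownership of doms */
	put_online_cpus();
}

Note that the caller must not kfree doms afterwards; the next repartition (or a NULL/fallback request) releases it.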
9022 9021
9023 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 9022 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
9024 static void arch_reinit_sched_domains(void) 9023 static void arch_reinit_sched_domains(void)
9025 { 9024 {
9026 get_online_cpus(); 9025 get_online_cpus();
9027 9026
9028 /* Destroy domains first to force the rebuild */ 9027 /* Destroy domains first to force the rebuild */
9029 partition_sched_domains(0, NULL, NULL); 9028 partition_sched_domains(0, NULL, NULL);
9030 9029
9031 rebuild_sched_domains(); 9030 rebuild_sched_domains();
9032 put_online_cpus(); 9031 put_online_cpus();
9033 } 9032 }
9034 9033
9035 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 9034 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
9036 { 9035 {
9037 unsigned int level = 0; 9036 unsigned int level = 0;
9038 9037
9039 if (sscanf(buf, "%u", &level) != 1) 9038 if (sscanf(buf, "%u", &level) != 1)
9040 return -EINVAL; 9039 return -EINVAL;
9041 9040
9042 /* 9041 /*
9043 * level is always positive, so don't check for 9042 * level is always positive, so don't check for
9044 * level < POWERSAVINGS_BALANCE_NONE which is 0 9043 * level < POWERSAVINGS_BALANCE_NONE which is 0
9045 * What happens on a 0 or 1 byte write? Do we 9044 * What happens on a 0 or 1 byte write? Do we
9046 * need to check count as well? 9045 * need to check count as well?
9047 */ 9046 */
9048 9047
9049 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) 9048 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
9050 return -EINVAL; 9049 return -EINVAL;
9051 9050
9052 if (smt) 9051 if (smt)
9053 sched_smt_power_savings = level; 9052 sched_smt_power_savings = level;
9054 else 9053 else
9055 sched_mc_power_savings = level; 9054 sched_mc_power_savings = level;
9056 9055
9057 arch_reinit_sched_domains(); 9056 arch_reinit_sched_domains();
9058 9057
9059 return count; 9058 return count;
9060 } 9059 }
9061 9060
9062 #ifdef CONFIG_SCHED_MC 9061 #ifdef CONFIG_SCHED_MC
9063 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 9062 static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
9064 char *page) 9063 char *page)
9065 { 9064 {
9066 return sprintf(page, "%u\n", sched_mc_power_savings); 9065 return sprintf(page, "%u\n", sched_mc_power_savings);
9067 } 9066 }
9068 static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 9067 static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
9069 const char *buf, size_t count) 9068 const char *buf, size_t count)
9070 { 9069 {
9071 return sched_power_savings_store(buf, count, 0); 9070 return sched_power_savings_store(buf, count, 0);
9072 } 9071 }
9073 static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, 9072 static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
9074 sched_mc_power_savings_show, 9073 sched_mc_power_savings_show,
9075 sched_mc_power_savings_store); 9074 sched_mc_power_savings_store);
9076 #endif 9075 #endif
9077 9076
9078 #ifdef CONFIG_SCHED_SMT 9077 #ifdef CONFIG_SCHED_SMT
9079 static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 9078 static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
9080 char *page) 9079 char *page)
9081 { 9080 {
9082 return sprintf(page, "%u\n", sched_smt_power_savings); 9081 return sprintf(page, "%u\n", sched_smt_power_savings);
9083 } 9082 }
9084 static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 9083 static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
9085 const char *buf, size_t count) 9084 const char *buf, size_t count)
9086 { 9085 {
9087 return sched_power_savings_store(buf, count, 1); 9086 return sched_power_savings_store(buf, count, 1);
9088 } 9087 }
9089 static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, 9088 static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
9090 sched_smt_power_savings_show, 9089 sched_smt_power_savings_show,
9091 sched_smt_power_savings_store); 9090 sched_smt_power_savings_store);
9092 #endif 9091 #endif
9093 9092
9094 int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 9093 int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
9095 { 9094 {
9096 int err = 0; 9095 int err = 0;
9097 9096
9098 #ifdef CONFIG_SCHED_SMT 9097 #ifdef CONFIG_SCHED_SMT
9099 if (smt_capable()) 9098 if (smt_capable())
9100 err = sysfs_create_file(&cls->kset.kobj, 9099 err = sysfs_create_file(&cls->kset.kobj,
9101 &attr_sched_smt_power_savings.attr); 9100 &attr_sched_smt_power_savings.attr);
9102 #endif 9101 #endif
9103 #ifdef CONFIG_SCHED_MC 9102 #ifdef CONFIG_SCHED_MC
9104 if (!err && mc_capable()) 9103 if (!err && mc_capable())
9105 err = sysfs_create_file(&cls->kset.kobj, 9104 err = sysfs_create_file(&cls->kset.kobj,
9106 &attr_sched_mc_power_savings.attr); 9105 &attr_sched_mc_power_savings.attr);
9107 #endif 9106 #endif
9108 return err; 9107 return err;
9109 } 9108 }
9110 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 9109 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
9111 9110
9112 #ifndef CONFIG_CPUSETS 9111 #ifndef CONFIG_CPUSETS
9113 /* 9112 /*
9114 * Add online and remove offline CPUs from the scheduler domains. 9113 * Add online and remove offline CPUs from the scheduler domains.
9115 * When cpusets are enabled they take over this function. 9114 * When cpusets are enabled they take over this function.
9116 */ 9115 */
9117 static int update_sched_domains(struct notifier_block *nfb, 9116 static int update_sched_domains(struct notifier_block *nfb,
9118 unsigned long action, void *hcpu) 9117 unsigned long action, void *hcpu)
9119 { 9118 {
9120 switch (action) { 9119 switch (action) {
9121 case CPU_ONLINE: 9120 case CPU_ONLINE:
9122 case CPU_ONLINE_FROZEN: 9121 case CPU_ONLINE_FROZEN:
9123 case CPU_DEAD: 9122 case CPU_DEAD:
9124 case CPU_DEAD_FROZEN: 9123 case CPU_DEAD_FROZEN:
9125 partition_sched_domains(1, NULL, NULL); 9124 partition_sched_domains(1, NULL, NULL);
9126 return NOTIFY_OK; 9125 return NOTIFY_OK;
9127 9126
9128 default: 9127 default:
9129 return NOTIFY_DONE; 9128 return NOTIFY_DONE;
9130 } 9129 }
9131 } 9130 }
9132 #endif 9131 #endif
9133 9132
9134 static int update_runtime(struct notifier_block *nfb, 9133 static int update_runtime(struct notifier_block *nfb,
9135 unsigned long action, void *hcpu) 9134 unsigned long action, void *hcpu)
9136 { 9135 {
9137 int cpu = (int)(long)hcpu; 9136 int cpu = (int)(long)hcpu;
9138 9137
9139 switch (action) { 9138 switch (action) {
9140 case CPU_DOWN_PREPARE: 9139 case CPU_DOWN_PREPARE:
9141 case CPU_DOWN_PREPARE_FROZEN: 9140 case CPU_DOWN_PREPARE_FROZEN:
9142 disable_runtime(cpu_rq(cpu)); 9141 disable_runtime(cpu_rq(cpu));
9143 return NOTIFY_OK; 9142 return NOTIFY_OK;
9144 9143
9145 case CPU_DOWN_FAILED: 9144 case CPU_DOWN_FAILED:
9146 case CPU_DOWN_FAILED_FROZEN: 9145 case CPU_DOWN_FAILED_FROZEN:
9147 case CPU_ONLINE: 9146 case CPU_ONLINE:
9148 case CPU_ONLINE_FROZEN: 9147 case CPU_ONLINE_FROZEN:
9149 enable_runtime(cpu_rq(cpu)); 9148 enable_runtime(cpu_rq(cpu));
9150 return NOTIFY_OK; 9149 return NOTIFY_OK;
9151 9150
9152 default: 9151 default:
9153 return NOTIFY_DONE; 9152 return NOTIFY_DONE;
9154 } 9153 }
9155 } 9154 }
9156 9155
9157 void __init sched_init_smp(void) 9156 void __init sched_init_smp(void)
9158 { 9157 {
9159 cpumask_var_t non_isolated_cpus; 9158 cpumask_var_t non_isolated_cpus;
9160 9159
9161 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 9160 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
9162 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 9161 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
9163 9162
9164 #if defined(CONFIG_NUMA) 9163 #if defined(CONFIG_NUMA)
9165 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 9164 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
9166 GFP_KERNEL); 9165 GFP_KERNEL);
9167 BUG_ON(sched_group_nodes_bycpu == NULL); 9166 BUG_ON(sched_group_nodes_bycpu == NULL);
9168 #endif 9167 #endif
9169 get_online_cpus(); 9168 get_online_cpus();
9170 mutex_lock(&sched_domains_mutex); 9169 mutex_lock(&sched_domains_mutex);
9171 arch_init_sched_domains(cpu_online_mask); 9170 arch_init_sched_domains(cpu_online_mask);
9172 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 9171 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
9173 if (cpumask_empty(non_isolated_cpus)) 9172 if (cpumask_empty(non_isolated_cpus))
9174 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 9173 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
9175 mutex_unlock(&sched_domains_mutex); 9174 mutex_unlock(&sched_domains_mutex);
9176 put_online_cpus(); 9175 put_online_cpus();
9177 9176
9178 #ifndef CONFIG_CPUSETS 9177 #ifndef CONFIG_CPUSETS
9179 /* XXX: Theoretical race here - CPU may be hotplugged now */ 9178 /* XXX: Theoretical race here - CPU may be hotplugged now */
9180 hotcpu_notifier(update_sched_domains, 0); 9179 hotcpu_notifier(update_sched_domains, 0);
9181 #endif 9180 #endif
9182 9181
9183 /* RT runtime code needs to handle some hotplug events */ 9182 /* RT runtime code needs to handle some hotplug events */
9184 hotcpu_notifier(update_runtime, 0); 9183 hotcpu_notifier(update_runtime, 0);
9185 9184
9186 init_hrtick(); 9185 init_hrtick();
9187 9186
9188 /* Move init over to a non-isolated CPU */ 9187 /* Move init over to a non-isolated CPU */
9189 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) 9188 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
9190 BUG(); 9189 BUG();
9191 sched_init_granularity(); 9190 sched_init_granularity();
9192 free_cpumask_var(non_isolated_cpus); 9191 free_cpumask_var(non_isolated_cpus);
9193 9192
9194 init_sched_rt_class(); 9193 init_sched_rt_class();
9195 } 9194 }
9196 #else 9195 #else
9197 void __init sched_init_smp(void) 9196 void __init sched_init_smp(void)
9198 { 9197 {
9199 sched_init_granularity(); 9198 sched_init_granularity();
9200 } 9199 }
9201 #endif /* CONFIG_SMP */ 9200 #endif /* CONFIG_SMP */
9202 9201
9203 const_debug unsigned int sysctl_timer_migration = 1; 9202 const_debug unsigned int sysctl_timer_migration = 1;
9204 9203
9205 int in_sched_functions(unsigned long addr) 9204 int in_sched_functions(unsigned long addr)
9206 { 9205 {
9207 return in_lock_functions(addr) || 9206 return in_lock_functions(addr) ||
9208 (addr >= (unsigned long)__sched_text_start 9207 (addr >= (unsigned long)__sched_text_start
9209 && addr < (unsigned long)__sched_text_end); 9208 && addr < (unsigned long)__sched_text_end);
9210 } 9209 }
9211 9210
9212 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 9211 static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
9213 { 9212 {
9214 cfs_rq->tasks_timeline = RB_ROOT; 9213 cfs_rq->tasks_timeline = RB_ROOT;
9215 INIT_LIST_HEAD(&cfs_rq->tasks); 9214 INIT_LIST_HEAD(&cfs_rq->tasks);
9216 #ifdef CONFIG_FAIR_GROUP_SCHED 9215 #ifdef CONFIG_FAIR_GROUP_SCHED
9217 cfs_rq->rq = rq; 9216 cfs_rq->rq = rq;
9218 #endif 9217 #endif
9219 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 9218 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
9220 } 9219 }
9221 9220
9222 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) 9221 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
9223 { 9222 {
9224 struct rt_prio_array *array; 9223 struct rt_prio_array *array;
9225 int i; 9224 int i;
9226 9225
9227 array = &rt_rq->active; 9226 array = &rt_rq->active;
9228 for (i = 0; i < MAX_RT_PRIO; i++) { 9227 for (i = 0; i < MAX_RT_PRIO; i++) {
9229 INIT_LIST_HEAD(array->queue + i); 9228 INIT_LIST_HEAD(array->queue + i);
9230 __clear_bit(i, array->bitmap); 9229 __clear_bit(i, array->bitmap);
9231 } 9230 }
9232 /* delimiter for bitsearch: */ 9231 /* delimiter for bitsearch: */
9233 __set_bit(MAX_RT_PRIO, array->bitmap); 9232 __set_bit(MAX_RT_PRIO, array->bitmap);
9234 9233
9235 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 9234 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
9236 rt_rq->highest_prio.curr = MAX_RT_PRIO; 9235 rt_rq->highest_prio.curr = MAX_RT_PRIO;
9237 #ifdef CONFIG_SMP 9236 #ifdef CONFIG_SMP
9238 rt_rq->highest_prio.next = MAX_RT_PRIO; 9237 rt_rq->highest_prio.next = MAX_RT_PRIO;
9239 #endif 9238 #endif
9240 #endif 9239 #endif
9241 #ifdef CONFIG_SMP 9240 #ifdef CONFIG_SMP
9242 rt_rq->rt_nr_migratory = 0; 9241 rt_rq->rt_nr_migratory = 0;
9243 rt_rq->overloaded = 0; 9242 rt_rq->overloaded = 0;
9244 plist_head_init(&rt_rq->pushable_tasks, &rq->lock); 9243 plist_head_init(&rt_rq->pushable_tasks, &rq->lock);
9245 #endif 9244 #endif
9246 9245
9247 rt_rq->rt_time = 0; 9246 rt_rq->rt_time = 0;
9248 rt_rq->rt_throttled = 0; 9247 rt_rq->rt_throttled = 0;
9249 rt_rq->rt_runtime = 0; 9248 rt_rq->rt_runtime = 0;
9250 spin_lock_init(&rt_rq->rt_runtime_lock); 9249 spin_lock_init(&rt_rq->rt_runtime_lock);
9251 9250
9252 #ifdef CONFIG_RT_GROUP_SCHED 9251 #ifdef CONFIG_RT_GROUP_SCHED
9253 rt_rq->rt_nr_boosted = 0; 9252 rt_rq->rt_nr_boosted = 0;
9254 rt_rq->rq = rq; 9253 rt_rq->rq = rq;
9255 #endif 9254 #endif
9256 } 9255 }
9257 9256
9258 #ifdef CONFIG_FAIR_GROUP_SCHED 9257 #ifdef CONFIG_FAIR_GROUP_SCHED
9259 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 9258 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
9260 struct sched_entity *se, int cpu, int add, 9259 struct sched_entity *se, int cpu, int add,
9261 struct sched_entity *parent) 9260 struct sched_entity *parent)
9262 { 9261 {
9263 struct rq *rq = cpu_rq(cpu); 9262 struct rq *rq = cpu_rq(cpu);
9264 tg->cfs_rq[cpu] = cfs_rq; 9263 tg->cfs_rq[cpu] = cfs_rq;
9265 init_cfs_rq(cfs_rq, rq); 9264 init_cfs_rq(cfs_rq, rq);
9266 cfs_rq->tg = tg; 9265 cfs_rq->tg = tg;
9267 if (add) 9266 if (add)
9268 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 9267 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
9269 9268
9270 tg->se[cpu] = se; 9269 tg->se[cpu] = se;
9271 /* se could be NULL for init_task_group */ 9270 /* se could be NULL for init_task_group */
9272 if (!se) 9271 if (!se)
9273 return; 9272 return;
9274 9273
9275 if (!parent) 9274 if (!parent)
9276 se->cfs_rq = &rq->cfs; 9275 se->cfs_rq = &rq->cfs;
9277 else 9276 else
9278 se->cfs_rq = parent->my_q; 9277 se->cfs_rq = parent->my_q;
9279 9278
9280 se->my_q = cfs_rq; 9279 se->my_q = cfs_rq;
9281 se->load.weight = tg->shares; 9280 se->load.weight = tg->shares;
9282 se->load.inv_weight = 0; 9281 se->load.inv_weight = 0;
9283 se->parent = parent; 9282 se->parent = parent;
9284 } 9283 }
9285 #endif 9284 #endif
9286 9285
9287 #ifdef CONFIG_RT_GROUP_SCHED 9286 #ifdef CONFIG_RT_GROUP_SCHED
9288 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 9287 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
9289 struct sched_rt_entity *rt_se, int cpu, int add, 9288 struct sched_rt_entity *rt_se, int cpu, int add,
9290 struct sched_rt_entity *parent) 9289 struct sched_rt_entity *parent)
9291 { 9290 {
9292 struct rq *rq = cpu_rq(cpu); 9291 struct rq *rq = cpu_rq(cpu);
9293 9292
9294 tg->rt_rq[cpu] = rt_rq; 9293 tg->rt_rq[cpu] = rt_rq;
9295 init_rt_rq(rt_rq, rq); 9294 init_rt_rq(rt_rq, rq);
9296 rt_rq->tg = tg; 9295 rt_rq->tg = tg;
9297 rt_rq->rt_se = rt_se; 9296 rt_rq->rt_se = rt_se;
9298 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 9297 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
9299 if (add) 9298 if (add)
9300 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 9299 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
9301 9300
9302 tg->rt_se[cpu] = rt_se; 9301 tg->rt_se[cpu] = rt_se;
9303 if (!rt_se) 9302 if (!rt_se)
9304 return; 9303 return;
9305 9304
9306 if (!parent) 9305 if (!parent)
9307 rt_se->rt_rq = &rq->rt; 9306 rt_se->rt_rq = &rq->rt;
9308 else 9307 else
9309 rt_se->rt_rq = parent->my_q; 9308 rt_se->rt_rq = parent->my_q;
9310 9309
9311 rt_se->my_q = rt_rq; 9310 rt_se->my_q = rt_rq;
9312 rt_se->parent = parent; 9311 rt_se->parent = parent;
9313 INIT_LIST_HEAD(&rt_se->run_list); 9312 INIT_LIST_HEAD(&rt_se->run_list);
9314 } 9313 }
9315 #endif 9314 #endif
9316 9315
9317 void __init sched_init(void) 9316 void __init sched_init(void)
9318 { 9317 {
9319 int i, j; 9318 int i, j;
9320 unsigned long alloc_size = 0, ptr; 9319 unsigned long alloc_size = 0, ptr;
9321 9320
9322 #ifdef CONFIG_FAIR_GROUP_SCHED 9321 #ifdef CONFIG_FAIR_GROUP_SCHED
9323 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 9322 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
9324 #endif 9323 #endif
9325 #ifdef CONFIG_RT_GROUP_SCHED 9324 #ifdef CONFIG_RT_GROUP_SCHED
9326 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 9325 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
9327 #endif 9326 #endif
9328 #ifdef CONFIG_USER_SCHED 9327 #ifdef CONFIG_USER_SCHED
9329 alloc_size *= 2; 9328 alloc_size *= 2;
9330 #endif 9329 #endif
9331 #ifdef CONFIG_CPUMASK_OFFSTACK 9330 #ifdef CONFIG_CPUMASK_OFFSTACK
9332 alloc_size += num_possible_cpus() * cpumask_size(); 9331 alloc_size += num_possible_cpus() * cpumask_size();
9333 #endif 9332 #endif
9334 /* 9333 /*
9335 * As sched_init() is called before page_alloc is fully set up, 9334 * As sched_init() is called before page_alloc is fully set up,
9336 * we allocate with kzalloc(GFP_NOWAIT). 9335 * we allocate with kzalloc(GFP_NOWAIT).
9337 */ 9336 */
9338 if (alloc_size) { 9337 if (alloc_size) {
9339 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 9338 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
9340 9339
9341 #ifdef CONFIG_FAIR_GROUP_SCHED 9340 #ifdef CONFIG_FAIR_GROUP_SCHED
9342 init_task_group.se = (struct sched_entity **)ptr; 9341 init_task_group.se = (struct sched_entity **)ptr;
9343 ptr += nr_cpu_ids * sizeof(void **); 9342 ptr += nr_cpu_ids * sizeof(void **);
9344 9343
9345 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 9344 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
9346 ptr += nr_cpu_ids * sizeof(void **); 9345 ptr += nr_cpu_ids * sizeof(void **);
9347 9346
9348 #ifdef CONFIG_USER_SCHED 9347 #ifdef CONFIG_USER_SCHED
9349 root_task_group.se = (struct sched_entity **)ptr; 9348 root_task_group.se = (struct sched_entity **)ptr;
9350 ptr += nr_cpu_ids * sizeof(void **); 9349 ptr += nr_cpu_ids * sizeof(void **);
9351 9350
9352 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 9351 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9353 ptr += nr_cpu_ids * sizeof(void **); 9352 ptr += nr_cpu_ids * sizeof(void **);
9354 #endif /* CONFIG_USER_SCHED */ 9353 #endif /* CONFIG_USER_SCHED */
9355 #endif /* CONFIG_FAIR_GROUP_SCHED */ 9354 #endif /* CONFIG_FAIR_GROUP_SCHED */
9356 #ifdef CONFIG_RT_GROUP_SCHED 9355 #ifdef CONFIG_RT_GROUP_SCHED
9357 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 9356 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
9358 ptr += nr_cpu_ids * sizeof(void **); 9357 ptr += nr_cpu_ids * sizeof(void **);
9359 9358
9360 init_task_group.rt_rq = (struct rt_rq **)ptr; 9359 init_task_group.rt_rq = (struct rt_rq **)ptr;
9361 ptr += nr_cpu_ids * sizeof(void **); 9360 ptr += nr_cpu_ids * sizeof(void **);
9362 9361
9363 #ifdef CONFIG_USER_SCHED 9362 #ifdef CONFIG_USER_SCHED
9364 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 9363 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9365 ptr += nr_cpu_ids * sizeof(void **); 9364 ptr += nr_cpu_ids * sizeof(void **);
9366 9365
9367 root_task_group.rt_rq = (struct rt_rq **)ptr; 9366 root_task_group.rt_rq = (struct rt_rq **)ptr;
9368 ptr += nr_cpu_ids * sizeof(void **); 9367 ptr += nr_cpu_ids * sizeof(void **);
9369 #endif /* CONFIG_USER_SCHED */ 9368 #endif /* CONFIG_USER_SCHED */
9370 #endif /* CONFIG_RT_GROUP_SCHED */ 9369 #endif /* CONFIG_RT_GROUP_SCHED */
9371 #ifdef CONFIG_CPUMASK_OFFSTACK 9370 #ifdef CONFIG_CPUMASK_OFFSTACK
9372 for_each_possible_cpu(i) { 9371 for_each_possible_cpu(i) {
9373 per_cpu(load_balance_tmpmask, i) = (void *)ptr; 9372 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
9374 ptr += cpumask_size(); 9373 ptr += cpumask_size();
9375 } 9374 }
9376 #endif /* CONFIG_CPUMASK_OFFSTACK */ 9375 #endif /* CONFIG_CPUMASK_OFFSTACK */
9377 } 9376 }
9378 9377
9379 #ifdef CONFIG_SMP 9378 #ifdef CONFIG_SMP
9380 init_defrootdomain(); 9379 init_defrootdomain();
9381 #endif 9380 #endif
9382 9381
9383 init_rt_bandwidth(&def_rt_bandwidth, 9382 init_rt_bandwidth(&def_rt_bandwidth,
9384 global_rt_period(), global_rt_runtime()); 9383 global_rt_period(), global_rt_runtime());
9385 9384
9386 #ifdef CONFIG_RT_GROUP_SCHED 9385 #ifdef CONFIG_RT_GROUP_SCHED
9387 init_rt_bandwidth(&init_task_group.rt_bandwidth, 9386 init_rt_bandwidth(&init_task_group.rt_bandwidth,
9388 global_rt_period(), global_rt_runtime()); 9387 global_rt_period(), global_rt_runtime());
9389 #ifdef CONFIG_USER_SCHED 9388 #ifdef CONFIG_USER_SCHED
9390 init_rt_bandwidth(&root_task_group.rt_bandwidth, 9389 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9391 global_rt_period(), RUNTIME_INF); 9390 global_rt_period(), RUNTIME_INF);
9392 #endif /* CONFIG_USER_SCHED */ 9391 #endif /* CONFIG_USER_SCHED */
9393 #endif /* CONFIG_RT_GROUP_SCHED */ 9392 #endif /* CONFIG_RT_GROUP_SCHED */
9394 9393
9395 #ifdef CONFIG_GROUP_SCHED 9394 #ifdef CONFIG_GROUP_SCHED
9396 list_add(&init_task_group.list, &task_groups); 9395 list_add(&init_task_group.list, &task_groups);
9397 INIT_LIST_HEAD(&init_task_group.children); 9396 INIT_LIST_HEAD(&init_task_group.children);
9398 9397
9399 #ifdef CONFIG_USER_SCHED 9398 #ifdef CONFIG_USER_SCHED
9400 INIT_LIST_HEAD(&root_task_group.children); 9399 INIT_LIST_HEAD(&root_task_group.children);
9401 init_task_group.parent = &root_task_group; 9400 init_task_group.parent = &root_task_group;
9402 list_add(&init_task_group.siblings, &root_task_group.children); 9401 list_add(&init_task_group.siblings, &root_task_group.children);
9403 #endif /* CONFIG_USER_SCHED */ 9402 #endif /* CONFIG_USER_SCHED */
9404 #endif /* CONFIG_GROUP_SCHED */ 9403 #endif /* CONFIG_GROUP_SCHED */
9405 9404
9406 #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 9405 #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9407 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 9406 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
9408 __alignof__(unsigned long)); 9407 __alignof__(unsigned long));
9409 #endif 9408 #endif
9410 for_each_possible_cpu(i) { 9409 for_each_possible_cpu(i) {
9411 struct rq *rq; 9410 struct rq *rq;
9412 9411
9413 rq = cpu_rq(i); 9412 rq = cpu_rq(i);
9414 spin_lock_init(&rq->lock); 9413 spin_lock_init(&rq->lock);
9415 rq->nr_running = 0; 9414 rq->nr_running = 0;
9416 rq->calc_load_active = 0; 9415 rq->calc_load_active = 0;
9417 rq->calc_load_update = jiffies + LOAD_FREQ; 9416 rq->calc_load_update = jiffies + LOAD_FREQ;
9418 init_cfs_rq(&rq->cfs, rq); 9417 init_cfs_rq(&rq->cfs, rq);
9419 init_rt_rq(&rq->rt, rq); 9418 init_rt_rq(&rq->rt, rq);
9420 #ifdef CONFIG_FAIR_GROUP_SCHED 9419 #ifdef CONFIG_FAIR_GROUP_SCHED
9421 init_task_group.shares = init_task_group_load; 9420 init_task_group.shares = init_task_group_load;
9422 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 9421 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
9423 #ifdef CONFIG_CGROUP_SCHED 9422 #ifdef CONFIG_CGROUP_SCHED
9424 /* 9423 /*
9425 * How much cpu bandwidth does init_task_group get? 9424 * How much cpu bandwidth does init_task_group get?
9426 * 9425 *
9427 * In the case of task-groups formed through the cgroup filesystem, it 9426 * In the case of task-groups formed through the cgroup filesystem, it
9428 * gets 100% of the cpu resources in the system. This overall 9427 * gets 100% of the cpu resources in the system. This overall
9429 * system cpu resource is divided among the tasks of 9428 * system cpu resource is divided among the tasks of
9430 * init_task_group and its child task-groups in a fair manner, 9429 * init_task_group and its child task-groups in a fair manner,
9431 * based on each entity's (task or task-group's) weight 9430 * based on each entity's (task or task-group's) weight
9432 * (se->load.weight). 9431 * (se->load.weight).
9433 * 9432 *
9434 * In other words, if init_task_group has 10 tasks (of weight 9433 * In other words, if init_task_group has 10 tasks (of weight
9435 * 1024) and two child groups A0 and A1 (of weight 1024 each), 9434 * 1024) and two child groups A0 and A1 (of weight 1024 each),
9436 * then A0's share of the cpu resource is: 9435 * then A0's share of the cpu resource is:
9437 * 9436 *
9438 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 9437 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
9439 * 9438 *
9440 * We achieve this by letting init_task_group's tasks sit 9439 * We achieve this by letting init_task_group's tasks sit
9441 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 9440 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
9442 */ 9441 */
9443 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 9442 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
9444 #elif defined CONFIG_USER_SCHED 9443 #elif defined CONFIG_USER_SCHED
9445 root_task_group.shares = NICE_0_LOAD; 9444 root_task_group.shares = NICE_0_LOAD;
9446 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL); 9445 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
9447 /* 9446 /*
9448 * In the case of task-groups formed through the user id of tasks, 9447 * In the case of task-groups formed through the user id of tasks,
9449 * init_task_group represents tasks belonging to root user. 9448 * init_task_group represents tasks belonging to root user.
9450 * Hence it forms a sibling of all subsequent groups formed. 9449 * Hence it forms a sibling of all subsequent groups formed.
9451 * In this case, init_task_group gets only a fraction of overall 9450 * In this case, init_task_group gets only a fraction of overall
9452 * system cpu resource, based on the weight assigned to root 9451 * system cpu resource, based on the weight assigned to root
9453 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished 9452 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9454 * by letting tasks of init_task_group sit in a separate cfs_rq 9453 * by letting tasks of init_task_group sit in a separate cfs_rq
9455 * (init_tg_cfs_rq) and having one entity represent this group of 9454 * (init_tg_cfs_rq) and having one entity represent this group of
9456 * tasks in rq->cfs (i.e init_task_group->se[] != NULL). 9455 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9457 */ 9456 */
9458 init_tg_cfs_entry(&init_task_group, 9457 init_tg_cfs_entry(&init_task_group,
9459 &per_cpu(init_tg_cfs_rq, i), 9458 &per_cpu(init_tg_cfs_rq, i),
9460 &per_cpu(init_sched_entity, i), i, 1, 9459 &per_cpu(init_sched_entity, i), i, 1,
9461 root_task_group.se[i]); 9460 root_task_group.se[i]);
9462 9461
9463 #endif 9462 #endif
9464 #endif /* CONFIG_FAIR_GROUP_SCHED */ 9463 #endif /* CONFIG_FAIR_GROUP_SCHED */
9465 9464
9466 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 9465 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
9467 #ifdef CONFIG_RT_GROUP_SCHED 9466 #ifdef CONFIG_RT_GROUP_SCHED
9468 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 9467 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
9469 #ifdef CONFIG_CGROUP_SCHED 9468 #ifdef CONFIG_CGROUP_SCHED
9470 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 9469 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
9471 #elif defined CONFIG_USER_SCHED 9470 #elif defined CONFIG_USER_SCHED
9472 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); 9471 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9473 init_tg_rt_entry(&init_task_group, 9472 init_tg_rt_entry(&init_task_group,
9474 &per_cpu(init_rt_rq, i), 9473 &per_cpu(init_rt_rq, i),
9475 &per_cpu(init_sched_rt_entity, i), i, 1, 9474 &per_cpu(init_sched_rt_entity, i), i, 1,
9476 root_task_group.rt_se[i]); 9475 root_task_group.rt_se[i]);
9477 #endif 9476 #endif
9478 #endif 9477 #endif
9479 9478
9480 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 9479 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
9481 rq->cpu_load[j] = 0; 9480 rq->cpu_load[j] = 0;
9482 #ifdef CONFIG_SMP 9481 #ifdef CONFIG_SMP
9483 rq->sd = NULL; 9482 rq->sd = NULL;
9484 rq->rd = NULL; 9483 rq->rd = NULL;
9485 rq->post_schedule = 0; 9484 rq->post_schedule = 0;
9486 rq->active_balance = 0; 9485 rq->active_balance = 0;
9487 rq->next_balance = jiffies; 9486 rq->next_balance = jiffies;
9488 rq->push_cpu = 0; 9487 rq->push_cpu = 0;
9489 rq->cpu = i; 9488 rq->cpu = i;
9490 rq->online = 0; 9489 rq->online = 0;
9491 rq->migration_thread = NULL; 9490 rq->migration_thread = NULL;
9492 INIT_LIST_HEAD(&rq->migration_queue); 9491 INIT_LIST_HEAD(&rq->migration_queue);
9493 rq_attach_root(rq, &def_root_domain); 9492 rq_attach_root(rq, &def_root_domain);
9494 #endif 9493 #endif
9495 init_rq_hrtick(rq); 9494 init_rq_hrtick(rq);
9496 atomic_set(&rq->nr_iowait, 0); 9495 atomic_set(&rq->nr_iowait, 0);
9497 } 9496 }
9498 9497
9499 set_load_weight(&init_task); 9498 set_load_weight(&init_task);
9500 9499
9501 #ifdef CONFIG_PREEMPT_NOTIFIERS 9500 #ifdef CONFIG_PREEMPT_NOTIFIERS
9502 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 9501 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
9503 #endif 9502 #endif
9504 9503
9505 #ifdef CONFIG_SMP 9504 #ifdef CONFIG_SMP
9506 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 9505 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
9507 #endif 9506 #endif
9508 9507
9509 #ifdef CONFIG_RT_MUTEXES 9508 #ifdef CONFIG_RT_MUTEXES
9510 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); 9509 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
9511 #endif 9510 #endif
9512 9511
9513 /* 9512 /*
9514 * The boot idle thread does lazy MMU switching as well: 9513 * The boot idle thread does lazy MMU switching as well:
9515 */ 9514 */
9516 atomic_inc(&init_mm.mm_count); 9515 atomic_inc(&init_mm.mm_count);
9517 enter_lazy_tlb(&init_mm, current); 9516 enter_lazy_tlb(&init_mm, current);
9518 9517
9519 /* 9518 /*
9520 * Make us the idle thread. Technically, schedule() should not be 9519 * Make us the idle thread. Technically, schedule() should not be
9521 * called from this thread, however somewhere below it might be, 9520 * called from this thread, however somewhere below it might be,
9522 * but because we are the idle thread, we just pick up running again 9521 * but because we are the idle thread, we just pick up running again
9523 * when this runqueue becomes "idle". 9522 * when this runqueue becomes "idle".
9524 */ 9523 */
9525 init_idle(current, smp_processor_id()); 9524 init_idle(current, smp_processor_id());
9526 9525
9527 calc_load_update = jiffies + LOAD_FREQ; 9526 calc_load_update = jiffies + LOAD_FREQ;
9528 9527
9529 /* 9528 /*
9530 * During early bootup we pretend to be a normal task: 9529 * During early bootup we pretend to be a normal task:
9531 */ 9530 */
9532 current->sched_class = &fair_sched_class; 9531 current->sched_class = &fair_sched_class;
9533 9532
9534 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 9533 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
9535 alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 9534 alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
9536 #ifdef CONFIG_SMP 9535 #ifdef CONFIG_SMP
9537 #ifdef CONFIG_NO_HZ 9536 #ifdef CONFIG_NO_HZ
9538 alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 9537 alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
9539 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 9538 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
9540 #endif 9539 #endif
9541 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 9540 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
9542 #endif /* SMP */ 9541 #endif /* SMP */
9543 9542
9544 perf_event_init(); 9543 perf_event_init();
9545 9544
9546 scheduler_running = 1; 9545 scheduler_running = 1;
9547 } 9546 }
9548 9547
9549 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 9548 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
9550 static inline int preempt_count_equals(int preempt_offset) 9549 static inline int preempt_count_equals(int preempt_offset)
9551 { 9550 {
9552 int nested = preempt_count() & ~PREEMPT_ACTIVE; 9551 int nested = preempt_count() & ~PREEMPT_ACTIVE;
9553 9552
9554 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 9553 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9555 } 9554 }
9556 9555
9557 void __might_sleep(char *file, int line, int preempt_offset) 9556 void __might_sleep(char *file, int line, int preempt_offset)
9558 { 9557 {
9559 #ifdef in_atomic 9558 #ifdef in_atomic
9560 static unsigned long prev_jiffy; /* ratelimiting */ 9559 static unsigned long prev_jiffy; /* ratelimiting */
9561 9560
9562 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 9561 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
9563 system_state != SYSTEM_RUNNING || oops_in_progress) 9562 system_state != SYSTEM_RUNNING || oops_in_progress)
9564 return; 9563 return;
9565 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 9564 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
9566 return; 9565 return;
9567 prev_jiffy = jiffies; 9566 prev_jiffy = jiffies;
9568 9567
9569 printk(KERN_ERR 9568 printk(KERN_ERR
9570 "BUG: sleeping function called from invalid context at %s:%d\n", 9569 "BUG: sleeping function called from invalid context at %s:%d\n",
9571 file, line); 9570 file, line);
9572 printk(KERN_ERR 9571 printk(KERN_ERR
9573 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", 9572 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
9574 in_atomic(), irqs_disabled(), 9573 in_atomic(), irqs_disabled(),
9575 current->pid, current->comm); 9574 current->pid, current->comm);
9576 9575
9577 debug_show_held_locks(current); 9576 debug_show_held_locks(current);
9578 if (irqs_disabled()) 9577 if (irqs_disabled())
9579 print_irqtrace_events(current); 9578 print_irqtrace_events(current);
9580 dump_stack(); 9579 dump_stack();
9581 #endif 9580 #endif
9582 } 9581 }
9583 EXPORT_SYMBOL(__might_sleep); 9582 EXPORT_SYMBOL(__might_sleep);
9584 #endif 9583 #endif
9585 9584
9586 #ifdef CONFIG_MAGIC_SYSRQ 9585 #ifdef CONFIG_MAGIC_SYSRQ
9587 static void normalize_task(struct rq *rq, struct task_struct *p) 9586 static void normalize_task(struct rq *rq, struct task_struct *p)
9588 { 9587 {
9589 int on_rq; 9588 int on_rq;
9590 9589
9591 update_rq_clock(rq); 9590 update_rq_clock(rq);
9592 on_rq = p->se.on_rq; 9591 on_rq = p->se.on_rq;
9593 if (on_rq) 9592 if (on_rq)
9594 deactivate_task(rq, p, 0); 9593 deactivate_task(rq, p, 0);
9595 __setscheduler(rq, p, SCHED_NORMAL, 0); 9594 __setscheduler(rq, p, SCHED_NORMAL, 0);
9596 if (on_rq) { 9595 if (on_rq) {
9597 activate_task(rq, p, 0); 9596 activate_task(rq, p, 0);
9598 resched_task(rq->curr); 9597 resched_task(rq->curr);
9599 } 9598 }
9600 } 9599 }
9601 9600
9602 void normalize_rt_tasks(void) 9601 void normalize_rt_tasks(void)
9603 { 9602 {
9604 struct task_struct *g, *p; 9603 struct task_struct *g, *p;
9605 unsigned long flags; 9604 unsigned long flags;
9606 struct rq *rq; 9605 struct rq *rq;
9607 9606
9608 read_lock_irqsave(&tasklist_lock, flags); 9607 read_lock_irqsave(&tasklist_lock, flags);
9609 do_each_thread(g, p) { 9608 do_each_thread(g, p) {
9610 /* 9609 /*
9611 * Only normalize user tasks: 9610 * Only normalize user tasks:
9612 */ 9611 */
9613 if (!p->mm) 9612 if (!p->mm)
9614 continue; 9613 continue;
9615 9614
9616 p->se.exec_start = 0; 9615 p->se.exec_start = 0;
9617 #ifdef CONFIG_SCHEDSTATS 9616 #ifdef CONFIG_SCHEDSTATS
9618 p->se.wait_start = 0; 9617 p->se.wait_start = 0;
9619 p->se.sleep_start = 0; 9618 p->se.sleep_start = 0;
9620 p->se.block_start = 0; 9619 p->se.block_start = 0;
9621 #endif 9620 #endif
9622 9621
9623 if (!rt_task(p)) { 9622 if (!rt_task(p)) {
9624 /* 9623 /*
9625 * Renice negative nice level userspace 9624 * Renice negative nice level userspace
9626 * tasks back to 0: 9625 * tasks back to 0:
9627 */ 9626 */
9628 if (TASK_NICE(p) < 0 && p->mm) 9627 if (TASK_NICE(p) < 0 && p->mm)
9629 set_user_nice(p, 0); 9628 set_user_nice(p, 0);
9630 continue; 9629 continue;
9631 } 9630 }
9632 9631
9633 spin_lock(&p->pi_lock); 9632 spin_lock(&p->pi_lock);
9634 rq = __task_rq_lock(p); 9633 rq = __task_rq_lock(p);
9635 9634
9636 normalize_task(rq, p); 9635 normalize_task(rq, p);
9637 9636
9638 __task_rq_unlock(rq); 9637 __task_rq_unlock(rq);
9639 spin_unlock(&p->pi_lock); 9638 spin_unlock(&p->pi_lock);
9640 } while_each_thread(g, p); 9639 } while_each_thread(g, p);
9641 9640
9642 read_unlock_irqrestore(&tasklist_lock, flags); 9641 read_unlock_irqrestore(&tasklist_lock, flags);
9643 } 9642 }
9644 9643
9645 #endif /* CONFIG_MAGIC_SYSRQ */ 9644 #endif /* CONFIG_MAGIC_SYSRQ */
9646 9645
9647 #ifdef CONFIG_IA64 9646 #ifdef CONFIG_IA64
9648 /* 9647 /*
9649 * These functions are only useful for the IA64 MCA handling. 9648 * These functions are only useful for the IA64 MCA handling.
9650 * 9649 *
9651 * They can only be called when the whole system has been 9650 * They can only be called when the whole system has been
9652 * stopped - every CPU needs to be quiescent, and no scheduling 9651 * stopped - every CPU needs to be quiescent, and no scheduling
9653 * activity can take place. Using them for anything else would 9652 * activity can take place. Using them for anything else would
9654 * be a serious bug, and as a result, they aren't even visible 9653 * be a serious bug, and as a result, they aren't even visible
9655 * under any other configuration. 9654 * under any other configuration.
9656 */ 9655 */
9657 9656
9658 /** 9657 /**
9659 * curr_task - return the current task for a given cpu. 9658 * curr_task - return the current task for a given cpu.
9660 * @cpu: the processor in question. 9659 * @cpu: the processor in question.
9661 * 9660 *
9662 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 9661 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
9663 */ 9662 */
9664 struct task_struct *curr_task(int cpu) 9663 struct task_struct *curr_task(int cpu)
9665 { 9664 {
9666 return cpu_curr(cpu); 9665 return cpu_curr(cpu);
9667 } 9666 }
9668 9667
9669 /** 9668 /**
9670 * set_curr_task - set the current task for a given cpu. 9669 * set_curr_task - set the current task for a given cpu.
9671 * @cpu: the processor in question. 9670 * @cpu: the processor in question.
9672 * @p: the task pointer to set. 9671 * @p: the task pointer to set.
9673 * 9672 *
9674 * Description: This function must only be used when non-maskable interrupts 9673 * Description: This function must only be used when non-maskable interrupts
9675 * are serviced on a separate stack. It allows the architecture to switch the 9674 * are serviced on a separate stack. It allows the architecture to switch the
9676 * notion of the current task on a cpu in a non-blocking manner. This function 9675 * notion of the current task on a cpu in a non-blocking manner. This function
9677 * must be called with all CPUs synchronized and interrupts disabled, and the 9676 * must be called with all CPUs synchronized and interrupts disabled, and the
9678 * caller must save the original value of the current task (see 9677 * caller must save the original value of the current task (see
9679 * curr_task() above) and restore that value before reenabling interrupts and 9678 * curr_task() above) and restore that value before reenabling interrupts and
9680 * re-starting the system. 9679 * re-starting the system.
9681 * 9680 *
9682 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 9681 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
9683 */ 9682 */
9684 void set_curr_task(int cpu, struct task_struct *p) 9683 void set_curr_task(int cpu, struct task_struct *p)
9685 { 9684 {
9686 cpu_curr(cpu) = p; 9685 cpu_curr(cpu) = p;
9687 } 9686 }
9688 9687
9689 #endif 9688 #endif
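The save/restore protocol spelled out above can be summarized in a short sketch (a hypothetical MCA-style caller; as the comments stress, this is only legal while every CPU is quiescent and interrupts are disabled):

/* Hypothetical stopped-system caller following the documented protocol. */
static void run_handler_as_current(int cpu, struct task_struct *handler)
{
	struct task_struct *saved = curr_task(cpu);

	set_curr_task(cpu, handler);
	/* ... service the non-maskable event on its separate stack ... */
	set_curr_task(cpu, saved);	/* restore before re-enabling interrupts */
}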
9690 9689
9691 #ifdef CONFIG_FAIR_GROUP_SCHED 9690 #ifdef CONFIG_FAIR_GROUP_SCHED
9692 static void free_fair_sched_group(struct task_group *tg) 9691 static void free_fair_sched_group(struct task_group *tg)
9693 { 9692 {
9694 int i; 9693 int i;
9695 9694
9696 for_each_possible_cpu(i) { 9695 for_each_possible_cpu(i) {
9697 if (tg->cfs_rq) 9696 if (tg->cfs_rq)
9698 kfree(tg->cfs_rq[i]); 9697 kfree(tg->cfs_rq[i]);
9699 if (tg->se) 9698 if (tg->se)
9700 kfree(tg->se[i]); 9699 kfree(tg->se[i]);
9701 } 9700 }
9702 9701
9703 kfree(tg->cfs_rq); 9702 kfree(tg->cfs_rq);
9704 kfree(tg->se); 9703 kfree(tg->se);
9705 } 9704 }
9706 9705
9707 static 9706 static
9708 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 9707 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9709 { 9708 {
9710 struct cfs_rq *cfs_rq; 9709 struct cfs_rq *cfs_rq;
9711 struct sched_entity *se; 9710 struct sched_entity *se;
9712 struct rq *rq; 9711 struct rq *rq;
9713 int i; 9712 int i;
9714 9713
9715 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 9714 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
9716 if (!tg->cfs_rq) 9715 if (!tg->cfs_rq)
9717 goto err; 9716 goto err;
9718 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); 9717 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
9719 if (!tg->se) 9718 if (!tg->se)
9720 goto err; 9719 goto err;
9721 9720
9722 tg->shares = NICE_0_LOAD; 9721 tg->shares = NICE_0_LOAD;
9723 9722
9724 for_each_possible_cpu(i) { 9723 for_each_possible_cpu(i) {
9725 rq = cpu_rq(i); 9724 rq = cpu_rq(i);
9726 9725
9727 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 9726 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
9728 GFP_KERNEL, cpu_to_node(i)); 9727 GFP_KERNEL, cpu_to_node(i));
9729 if (!cfs_rq) 9728 if (!cfs_rq)
9730 goto err; 9729 goto err;
9731 9730
9732 se = kzalloc_node(sizeof(struct sched_entity), 9731 se = kzalloc_node(sizeof(struct sched_entity),
9733 GFP_KERNEL, cpu_to_node(i)); 9732 GFP_KERNEL, cpu_to_node(i));
9734 if (!se) 9733 if (!se)
9735 goto err; 9734 goto err;
9736 9735
9737 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 9736 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
9738 } 9737 }
9739 9738
9740 return 1; 9739 return 1;
9741 9740
9742 err: 9741 err:
9743 return 0; 9742 return 0;
9744 } 9743 }
9745 9744
9746 static inline void register_fair_sched_group(struct task_group *tg, int cpu) 9745 static inline void register_fair_sched_group(struct task_group *tg, int cpu)
9747 { 9746 {
9748 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, 9747 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
9749 &cpu_rq(cpu)->leaf_cfs_rq_list); 9748 &cpu_rq(cpu)->leaf_cfs_rq_list);
9750 } 9749 }
9751 9750
9752 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 9751 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
9753 { 9752 {
9754 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 9753 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
9755 } 9754 }
9756 #else /* !CONFIG_FAIR_GROUP_SCHED */ 9755 #else /* !CONFIG_FAIR_GROUP_SCHED */
9757 static inline void free_fair_sched_group(struct task_group *tg) 9756 static inline void free_fair_sched_group(struct task_group *tg)
9758 { 9757 {
9759 } 9758 }
9760 9759
9761 static inline 9760 static inline
9762 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) 9761 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9763 { 9762 {
9764 return 1; 9763 return 1;
9765 } 9764 }
9766 9765
9767 static inline void register_fair_sched_group(struct task_group *tg, int cpu) 9766 static inline void register_fair_sched_group(struct task_group *tg, int cpu)
9768 { 9767 {
9769 } 9768 }
9770 9769
9771 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 9770 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
9772 { 9771 {
9773 } 9772 }
9774 #endif /* CONFIG_FAIR_GROUP_SCHED */ 9773 #endif /* CONFIG_FAIR_GROUP_SCHED */
9775 9774
9776 #ifdef CONFIG_RT_GROUP_SCHED 9775 #ifdef CONFIG_RT_GROUP_SCHED
9777 static void free_rt_sched_group(struct task_group *tg) 9776 static void free_rt_sched_group(struct task_group *tg)
9778 { 9777 {
9779 int i; 9778 int i;
9780 9779
9781 destroy_rt_bandwidth(&tg->rt_bandwidth); 9780 destroy_rt_bandwidth(&tg->rt_bandwidth);
9782 9781
9783 for_each_possible_cpu(i) { 9782 for_each_possible_cpu(i) {
9784 if (tg->rt_rq) 9783 if (tg->rt_rq)
9785 kfree(tg->rt_rq[i]); 9784 kfree(tg->rt_rq[i]);
9786 if (tg->rt_se) 9785 if (tg->rt_se)
9787 kfree(tg->rt_se[i]); 9786 kfree(tg->rt_se[i]);
9788 } 9787 }
9789 9788
9790 kfree(tg->rt_rq); 9789 kfree(tg->rt_rq);
9791 kfree(tg->rt_se); 9790 kfree(tg->rt_se);
9792 } 9791 }
9793 9792
9794 static 9793 static
9795 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 9794 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
9796 { 9795 {
9797 struct rt_rq *rt_rq; 9796 struct rt_rq *rt_rq;
9798 struct sched_rt_entity *rt_se; 9797 struct sched_rt_entity *rt_se;
9799 struct rq *rq; 9798 struct rq *rq;
9800 int i; 9799 int i;
9801 9800
9802 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 9801 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
9803 if (!tg->rt_rq) 9802 if (!tg->rt_rq)
9804 goto err; 9803 goto err;
9805 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); 9804 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
9806 if (!tg->rt_se) 9805 if (!tg->rt_se)
9807 goto err; 9806 goto err;
9808 9807
9809 init_rt_bandwidth(&tg->rt_bandwidth, 9808 init_rt_bandwidth(&tg->rt_bandwidth,
9810 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 9809 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
9811 9810
9812 for_each_possible_cpu(i) { 9811 for_each_possible_cpu(i) {
9813 rq = cpu_rq(i); 9812 rq = cpu_rq(i);
9814 9813
9815 rt_rq = kzalloc_node(sizeof(struct rt_rq), 9814 rt_rq = kzalloc_node(sizeof(struct rt_rq),
9816 GFP_KERNEL, cpu_to_node(i)); 9815 GFP_KERNEL, cpu_to_node(i));
9817 if (!rt_rq) 9816 if (!rt_rq)
9818 goto err; 9817 goto err;
9819 9818
9820 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 9819 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
9821 GFP_KERNEL, cpu_to_node(i)); 9820 GFP_KERNEL, cpu_to_node(i));
9822 if (!rt_se) 9821 if (!rt_se)
9823 goto err; 9822 goto err;
9824 9823
9825 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 9824 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
9826 } 9825 }
9827 9826
9828 return 1; 9827 return 1;
9829 9828
9830 err: 9829 err:
9831 return 0; 9830 return 0;
9832 } 9831 }
9833 9832
9834 static inline void register_rt_sched_group(struct task_group *tg, int cpu) 9833 static inline void register_rt_sched_group(struct task_group *tg, int cpu)
9835 { 9834 {
9836 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, 9835 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
9837 &cpu_rq(cpu)->leaf_rt_rq_list); 9836 &cpu_rq(cpu)->leaf_rt_rq_list);
9838 } 9837 }
9839 9838
9840 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 9839 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
9841 { 9840 {
9842 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); 9841 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
9843 } 9842 }
9844 #else /* !CONFIG_RT_GROUP_SCHED */ 9843 #else /* !CONFIG_RT_GROUP_SCHED */
9845 static inline void free_rt_sched_group(struct task_group *tg) 9844 static inline void free_rt_sched_group(struct task_group *tg)
9846 { 9845 {
9847 } 9846 }
9848 9847
9849 static inline 9848 static inline
9850 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) 9849 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
9851 { 9850 {
9852 return 1; 9851 return 1;
9853 } 9852 }
9854 9853
9855 static inline void register_rt_sched_group(struct task_group *tg, int cpu) 9854 static inline void register_rt_sched_group(struct task_group *tg, int cpu)
9856 { 9855 {
9857 } 9856 }
9858 9857
9859 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 9858 static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
9860 { 9859 {
9861 } 9860 }
9862 #endif /* CONFIG_RT_GROUP_SCHED */ 9861 #endif /* CONFIG_RT_GROUP_SCHED */
9863 9862
9864 #ifdef CONFIG_GROUP_SCHED 9863 #ifdef CONFIG_GROUP_SCHED
9865 static void free_sched_group(struct task_group *tg) 9864 static void free_sched_group(struct task_group *tg)
9866 { 9865 {
9867 free_fair_sched_group(tg); 9866 free_fair_sched_group(tg);
9868 free_rt_sched_group(tg); 9867 free_rt_sched_group(tg);
9869 kfree(tg); 9868 kfree(tg);
9870 } 9869 }
9871 9870
9872 /* allocate runqueue etc for a new task group */ 9871 /* allocate runqueue etc for a new task group */
9873 struct task_group *sched_create_group(struct task_group *parent) 9872 struct task_group *sched_create_group(struct task_group *parent)
9874 { 9873 {
9875 struct task_group *tg; 9874 struct task_group *tg;
9876 unsigned long flags; 9875 unsigned long flags;
9877 int i; 9876 int i;
9878 9877
9879 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 9878 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
9880 if (!tg) 9879 if (!tg)
9881 return ERR_PTR(-ENOMEM); 9880 return ERR_PTR(-ENOMEM);
9882 9881
9883 if (!alloc_fair_sched_group(tg, parent)) 9882 if (!alloc_fair_sched_group(tg, parent))
9884 goto err; 9883 goto err;
9885 9884
9886 if (!alloc_rt_sched_group(tg, parent)) 9885 if (!alloc_rt_sched_group(tg, parent))
9887 goto err; 9886 goto err;
9888 9887
9889 spin_lock_irqsave(&task_group_lock, flags); 9888 spin_lock_irqsave(&task_group_lock, flags);
9890 for_each_possible_cpu(i) { 9889 for_each_possible_cpu(i) {
9891 register_fair_sched_group(tg, i); 9890 register_fair_sched_group(tg, i);
9892 register_rt_sched_group(tg, i); 9891 register_rt_sched_group(tg, i);
9893 } 9892 }
9894 list_add_rcu(&tg->list, &task_groups); 9893 list_add_rcu(&tg->list, &task_groups);
9895 9894
9896 WARN_ON(!parent); /* root should already exist */ 9895 WARN_ON(!parent); /* root should already exist */
9897 9896
9898 tg->parent = parent; 9897 tg->parent = parent;
9899 INIT_LIST_HEAD(&tg->children); 9898 INIT_LIST_HEAD(&tg->children);
9900 list_add_rcu(&tg->siblings, &parent->children); 9899 list_add_rcu(&tg->siblings, &parent->children);
9901 spin_unlock_irqrestore(&task_group_lock, flags); 9900 spin_unlock_irqrestore(&task_group_lock, flags);
9902 9901
9903 return tg; 9902 return tg;
9904 9903
9905 err: 9904 err:
9906 free_sched_group(tg); 9905 free_sched_group(tg);
9907 return ERR_PTR(-ENOMEM); 9906 return ERR_PTR(-ENOMEM);
9908 } 9907 }
9909 9908
9910 /* rcu callback to free various structures associated with a task group */ 9909 /* rcu callback to free various structures associated with a task group */
9911 static void free_sched_group_rcu(struct rcu_head *rhp) 9910 static void free_sched_group_rcu(struct rcu_head *rhp)
9912 { 9911 {
9913 /* now it should be safe to free those cfs_rqs */ 9912 /* now it should be safe to free those cfs_rqs */
9914 free_sched_group(container_of(rhp, struct task_group, rcu)); 9913 free_sched_group(container_of(rhp, struct task_group, rcu));
9915 } 9914 }
9916 9915
9917 /* Destroy runqueue etc associated with a task group */ 9916 /* Destroy runqueue etc associated with a task group */
9918 void sched_destroy_group(struct task_group *tg) 9917 void sched_destroy_group(struct task_group *tg)
9919 { 9918 {
9920 unsigned long flags; 9919 unsigned long flags;
9921 int i; 9920 int i;
9922 9921
9923 spin_lock_irqsave(&task_group_lock, flags); 9922 spin_lock_irqsave(&task_group_lock, flags);
9924 for_each_possible_cpu(i) { 9923 for_each_possible_cpu(i) {
9925 unregister_fair_sched_group(tg, i); 9924 unregister_fair_sched_group(tg, i);
9926 unregister_rt_sched_group(tg, i); 9925 unregister_rt_sched_group(tg, i);
9927 } 9926 }
9928 list_del_rcu(&tg->list); 9927 list_del_rcu(&tg->list);
9929 list_del_rcu(&tg->siblings); 9928 list_del_rcu(&tg->siblings);
9930 spin_unlock_irqrestore(&task_group_lock, flags); 9929 spin_unlock_irqrestore(&task_group_lock, flags);
9931 9930
9932 /* wait for possible concurrent references to cfs_rqs complete */ 9931 /* wait for possible concurrent references to cfs_rqs complete */
9933 call_rcu(&tg->rcu, free_sched_group_rcu); 9932 call_rcu(&tg->rcu, free_sched_group_rcu);
9934 } 9933 }
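For illustration, not part of this diff: sched_destroy_group() unlinks the group from the RCU-protected lists under task_group_lock and then defers the actual kfree() to an RCU callback, so lock-free walkers of task_groups never touch freed memory. A minimal sketch of the same deferred-free pattern, with made-up foo/foo_list/foo_lock names standing in for the scheduler structures:

        #include <linux/kernel.h>
        #include <linux/rculist.h>
        #include <linux/rcupdate.h>
        #include <linux/slab.h>
        #include <linux/spinlock.h>

        struct foo {
                struct list_head        list;
                struct rcu_head         rcu;
        };

        static LIST_HEAD(foo_list);
        static DEFINE_SPINLOCK(foo_lock);

        static void foo_add(struct foo *f)
        {
                spin_lock(&foo_lock);
                list_add_rcu(&f->list, &foo_list);
                spin_unlock(&foo_lock);
        }

        /* RCU callback: runs only after every reader that could have seen
         * the unlinked object has left its read-side critical section. */
        static void foo_free_rcu(struct rcu_head *rhp)
        {
                kfree(container_of(rhp, struct foo, rcu));
        }

        static void foo_destroy(struct foo *f)
        {
                spin_lock(&foo_lock);
                list_del_rcu(&f->list);          /* readers may still hold f ... */
                spin_unlock(&foo_lock);
                call_rcu(&f->rcu, foo_free_rcu); /* ... so free only after a grace period */
        }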
9935 9934
9936 /* change task's runqueue when it moves between groups. 9935 /* change task's runqueue when it moves between groups.
9937 * The caller of this function should have put the task in its new group 9936 * The caller of this function should have put the task in its new group
9938 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to 9937 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
9939 * reflect its new group. 9938 * reflect its new group.
9940 */ 9939 */
9941 void sched_move_task(struct task_struct *tsk) 9940 void sched_move_task(struct task_struct *tsk)
9942 { 9941 {
9943 int on_rq, running; 9942 int on_rq, running;
9944 unsigned long flags; 9943 unsigned long flags;
9945 struct rq *rq; 9944 struct rq *rq;
9946 9945
9947 rq = task_rq_lock(tsk, &flags); 9946 rq = task_rq_lock(tsk, &flags);
9948 9947
9949 update_rq_clock(rq); 9948 update_rq_clock(rq);
9950 9949
9951 running = task_current(rq, tsk); 9950 running = task_current(rq, tsk);
9952 on_rq = tsk->se.on_rq; 9951 on_rq = tsk->se.on_rq;
9953 9952
9954 if (on_rq) 9953 if (on_rq)
9955 dequeue_task(rq, tsk, 0); 9954 dequeue_task(rq, tsk, 0);
9956 if (unlikely(running)) 9955 if (unlikely(running))
9957 tsk->sched_class->put_prev_task(rq, tsk); 9956 tsk->sched_class->put_prev_task(rq, tsk);
9958 9957
9959 set_task_rq(tsk, task_cpu(tsk)); 9958 set_task_rq(tsk, task_cpu(tsk));
9960 9959
9961 #ifdef CONFIG_FAIR_GROUP_SCHED 9960 #ifdef CONFIG_FAIR_GROUP_SCHED
9962 if (tsk->sched_class->moved_group) 9961 if (tsk->sched_class->moved_group)
9963 tsk->sched_class->moved_group(tsk); 9962 tsk->sched_class->moved_group(tsk);
9964 #endif 9963 #endif
9965 9964
9966 if (unlikely(running)) 9965 if (unlikely(running))
9967 tsk->sched_class->set_curr_task(rq); 9966 tsk->sched_class->set_curr_task(rq);
9968 if (on_rq) 9967 if (on_rq)
9969 enqueue_task(rq, tsk, 0); 9968 enqueue_task(rq, tsk, 0);
9970 9969
9971 task_rq_unlock(rq, &flags); 9970 task_rq_unlock(rq, &flags);
9972 } 9971 }
9973 #endif /* CONFIG_GROUP_SCHED */ 9972 #endif /* CONFIG_GROUP_SCHED */
9974 9973
9975 #ifdef CONFIG_FAIR_GROUP_SCHED 9974 #ifdef CONFIG_FAIR_GROUP_SCHED
9976 static void __set_se_shares(struct sched_entity *se, unsigned long shares) 9975 static void __set_se_shares(struct sched_entity *se, unsigned long shares)
9977 { 9976 {
9978 struct cfs_rq *cfs_rq = se->cfs_rq; 9977 struct cfs_rq *cfs_rq = se->cfs_rq;
9979 int on_rq; 9978 int on_rq;
9980 9979
9981 on_rq = se->on_rq; 9980 on_rq = se->on_rq;
9982 if (on_rq) 9981 if (on_rq)
9983 dequeue_entity(cfs_rq, se, 0); 9982 dequeue_entity(cfs_rq, se, 0);
9984 9983
9985 se->load.weight = shares; 9984 se->load.weight = shares;
9986 se->load.inv_weight = 0; 9985 se->load.inv_weight = 0;
9987 9986
9988 if (on_rq) 9987 if (on_rq)
9989 enqueue_entity(cfs_rq, se, 0); 9988 enqueue_entity(cfs_rq, se, 0);
9990 } 9989 }
9991 9990
9992 static void set_se_shares(struct sched_entity *se, unsigned long shares) 9991 static void set_se_shares(struct sched_entity *se, unsigned long shares)
9993 { 9992 {
9994 struct cfs_rq *cfs_rq = se->cfs_rq; 9993 struct cfs_rq *cfs_rq = se->cfs_rq;
9995 struct rq *rq = cfs_rq->rq; 9994 struct rq *rq = cfs_rq->rq;
9996 unsigned long flags; 9995 unsigned long flags;
9997 9996
9998 spin_lock_irqsave(&rq->lock, flags); 9997 spin_lock_irqsave(&rq->lock, flags);
9999 __set_se_shares(se, shares); 9998 __set_se_shares(se, shares);
10000 spin_unlock_irqrestore(&rq->lock, flags); 9999 spin_unlock_irqrestore(&rq->lock, flags);
10001 } 10000 }
10002 10001
10003 static DEFINE_MUTEX(shares_mutex); 10002 static DEFINE_MUTEX(shares_mutex);
10004 10003
10005 int sched_group_set_shares(struct task_group *tg, unsigned long shares) 10004 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
10006 { 10005 {
10007 int i; 10006 int i;
10008 unsigned long flags; 10007 unsigned long flags;
10009 10008
10010 /* 10009 /*
10011 * We can't change the weight of the root cgroup. 10010 * We can't change the weight of the root cgroup.
10012 */ 10011 */
10013 if (!tg->se[0]) 10012 if (!tg->se[0])
10014 return -EINVAL; 10013 return -EINVAL;
10015 10014
10016 if (shares < MIN_SHARES) 10015 if (shares < MIN_SHARES)
10017 shares = MIN_SHARES; 10016 shares = MIN_SHARES;
10018 else if (shares > MAX_SHARES) 10017 else if (shares > MAX_SHARES)
10019 shares = MAX_SHARES; 10018 shares = MAX_SHARES;
10020 10019
10021 mutex_lock(&shares_mutex); 10020 mutex_lock(&shares_mutex);
10022 if (tg->shares == shares) 10021 if (tg->shares == shares)
10023 goto done; 10022 goto done;
10024 10023
10025 spin_lock_irqsave(&task_group_lock, flags); 10024 spin_lock_irqsave(&task_group_lock, flags);
10026 for_each_possible_cpu(i) 10025 for_each_possible_cpu(i)
10027 unregister_fair_sched_group(tg, i); 10026 unregister_fair_sched_group(tg, i);
10028 list_del_rcu(&tg->siblings); 10027 list_del_rcu(&tg->siblings);
10029 spin_unlock_irqrestore(&task_group_lock, flags); 10028 spin_unlock_irqrestore(&task_group_lock, flags);
10030 10029
10031 /* wait for any ongoing reference to this group to finish */ 10030 /* wait for any ongoing reference to this group to finish */
10032 synchronize_sched(); 10031 synchronize_sched();
10033 10032
10034 /* 10033 /*
10035 * Now we are free to modify the group's share on each cpu 10034 * Now we are free to modify the group's share on each cpu
10036 * w/o tripping rebalance_share or load_balance_fair. 10035 * w/o tripping rebalance_share or load_balance_fair.
10037 */ 10036 */
10038 tg->shares = shares; 10037 tg->shares = shares;
10039 for_each_possible_cpu(i) { 10038 for_each_possible_cpu(i) {
10040 /* 10039 /*
10041 * force a rebalance 10040 * force a rebalance
10042 */ 10041 */
10043 cfs_rq_set_shares(tg->cfs_rq[i], 0); 10042 cfs_rq_set_shares(tg->cfs_rq[i], 0);
10044 set_se_shares(tg->se[i], shares); 10043 set_se_shares(tg->se[i], shares);
10045 } 10044 }
10046 10045
10047 /* 10046 /*
10048 * Enable load balance activity on this group, by inserting it back on 10047 * Enable load balance activity on this group, by inserting it back on
10049 * each cpu's rq->leaf_cfs_rq_list. 10048 * each cpu's rq->leaf_cfs_rq_list.
10050 */ 10049 */
10051 spin_lock_irqsave(&task_group_lock, flags); 10050 spin_lock_irqsave(&task_group_lock, flags);
10052 for_each_possible_cpu(i) 10051 for_each_possible_cpu(i)
10053 register_fair_sched_group(tg, i); 10052 register_fair_sched_group(tg, i);
10054 list_add_rcu(&tg->siblings, &tg->parent->children); 10053 list_add_rcu(&tg->siblings, &tg->parent->children);
10055 spin_unlock_irqrestore(&task_group_lock, flags); 10054 spin_unlock_irqrestore(&task_group_lock, flags);
10056 done: 10055 done:
10057 mutex_unlock(&shares_mutex); 10056 mutex_unlock(&shares_mutex);
10058 return 0; 10057 return 0;
10059 } 10058 }
10060 10059
10061 unsigned long sched_group_shares(struct task_group *tg) 10060 unsigned long sched_group_shares(struct task_group *tg)
10062 { 10061 {
10063 return tg->shares; 10062 return tg->shares;
10064 } 10063 }
10065 #endif 10064 #endif
10066 10065
10067 #ifdef CONFIG_RT_GROUP_SCHED 10066 #ifdef CONFIG_RT_GROUP_SCHED
10068 /* 10067 /*
10069 * Ensure that the real time constraints are schedulable. 10068 * Ensure that the real time constraints are schedulable.
10070 */ 10069 */
10071 static DEFINE_MUTEX(rt_constraints_mutex); 10070 static DEFINE_MUTEX(rt_constraints_mutex);
10072 10071
10073 static unsigned long to_ratio(u64 period, u64 runtime) 10072 static unsigned long to_ratio(u64 period, u64 runtime)
10074 { 10073 {
10075 if (runtime == RUNTIME_INF) 10074 if (runtime == RUNTIME_INF)
10076 return 1ULL << 20; 10075 return 1ULL << 20;
10077 10076
10078 return div64_u64(runtime << 20, period); 10077 return div64_u64(runtime << 20, period);
10079 } 10078 }
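to_ratio() converts a (period, runtime) pair into a 20-bit fixed-point fraction of the period, so allocations with different period lengths can be compared and summed directly. A standalone userspace sketch of the same arithmetic (to_ratio_demo is a made-up name; the values mirror the common defaults of a 1 s period and 950 ms runtime):

        #include <stdint.h>
        #include <stdio.h>

        /* Same math as to_ratio(): runtime/period scaled by 2^20. */
        static unsigned long to_ratio_demo(uint64_t period_ns, uint64_t runtime_ns)
        {
                return (unsigned long)((runtime_ns << 20) / period_ns);
        }

        int main(void)
        {
                /* 950 ms of a 1 s period -> 0.95 * 2^20, i.e. 996147 */
                printf("%lu\n", to_ratio_demo(1000000000ULL, 950000000ULL));
                /* the whole period -> 1 << 20 = 1048576, the same value
                 * to_ratio() returns for RUNTIME_INF */
                printf("%lu\n", to_ratio_demo(1000000000ULL, 1000000000ULL));
                return 0;
        }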
10080 10079
10081 /* Must be called with tasklist_lock held */ 10080 /* Must be called with tasklist_lock held */
10082 static inline int tg_has_rt_tasks(struct task_group *tg) 10081 static inline int tg_has_rt_tasks(struct task_group *tg)
10083 { 10082 {
10084 struct task_struct *g, *p; 10083 struct task_struct *g, *p;
10085 10084
10086 do_each_thread(g, p) { 10085 do_each_thread(g, p) {
10087 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 10086 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
10088 return 1; 10087 return 1;
10089 } while_each_thread(g, p); 10088 } while_each_thread(g, p);
10090 10089
10091 return 0; 10090 return 0;
10092 } 10091 }
10093 10092
10094 struct rt_schedulable_data { 10093 struct rt_schedulable_data {
10095 struct task_group *tg; 10094 struct task_group *tg;
10096 u64 rt_period; 10095 u64 rt_period;
10097 u64 rt_runtime; 10096 u64 rt_runtime;
10098 }; 10097 };
10099 10098
10100 static int tg_schedulable(struct task_group *tg, void *data) 10099 static int tg_schedulable(struct task_group *tg, void *data)
10101 { 10100 {
10102 struct rt_schedulable_data *d = data; 10101 struct rt_schedulable_data *d = data;
10103 struct task_group *child; 10102 struct task_group *child;
10104 unsigned long total, sum = 0; 10103 unsigned long total, sum = 0;
10105 u64 period, runtime; 10104 u64 period, runtime;
10106 10105
10107 period = ktime_to_ns(tg->rt_bandwidth.rt_period); 10106 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
10108 runtime = tg->rt_bandwidth.rt_runtime; 10107 runtime = tg->rt_bandwidth.rt_runtime;
10109 10108
10110 if (tg == d->tg) { 10109 if (tg == d->tg) {
10111 period = d->rt_period; 10110 period = d->rt_period;
10112 runtime = d->rt_runtime; 10111 runtime = d->rt_runtime;
10113 } 10112 }
10114 10113
10115 #ifdef CONFIG_USER_SCHED 10114 #ifdef CONFIG_USER_SCHED
10116 if (tg == &root_task_group) { 10115 if (tg == &root_task_group) {
10117 period = global_rt_period(); 10116 period = global_rt_period();
10118 runtime = global_rt_runtime(); 10117 runtime = global_rt_runtime();
10119 } 10118 }
10120 #endif 10119 #endif
10121 10120
10122 /* 10121 /*
10123 * Cannot have more runtime than the period. 10122 * Cannot have more runtime than the period.
10124 */ 10123 */
10125 if (runtime > period && runtime != RUNTIME_INF) 10124 if (runtime > period && runtime != RUNTIME_INF)
10126 return -EINVAL; 10125 return -EINVAL;
10127 10126
10128 /* 10127 /*
10129 * Ensure we don't starve existing RT tasks. 10128 * Ensure we don't starve existing RT tasks.
10130 */ 10129 */
10131 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) 10130 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
10132 return -EBUSY; 10131 return -EBUSY;
10133 10132
10134 total = to_ratio(period, runtime); 10133 total = to_ratio(period, runtime);
10135 10134
10136 /* 10135 /*
10137 * Nobody can have more than the global setting allows. 10136 * Nobody can have more than the global setting allows.
10138 */ 10137 */
10139 if (total > to_ratio(global_rt_period(), global_rt_runtime())) 10138 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
10140 return -EINVAL; 10139 return -EINVAL;
10141 10140
10142 /* 10141 /*
10143 * The sum of our children's runtime should not exceed our own. 10142 * The sum of our children's runtime should not exceed our own.
10144 */ 10143 */
10145 list_for_each_entry_rcu(child, &tg->children, siblings) { 10144 list_for_each_entry_rcu(child, &tg->children, siblings) {
10146 period = ktime_to_ns(child->rt_bandwidth.rt_period); 10145 period = ktime_to_ns(child->rt_bandwidth.rt_period);
10147 runtime = child->rt_bandwidth.rt_runtime; 10146 runtime = child->rt_bandwidth.rt_runtime;
10148 10147
10149 if (child == d->tg) { 10148 if (child == d->tg) {
10150 period = d->rt_period; 10149 period = d->rt_period;
10151 runtime = d->rt_runtime; 10150 runtime = d->rt_runtime;
10152 } 10151 }
10153 10152
10154 sum += to_ratio(period, runtime); 10153 sum += to_ratio(period, runtime);
10155 } 10154 }
10156 10155
10157 if (sum > total) 10156 if (sum > total)
10158 return -EINVAL; 10157 return -EINVAL;
10159 10158
10160 return 0; 10159 return 0;
10161 } 10160 }
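Worked example of the two capacity checks above (illustrative numbers only): with a 1 s period, a group whose own runtime is 500 ms gets total = to_ratio(1 s, 500 ms) = 0.5 * 2^20 = 524288. If two of its children each request 300 ms per second, their combined sum is 2 * 314572 = 629144, which exceeds 524288, so the configuration is rejected with -EINVAL; the group's own total is likewise capped against to_ratio(global_rt_period(), global_rt_runtime()) one level further up.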
10162 10161
10163 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 10162 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
10164 { 10163 {
10165 struct rt_schedulable_data data = { 10164 struct rt_schedulable_data data = {
10166 .tg = tg, 10165 .tg = tg,
10167 .rt_period = period, 10166 .rt_period = period,
10168 .rt_runtime = runtime, 10167 .rt_runtime = runtime,
10169 }; 10168 };
10170 10169
10171 return walk_tg_tree(tg_schedulable, tg_nop, &data); 10170 return walk_tg_tree(tg_schedulable, tg_nop, &data);
10172 } 10171 }
10173 10172
10174 static int tg_set_bandwidth(struct task_group *tg, 10173 static int tg_set_bandwidth(struct task_group *tg,
10175 u64 rt_period, u64 rt_runtime) 10174 u64 rt_period, u64 rt_runtime)
10176 { 10175 {
10177 int i, err = 0; 10176 int i, err = 0;
10178 10177
10179 mutex_lock(&rt_constraints_mutex); 10178 mutex_lock(&rt_constraints_mutex);
10180 read_lock(&tasklist_lock); 10179 read_lock(&tasklist_lock);
10181 err = __rt_schedulable(tg, rt_period, rt_runtime); 10180 err = __rt_schedulable(tg, rt_period, rt_runtime);
10182 if (err) 10181 if (err)
10183 goto unlock; 10182 goto unlock;
10184 10183
10185 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 10184 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
10186 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 10185 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
10187 tg->rt_bandwidth.rt_runtime = rt_runtime; 10186 tg->rt_bandwidth.rt_runtime = rt_runtime;
10188 10187
10189 for_each_possible_cpu(i) { 10188 for_each_possible_cpu(i) {
10190 struct rt_rq *rt_rq = tg->rt_rq[i]; 10189 struct rt_rq *rt_rq = tg->rt_rq[i];
10191 10190
10192 spin_lock(&rt_rq->rt_runtime_lock); 10191 spin_lock(&rt_rq->rt_runtime_lock);
10193 rt_rq->rt_runtime = rt_runtime; 10192 rt_rq->rt_runtime = rt_runtime;
10194 spin_unlock(&rt_rq->rt_runtime_lock); 10193 spin_unlock(&rt_rq->rt_runtime_lock);
10195 } 10194 }
10196 spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 10195 spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
10197 unlock: 10196 unlock:
10198 read_unlock(&tasklist_lock); 10197 read_unlock(&tasklist_lock);
10199 mutex_unlock(&rt_constraints_mutex); 10198 mutex_unlock(&rt_constraints_mutex);
10200 10199
10201 return err; 10200 return err;
10202 } 10201 }
10203 10202
10204 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) 10203 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
10205 { 10204 {
10206 u64 rt_runtime, rt_period; 10205 u64 rt_runtime, rt_period;
10207 10206
10208 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 10207 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
10209 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; 10208 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
10210 if (rt_runtime_us < 0) 10209 if (rt_runtime_us < 0)
10211 rt_runtime = RUNTIME_INF; 10210 rt_runtime = RUNTIME_INF;
10212 10211
10213 return tg_set_bandwidth(tg, rt_period, rt_runtime); 10212 return tg_set_bandwidth(tg, rt_period, rt_runtime);
10214 } 10213 }
10215 10214
10216 long sched_group_rt_runtime(struct task_group *tg) 10215 long sched_group_rt_runtime(struct task_group *tg)
10217 { 10216 {
10218 u64 rt_runtime_us; 10217 u64 rt_runtime_us;
10219 10218
10220 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) 10219 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
10221 return -1; 10220 return -1;
10222 10221
10223 rt_runtime_us = tg->rt_bandwidth.rt_runtime; 10222 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
10224 do_div(rt_runtime_us, NSEC_PER_USEC); 10223 do_div(rt_runtime_us, NSEC_PER_USEC);
10225 return rt_runtime_us; 10224 return rt_runtime_us;
10226 } 10225 }
10227 10226
10228 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) 10227 int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
10229 { 10228 {
10230 u64 rt_runtime, rt_period; 10229 u64 rt_runtime, rt_period;
10231 10230
10232 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 10231 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
10233 rt_runtime = tg->rt_bandwidth.rt_runtime; 10232 rt_runtime = tg->rt_bandwidth.rt_runtime;
10234 10233
10235 if (rt_period == 0) 10234 if (rt_period == 0)
10236 return -EINVAL; 10235 return -EINVAL;
10237 10236
10238 return tg_set_bandwidth(tg, rt_period, rt_runtime); 10237 return tg_set_bandwidth(tg, rt_period, rt_runtime);
10239 } 10238 }
10240 10239
10241 long sched_group_rt_period(struct task_group *tg) 10240 long sched_group_rt_period(struct task_group *tg)
10242 { 10241 {
10243 u64 rt_period_us; 10242 u64 rt_period_us;
10244 10243
10245 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); 10244 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
10246 do_div(rt_period_us, NSEC_PER_USEC); 10245 do_div(rt_period_us, NSEC_PER_USEC);
10247 return rt_period_us; 10246 return rt_period_us;
10248 } 10247 }
10249 10248
10250 static int sched_rt_global_constraints(void) 10249 static int sched_rt_global_constraints(void)
10251 { 10250 {
10252 u64 runtime, period; 10251 u64 runtime, period;
10253 int ret = 0; 10252 int ret = 0;
10254 10253
10255 if (sysctl_sched_rt_period <= 0) 10254 if (sysctl_sched_rt_period <= 0)
10256 return -EINVAL; 10255 return -EINVAL;
10257 10256
10258 runtime = global_rt_runtime(); 10257 runtime = global_rt_runtime();
10259 period = global_rt_period(); 10258 period = global_rt_period();
10260 10259
10261 /* 10260 /*
10262 * Sanity check on the sysctl variables. 10261 * Sanity check on the sysctl variables.
10263 */ 10262 */
10264 if (runtime > period && runtime != RUNTIME_INF) 10263 if (runtime > period && runtime != RUNTIME_INF)
10265 return -EINVAL; 10264 return -EINVAL;
10266 10265
10267 mutex_lock(&rt_constraints_mutex); 10266 mutex_lock(&rt_constraints_mutex);
10268 read_lock(&tasklist_lock); 10267 read_lock(&tasklist_lock);
10269 ret = __rt_schedulable(NULL, 0, 0); 10268 ret = __rt_schedulable(NULL, 0, 0);
10270 read_unlock(&tasklist_lock); 10269 read_unlock(&tasklist_lock);
10271 mutex_unlock(&rt_constraints_mutex); 10270 mutex_unlock(&rt_constraints_mutex);
10272 10271
10273 return ret; 10272 return ret;
10274 } 10273 }
10275 10274
10276 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) 10275 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
10277 { 10276 {
10278 /* Don't accept realtime tasks when there is no way for them to run */ 10277 /* Don't accept realtime tasks when there is no way for them to run */
10279 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) 10278 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
10280 return 0; 10279 return 0;
10281 10280
10282 return 1; 10281 return 1;
10283 } 10282 }
10284 10283
10285 #else /* !CONFIG_RT_GROUP_SCHED */ 10284 #else /* !CONFIG_RT_GROUP_SCHED */
10286 static int sched_rt_global_constraints(void) 10285 static int sched_rt_global_constraints(void)
10287 { 10286 {
10288 unsigned long flags; 10287 unsigned long flags;
10289 int i; 10288 int i;
10290 10289
10291 if (sysctl_sched_rt_period <= 0) 10290 if (sysctl_sched_rt_period <= 0)
10292 return -EINVAL; 10291 return -EINVAL;
10293 10292
10294 /* 10293 /*
10295 * There's always some RT tasks in the root group 10294 * There's always some RT tasks in the root group
10296 * -- migration, kstopmachine etc.. 10295 * -- migration, kstopmachine etc..
10297 */ 10296 */
10298 if (sysctl_sched_rt_runtime == 0) 10297 if (sysctl_sched_rt_runtime == 0)
10299 return -EBUSY; 10298 return -EBUSY;
10300 10299
10301 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 10300 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
10302 for_each_possible_cpu(i) { 10301 for_each_possible_cpu(i) {
10303 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 10302 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
10304 10303
10305 spin_lock(&rt_rq->rt_runtime_lock); 10304 spin_lock(&rt_rq->rt_runtime_lock);
10306 rt_rq->rt_runtime = global_rt_runtime(); 10305 rt_rq->rt_runtime = global_rt_runtime();
10307 spin_unlock(&rt_rq->rt_runtime_lock); 10306 spin_unlock(&rt_rq->rt_runtime_lock);
10308 } 10307 }
10309 spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 10308 spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
10310 10309
10311 return 0; 10310 return 0;
10312 } 10311 }
10313 #endif /* CONFIG_RT_GROUP_SCHED */ 10312 #endif /* CONFIG_RT_GROUP_SCHED */
10314 10313
10315 int sched_rt_handler(struct ctl_table *table, int write, 10314 int sched_rt_handler(struct ctl_table *table, int write,
10316 void __user *buffer, size_t *lenp, 10315 void __user *buffer, size_t *lenp,
10317 loff_t *ppos) 10316 loff_t *ppos)
10318 { 10317 {
10319 int ret; 10318 int ret;
10320 int old_period, old_runtime; 10319 int old_period, old_runtime;
10321 static DEFINE_MUTEX(mutex); 10320 static DEFINE_MUTEX(mutex);
10322 10321
10323 mutex_lock(&mutex); 10322 mutex_lock(&mutex);
10324 old_period = sysctl_sched_rt_period; 10323 old_period = sysctl_sched_rt_period;
10325 old_runtime = sysctl_sched_rt_runtime; 10324 old_runtime = sysctl_sched_rt_runtime;
10326 10325
10327 ret = proc_dointvec(table, write, buffer, lenp, ppos); 10326 ret = proc_dointvec(table, write, buffer, lenp, ppos);
10328 10327
10329 if (!ret && write) { 10328 if (!ret && write) {
10330 ret = sched_rt_global_constraints(); 10329 ret = sched_rt_global_constraints();
10331 if (ret) { 10330 if (ret) {
10332 sysctl_sched_rt_period = old_period; 10331 sysctl_sched_rt_period = old_period;
10333 sysctl_sched_rt_runtime = old_runtime; 10332 sysctl_sched_rt_runtime = old_runtime;
10334 } else { 10333 } else {
10335 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 10334 def_rt_bandwidth.rt_runtime = global_rt_runtime();
10336 def_rt_bandwidth.rt_period = 10335 def_rt_bandwidth.rt_period =
10337 ns_to_ktime(global_rt_period()); 10336 ns_to_ktime(global_rt_period());
10338 } 10337 }
10339 } 10338 }
10340 mutex_unlock(&mutex); 10339 mutex_unlock(&mutex);
10341 10340
10342 return ret; 10341 return ret;
10343 } 10342 }
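sched_rt_handler() is reached only through a /proc/sys ctl_table entry. A sketch of what such an entry could look like; only the proc-facing fields are filled in, and the table name and exact field values are illustrative rather than copied from kernel/sysctl.c:

        static struct ctl_table demo_sched_table[] = {
                {
                        .procname       = "sched_rt_period_us",
                        .data           = &sysctl_sched_rt_period,
                        .maxlen         = sizeof(unsigned int),
                        .mode           = 0644,
                        .proc_handler   = sched_rt_handler,
                },
                { }     /* zero-filled sentinel terminates the table */
        };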
10344 10343
10345 #ifdef CONFIG_CGROUP_SCHED 10344 #ifdef CONFIG_CGROUP_SCHED
10346 10345
10347 /* return corresponding task_group object of a cgroup */ 10346 /* return corresponding task_group object of a cgroup */
10348 static inline struct task_group *cgroup_tg(struct cgroup *cgrp) 10347 static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
10349 { 10348 {
10350 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), 10349 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
10351 struct task_group, css); 10350 struct task_group, css);
10352 } 10351 }
10353 10352
10354 static struct cgroup_subsys_state * 10353 static struct cgroup_subsys_state *
10355 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) 10354 cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
10356 { 10355 {
10357 struct task_group *tg, *parent; 10356 struct task_group *tg, *parent;
10358 10357
10359 if (!cgrp->parent) { 10358 if (!cgrp->parent) {
10360 /* This is early initialization for the top cgroup */ 10359 /* This is early initialization for the top cgroup */
10361 return &init_task_group.css; 10360 return &init_task_group.css;
10362 } 10361 }
10363 10362
10364 parent = cgroup_tg(cgrp->parent); 10363 parent = cgroup_tg(cgrp->parent);
10365 tg = sched_create_group(parent); 10364 tg = sched_create_group(parent);
10366 if (IS_ERR(tg)) 10365 if (IS_ERR(tg))
10367 return ERR_PTR(-ENOMEM); 10366 return ERR_PTR(-ENOMEM);
10368 10367
10369 return &tg->css; 10368 return &tg->css;
10370 } 10369 }
10371 10370
10372 static void 10371 static void
10373 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 10372 cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
10374 { 10373 {
10375 struct task_group *tg = cgroup_tg(cgrp); 10374 struct task_group *tg = cgroup_tg(cgrp);
10376 10375
10377 sched_destroy_group(tg); 10376 sched_destroy_group(tg);
10378 } 10377 }
10379 10378
10380 static int 10379 static int
10381 cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 10380 cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
10382 { 10381 {
10383 #ifdef CONFIG_RT_GROUP_SCHED 10382 #ifdef CONFIG_RT_GROUP_SCHED
10384 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 10383 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
10385 return -EINVAL; 10384 return -EINVAL;
10386 #else 10385 #else
10387 /* We don't support RT-tasks being in separate groups */ 10386 /* We don't support RT-tasks being in separate groups */
10388 if (tsk->sched_class != &fair_sched_class) 10387 if (tsk->sched_class != &fair_sched_class)
10389 return -EINVAL; 10388 return -EINVAL;
10390 #endif 10389 #endif
10391 return 0; 10390 return 0;
10392 } 10391 }
10393 10392
10394 static int 10393 static int
10395 cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 10394 cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10396 struct task_struct *tsk, bool threadgroup) 10395 struct task_struct *tsk, bool threadgroup)
10397 { 10396 {
10398 int retval = cpu_cgroup_can_attach_task(cgrp, tsk); 10397 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
10399 if (retval) 10398 if (retval)
10400 return retval; 10399 return retval;
10401 if (threadgroup) { 10400 if (threadgroup) {
10402 struct task_struct *c; 10401 struct task_struct *c;
10403 rcu_read_lock(); 10402 rcu_read_lock();
10404 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 10403 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
10405 retval = cpu_cgroup_can_attach_task(cgrp, c); 10404 retval = cpu_cgroup_can_attach_task(cgrp, c);
10406 if (retval) { 10405 if (retval) {
10407 rcu_read_unlock(); 10406 rcu_read_unlock();
10408 return retval; 10407 return retval;
10409 } 10408 }
10410 } 10409 }
10411 rcu_read_unlock(); 10410 rcu_read_unlock();
10412 } 10411 }
10413 return 0; 10412 return 0;
10414 } 10413 }
10415 10414
10416 static void 10415 static void
10417 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 10416 cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10418 struct cgroup *old_cont, struct task_struct *tsk, 10417 struct cgroup *old_cont, struct task_struct *tsk,
10419 bool threadgroup) 10418 bool threadgroup)
10420 { 10419 {
10421 sched_move_task(tsk); 10420 sched_move_task(tsk);
10422 if (threadgroup) { 10421 if (threadgroup) {
10423 struct task_struct *c; 10422 struct task_struct *c;
10424 rcu_read_lock(); 10423 rcu_read_lock();
10425 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 10424 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
10426 sched_move_task(c); 10425 sched_move_task(c);
10427 } 10426 }
10428 rcu_read_unlock(); 10427 rcu_read_unlock();
10429 } 10428 }
10430 } 10429 }
10431 10430
10432 #ifdef CONFIG_FAIR_GROUP_SCHED 10431 #ifdef CONFIG_FAIR_GROUP_SCHED
10433 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 10432 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
10434 u64 shareval) 10433 u64 shareval)
10435 { 10434 {
10436 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 10435 return sched_group_set_shares(cgroup_tg(cgrp), shareval);
10437 } 10436 }
10438 10437
10439 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 10438 static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
10440 { 10439 {
10441 struct task_group *tg = cgroup_tg(cgrp); 10440 struct task_group *tg = cgroup_tg(cgrp);
10442 10441
10443 return (u64) tg->shares; 10442 return (u64) tg->shares;
10444 } 10443 }
10445 #endif /* CONFIG_FAIR_GROUP_SCHED */ 10444 #endif /* CONFIG_FAIR_GROUP_SCHED */
10446 10445
10447 #ifdef CONFIG_RT_GROUP_SCHED 10446 #ifdef CONFIG_RT_GROUP_SCHED
10448 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 10447 static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
10449 s64 val) 10448 s64 val)
10450 { 10449 {
10451 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); 10450 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
10452 } 10451 }
10453 10452
10454 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) 10453 static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
10455 { 10454 {
10456 return sched_group_rt_runtime(cgroup_tg(cgrp)); 10455 return sched_group_rt_runtime(cgroup_tg(cgrp));
10457 } 10456 }
10458 10457
10459 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, 10458 static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
10460 u64 rt_period_us) 10459 u64 rt_period_us)
10461 { 10460 {
10462 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); 10461 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
10463 } 10462 }
10464 10463
10465 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) 10464 static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
10466 { 10465 {
10467 return sched_group_rt_period(cgroup_tg(cgrp)); 10466 return sched_group_rt_period(cgroup_tg(cgrp));
10468 } 10467 }
10469 #endif /* CONFIG_RT_GROUP_SCHED */ 10468 #endif /* CONFIG_RT_GROUP_SCHED */
10470 10469
10471 static struct cftype cpu_files[] = { 10470 static struct cftype cpu_files[] = {
10472 #ifdef CONFIG_FAIR_GROUP_SCHED 10471 #ifdef CONFIG_FAIR_GROUP_SCHED
10473 { 10472 {
10474 .name = "shares", 10473 .name = "shares",
10475 .read_u64 = cpu_shares_read_u64, 10474 .read_u64 = cpu_shares_read_u64,
10476 .write_u64 = cpu_shares_write_u64, 10475 .write_u64 = cpu_shares_write_u64,
10477 }, 10476 },
10478 #endif 10477 #endif
10479 #ifdef CONFIG_RT_GROUP_SCHED 10478 #ifdef CONFIG_RT_GROUP_SCHED
10480 { 10479 {
10481 .name = "rt_runtime_us", 10480 .name = "rt_runtime_us",
10482 .read_s64 = cpu_rt_runtime_read, 10481 .read_s64 = cpu_rt_runtime_read,
10483 .write_s64 = cpu_rt_runtime_write, 10482 .write_s64 = cpu_rt_runtime_write,
10484 }, 10483 },
10485 { 10484 {
10486 .name = "rt_period_us", 10485 .name = "rt_period_us",
10487 .read_u64 = cpu_rt_period_read_uint, 10486 .read_u64 = cpu_rt_period_read_uint,
10488 .write_u64 = cpu_rt_period_write_uint, 10487 .write_u64 = cpu_rt_period_write_uint,
10489 }, 10488 },
10490 #endif 10489 #endif
10491 }; 10490 };
10492 10491
10493 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 10492 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
10494 { 10493 {
10495 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); 10494 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
10496 } 10495 }
10497 10496
10498 struct cgroup_subsys cpu_cgroup_subsys = { 10497 struct cgroup_subsys cpu_cgroup_subsys = {
10499 .name = "cpu", 10498 .name = "cpu",
10500 .create = cpu_cgroup_create, 10499 .create = cpu_cgroup_create,
10501 .destroy = cpu_cgroup_destroy, 10500 .destroy = cpu_cgroup_destroy,
10502 .can_attach = cpu_cgroup_can_attach, 10501 .can_attach = cpu_cgroup_can_attach,
10503 .attach = cpu_cgroup_attach, 10502 .attach = cpu_cgroup_attach,
10504 .populate = cpu_cgroup_populate, 10503 .populate = cpu_cgroup_populate,
10505 .subsys_id = cpu_cgroup_subsys_id, 10504 .subsys_id = cpu_cgroup_subsys_id,
10506 .early_init = 1, 10505 .early_init = 1,
10507 }; 10506 };
10508 10507
10509 #endif /* CONFIG_CGROUP_SCHED */ 10508 #endif /* CONFIG_CGROUP_SCHED */
10510 10509
10511 #ifdef CONFIG_CGROUP_CPUACCT 10510 #ifdef CONFIG_CGROUP_CPUACCT
10512 10511
10513 /* 10512 /*
10514 * CPU accounting code for task groups. 10513 * CPU accounting code for task groups.
10515 * 10514 *
10516 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh 10515 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
10517 * (balbir@in.ibm.com). 10516 * (balbir@in.ibm.com).
10518 */ 10517 */
10519 10518
10520 /* track cpu usage of a group of tasks and its child groups */ 10519 /* track cpu usage of a group of tasks and its child groups */
10521 struct cpuacct { 10520 struct cpuacct {
10522 struct cgroup_subsys_state css; 10521 struct cgroup_subsys_state css;
10523 /* cpuusage holds pointer to a u64-type object on every cpu */ 10522 /* cpuusage holds pointer to a u64-type object on every cpu */
10524 u64 *cpuusage; 10523 u64 *cpuusage;
10525 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 10524 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
10526 struct cpuacct *parent; 10525 struct cpuacct *parent;
10527 }; 10526 };
10528 10527
10529 struct cgroup_subsys cpuacct_subsys; 10528 struct cgroup_subsys cpuacct_subsys;
10530 10529
10531 /* return cpu accounting group corresponding to this container */ 10530 /* return cpu accounting group corresponding to this container */
10532 static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) 10531 static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
10533 { 10532 {
10534 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), 10533 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
10535 struct cpuacct, css); 10534 struct cpuacct, css);
10536 } 10535 }
10537 10536
10538 /* return cpu accounting group to which this task belongs */ 10537 /* return cpu accounting group to which this task belongs */
10539 static inline struct cpuacct *task_ca(struct task_struct *tsk) 10538 static inline struct cpuacct *task_ca(struct task_struct *tsk)
10540 { 10539 {
10541 return container_of(task_subsys_state(tsk, cpuacct_subsys_id), 10540 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
10542 struct cpuacct, css); 10541 struct cpuacct, css);
10543 } 10542 }
10544 10543
10545 /* create a new cpu accounting group */ 10544 /* create a new cpu accounting group */
10546 static struct cgroup_subsys_state *cpuacct_create( 10545 static struct cgroup_subsys_state *cpuacct_create(
10547 struct cgroup_subsys *ss, struct cgroup *cgrp) 10546 struct cgroup_subsys *ss, struct cgroup *cgrp)
10548 { 10547 {
10549 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 10548 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
10550 int i; 10549 int i;
10551 10550
10552 if (!ca) 10551 if (!ca)
10553 goto out; 10552 goto out;
10554 10553
10555 ca->cpuusage = alloc_percpu(u64); 10554 ca->cpuusage = alloc_percpu(u64);
10556 if (!ca->cpuusage) 10555 if (!ca->cpuusage)
10557 goto out_free_ca; 10556 goto out_free_ca;
10558 10557
10559 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 10558 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
10560 if (percpu_counter_init(&ca->cpustat[i], 0)) 10559 if (percpu_counter_init(&ca->cpustat[i], 0))
10561 goto out_free_counters; 10560 goto out_free_counters;
10562 10561
10563 if (cgrp->parent) 10562 if (cgrp->parent)
10564 ca->parent = cgroup_ca(cgrp->parent); 10563 ca->parent = cgroup_ca(cgrp->parent);
10565 10564
10566 return &ca->css; 10565 return &ca->css;
10567 10566
10568 out_free_counters: 10567 out_free_counters:
10569 while (--i >= 0) 10568 while (--i >= 0)
10570 percpu_counter_destroy(&ca->cpustat[i]); 10569 percpu_counter_destroy(&ca->cpustat[i]);
10571 free_percpu(ca->cpuusage); 10570 free_percpu(ca->cpuusage);
10572 out_free_ca: 10571 out_free_ca:
10573 kfree(ca); 10572 kfree(ca);
10574 out: 10573 out:
10575 return ERR_PTR(-ENOMEM); 10574 return ERR_PTR(-ENOMEM);
10576 } 10575 }
10577 10576
10578 /* destroy an existing cpu accounting group */ 10577 /* destroy an existing cpu accounting group */
10579 static void 10578 static void
10580 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 10579 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
10581 { 10580 {
10582 struct cpuacct *ca = cgroup_ca(cgrp); 10581 struct cpuacct *ca = cgroup_ca(cgrp);
10583 int i; 10582 int i;
10584 10583
10585 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 10584 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
10586 percpu_counter_destroy(&ca->cpustat[i]); 10585 percpu_counter_destroy(&ca->cpustat[i]);
10587 free_percpu(ca->cpuusage); 10586 free_percpu(ca->cpuusage);
10588 kfree(ca); 10587 kfree(ca);
10589 } 10588 }
10590 10589
10591 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) 10590 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
10592 { 10591 {
10593 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 10592 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
10594 u64 data; 10593 u64 data;
10595 10594
10596 #ifndef CONFIG_64BIT 10595 #ifndef CONFIG_64BIT
10597 /* 10596 /*
10598 * Take rq->lock to make 64-bit read safe on 32-bit platforms. 10597 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
10599 */ 10598 */
10600 spin_lock_irq(&cpu_rq(cpu)->lock); 10599 spin_lock_irq(&cpu_rq(cpu)->lock);
10601 data = *cpuusage; 10600 data = *cpuusage;
10602 spin_unlock_irq(&cpu_rq(cpu)->lock); 10601 spin_unlock_irq(&cpu_rq(cpu)->lock);
10603 #else 10602 #else
10604 data = *cpuusage; 10603 data = *cpuusage;
10605 #endif 10604 #endif
10606 10605
10607 return data; 10606 return data;
10608 } 10607 }
10609 10608
10610 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) 10609 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
10611 { 10610 {
10612 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 10611 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
10613 10612
10614 #ifndef CONFIG_64BIT 10613 #ifndef CONFIG_64BIT
10615 /* 10614 /*
10616 * Take rq->lock to make 64-bit write safe on 32-bit platforms. 10615 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
10617 */ 10616 */
10618 spin_lock_irq(&cpu_rq(cpu)->lock); 10617 spin_lock_irq(&cpu_rq(cpu)->lock);
10619 *cpuusage = val; 10618 *cpuusage = val;
10620 spin_unlock_irq(&cpu_rq(cpu)->lock); 10619 spin_unlock_irq(&cpu_rq(cpu)->lock);
10621 #else 10620 #else
10622 *cpuusage = val; 10621 *cpuusage = val;
10623 #endif 10622 #endif
10624 } 10623 }
10625 10624
10626 /* return total cpu usage (in nanoseconds) of a group */ 10625 /* return total cpu usage (in nanoseconds) of a group */
10627 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) 10626 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
10628 { 10627 {
10629 struct cpuacct *ca = cgroup_ca(cgrp); 10628 struct cpuacct *ca = cgroup_ca(cgrp);
10630 u64 totalcpuusage = 0; 10629 u64 totalcpuusage = 0;
10631 int i; 10630 int i;
10632 10631
10633 for_each_present_cpu(i) 10632 for_each_present_cpu(i)
10634 totalcpuusage += cpuacct_cpuusage_read(ca, i); 10633 totalcpuusage += cpuacct_cpuusage_read(ca, i);
10635 10634
10636 return totalcpuusage; 10635 return totalcpuusage;
10637 } 10636 }
10638 10637
10639 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, 10638 static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
10640 u64 reset) 10639 u64 reset)
10641 { 10640 {
10642 struct cpuacct *ca = cgroup_ca(cgrp); 10641 struct cpuacct *ca = cgroup_ca(cgrp);
10643 int err = 0; 10642 int err = 0;
10644 int i; 10643 int i;
10645 10644
10646 if (reset) { 10645 if (reset) {
10647 err = -EINVAL; 10646 err = -EINVAL;
10648 goto out; 10647 goto out;
10649 } 10648 }
10650 10649
10651 for_each_present_cpu(i) 10650 for_each_present_cpu(i)
10652 cpuacct_cpuusage_write(ca, i, 0); 10651 cpuacct_cpuusage_write(ca, i, 0);
10653 10652
10654 out: 10653 out:
10655 return err; 10654 return err;
10656 } 10655 }
10657 10656
10658 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, 10657 static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
10659 struct seq_file *m) 10658 struct seq_file *m)
10660 { 10659 {
10661 struct cpuacct *ca = cgroup_ca(cgroup); 10660 struct cpuacct *ca = cgroup_ca(cgroup);
10662 u64 percpu; 10661 u64 percpu;
10663 int i; 10662 int i;
10664 10663
10665 for_each_present_cpu(i) { 10664 for_each_present_cpu(i) {
10666 percpu = cpuacct_cpuusage_read(ca, i); 10665 percpu = cpuacct_cpuusage_read(ca, i);
10667 seq_printf(m, "%llu ", (unsigned long long) percpu); 10666 seq_printf(m, "%llu ", (unsigned long long) percpu);
10668 } 10667 }
10669 seq_printf(m, "\n"); 10668 seq_printf(m, "\n");
10670 return 0; 10669 return 0;
10671 } 10670 }
10672 10671
10673 static const char *cpuacct_stat_desc[] = { 10672 static const char *cpuacct_stat_desc[] = {
10674 [CPUACCT_STAT_USER] = "user", 10673 [CPUACCT_STAT_USER] = "user",
10675 [CPUACCT_STAT_SYSTEM] = "system", 10674 [CPUACCT_STAT_SYSTEM] = "system",
10676 }; 10675 };
10677 10676
10678 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 10677 static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
10679 struct cgroup_map_cb *cb) 10678 struct cgroup_map_cb *cb)
10680 { 10679 {
10681 struct cpuacct *ca = cgroup_ca(cgrp); 10680 struct cpuacct *ca = cgroup_ca(cgrp);
10682 int i; 10681 int i;
10683 10682
10684 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { 10683 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
10685 s64 val = percpu_counter_read(&ca->cpustat[i]); 10684 s64 val = percpu_counter_read(&ca->cpustat[i]);
10686 val = cputime64_to_clock_t(val); 10685 val = cputime64_to_clock_t(val);
10687 cb->fill(cb, cpuacct_stat_desc[i], val); 10686 cb->fill(cb, cpuacct_stat_desc[i], val);
10688 } 10687 }
10689 return 0; 10688 return 0;
10690 } 10689 }
10691 10690
10692 static struct cftype files[] = { 10691 static struct cftype files[] = {
10693 { 10692 {
10694 .name = "usage", 10693 .name = "usage",
10695 .read_u64 = cpuusage_read, 10694 .read_u64 = cpuusage_read,
10696 .write_u64 = cpuusage_write, 10695 .write_u64 = cpuusage_write,
10697 }, 10696 },
10698 { 10697 {
10699 .name = "usage_percpu", 10698 .name = "usage_percpu",
10700 .read_seq_string = cpuacct_percpu_seq_read, 10699 .read_seq_string = cpuacct_percpu_seq_read,
10701 }, 10700 },
10702 { 10701 {
10703 .name = "stat", 10702 .name = "stat",
10704 .read_map = cpuacct_stats_show, 10703 .read_map = cpuacct_stats_show,
10705 }, 10704 },
10706 }; 10705 };
10707 10706
10708 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 10707 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
10709 { 10708 {
10710 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); 10709 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
10711 } 10710 }
10712 10711
10713 /* 10712 /*
10714 * charge this task's execution time to its accounting group. 10713 * charge this task's execution time to its accounting group.
10715 * 10714 *
10716 * called with rq->lock held. 10715 * called with rq->lock held.
10717 */ 10716 */
10718 static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 10717 static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10719 { 10718 {
10720 struct cpuacct *ca; 10719 struct cpuacct *ca;
10721 int cpu; 10720 int cpu;
10722 10721
10723 if (unlikely(!cpuacct_subsys.active)) 10722 if (unlikely(!cpuacct_subsys.active))
10724 return; 10723 return;
10725 10724
10726 cpu = task_cpu(tsk); 10725 cpu = task_cpu(tsk);
10727 10726
10728 rcu_read_lock(); 10727 rcu_read_lock();
10729 10728
10730 ca = task_ca(tsk); 10729 ca = task_ca(tsk);
10731 10730
10732 for (; ca; ca = ca->parent) { 10731 for (; ca; ca = ca->parent) {
10733 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 10732 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
10734 *cpuusage += cputime; 10733 *cpuusage += cputime;
10735 } 10734 }
10736 10735
10737 rcu_read_unlock(); 10736 rcu_read_unlock();
10738 } 10737 }
10739 10738
10740 /* 10739 /*
10741 * Charge the system/user time to the task's accounting group. 10740 * Charge the system/user time to the task's accounting group.
10742 */ 10741 */
10743 static void cpuacct_update_stats(struct task_struct *tsk, 10742 static void cpuacct_update_stats(struct task_struct *tsk,
10744 enum cpuacct_stat_index idx, cputime_t val) 10743 enum cpuacct_stat_index idx, cputime_t val)
10745 { 10744 {
10746 struct cpuacct *ca; 10745 struct cpuacct *ca;
10747 10746
10748 if (unlikely(!cpuacct_subsys.active)) 10747 if (unlikely(!cpuacct_subsys.active))
10749 return; 10748 return;
10750 10749
10751 rcu_read_lock(); 10750 rcu_read_lock();
10752 ca = task_ca(tsk); 10751 ca = task_ca(tsk);
10753 10752
10754 do { 10753 do {
10755 percpu_counter_add(&ca->cpustat[idx], val); 10754 percpu_counter_add(&ca->cpustat[idx], val);
10756 ca = ca->parent; 10755 ca = ca->parent;
10757 } while (ca); 10756 } while (ca);
10758 rcu_read_unlock(); 10757 rcu_read_unlock();
10759 } 10758 }
10760 10759
10761 struct cgroup_subsys cpuacct_subsys = { 10760 struct cgroup_subsys cpuacct_subsys = {
10762 .name = "cpuacct", 10761 .name = "cpuacct",
10763 .create = cpuacct_create, 10762 .create = cpuacct_create,
10764 .destroy = cpuacct_destroy, 10763 .destroy = cpuacct_destroy,
10765 .populate = cpuacct_populate, 10764 .populate = cpuacct_populate,
10766 .subsys_id = cpuacct_subsys_id, 10765 .subsys_id = cpuacct_subsys_id,
10767 }; 10766 };
10768 #endif /* CONFIG_CGROUP_CPUACCT */ 10767 #endif /* CONFIG_CGROUP_CPUACCT */
10769 10768
10770 #ifndef CONFIG_SMP 10769 #ifndef CONFIG_SMP
10771 10770
10772 int rcu_expedited_torture_stats(char *page) 10771 int rcu_expedited_torture_stats(char *page)
10773 { 10772 {
10774 return 0; 10773 return 0;
10775 } 10774 }
10776 EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); 10775 EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10777 10776
10778 void synchronize_sched_expedited(void) 10777 void synchronize_sched_expedited(void)
10779 { 10778 {
10780 } 10779 }
10781 EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 10780 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10782 10781
10783 #else /* #ifndef CONFIG_SMP */ 10782 #else /* #ifndef CONFIG_SMP */
10784 10783
10785 static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); 10784 static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
10786 static DEFINE_MUTEX(rcu_sched_expedited_mutex); 10785 static DEFINE_MUTEX(rcu_sched_expedited_mutex);
10787 10786
10788 #define RCU_EXPEDITED_STATE_POST -2 10787 #define RCU_EXPEDITED_STATE_POST -2
10789 #define RCU_EXPEDITED_STATE_IDLE -1 10788 #define RCU_EXPEDITED_STATE_IDLE -1
10790 10789
10791 static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 10790 static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10792 10791
10793 int rcu_expedited_torture_stats(char *page) 10792 int rcu_expedited_torture_stats(char *page)
10794 { 10793 {
10795 int cnt = 0; 10794 int cnt = 0;
10796 int cpu; 10795 int cpu;
10797 10796
10798 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); 10797 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
10799 for_each_online_cpu(cpu) { 10798 for_each_online_cpu(cpu) {
10800 cnt += sprintf(&page[cnt], " %d:%d", 10799 cnt += sprintf(&page[cnt], " %d:%d",
10801 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); 10800 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
10802 } 10801 }
10803 cnt += sprintf(&page[cnt], "\n"); 10802 cnt += sprintf(&page[cnt], "\n");
10804 return cnt; 10803 return cnt;
10805 } 10804 }
10806 EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); 10805 EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10807 10806
10808 static long synchronize_sched_expedited_count; 10807 static long synchronize_sched_expedited_count;
10809 10808
10810 /* 10809 /*
10811 * Wait for an rcu-sched grace period to elapse, but use "big hammer" 10810 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
10812 * approach to force grace period to end quickly. This consumes 10811 * approach to force grace period to end quickly. This consumes
10813 * significant time on all CPUs, and is thus not recommended for 10812 * significant time on all CPUs, and is thus not recommended for
10814 * any sort of common-case code. 10813 * any sort of common-case code.
10815 * 10814 *
10816 * Note that it is illegal to call this function while holding any 10815 * Note that it is illegal to call this function while holding any
10817 * lock that is acquired by a CPU-hotplug notifier. Failing to 10816 * lock that is acquired by a CPU-hotplug notifier. Failing to
10818 * observe this restriction will result in deadlock. 10817 * observe this restriction will result in deadlock.
10819 */ 10818 */
10820 void synchronize_sched_expedited(void) 10819 void synchronize_sched_expedited(void)
10821 { 10820 {
10822 int cpu; 10821 int cpu;
10823 unsigned long flags; 10822 unsigned long flags;
10824 bool need_full_sync = 0; 10823 bool need_full_sync = 0;
10825 struct rq *rq; 10824 struct rq *rq;
10826 struct migration_req *req; 10825 struct migration_req *req;
10827 long snap; 10826 long snap;
10828 int trycount = 0; 10827 int trycount = 0;
10829 10828
10830 smp_mb(); /* ensure prior mod happens before capturing snap. */ 10829 smp_mb(); /* ensure prior mod happens before capturing snap. */
10831 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; 10830 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
10832 get_online_cpus(); 10831 get_online_cpus();
10833 while (!mutex_trylock(&rcu_sched_expedited_mutex)) { 10832 while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
10834 put_online_cpus(); 10833 put_online_cpus();
10835 if (trycount++ < 10) 10834 if (trycount++ < 10)
10836 udelay(trycount * num_online_cpus()); 10835 udelay(trycount * num_online_cpus());
10837 else { 10836 else {
10838 synchronize_sched(); 10837 synchronize_sched();
10839 return; 10838 return;
10840 } 10839 }
10841 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { 10840 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
10842 smp_mb(); /* ensure test happens before caller kfree */ 10841 smp_mb(); /* ensure test happens before caller kfree */
10843 return; 10842 return;
10844 } 10843 }
10845 get_online_cpus(); 10844 get_online_cpus();
10846 } 10845 }
10847 rcu_expedited_state = RCU_EXPEDITED_STATE_POST; 10846 rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
10848 for_each_online_cpu(cpu) { 10847 for_each_online_cpu(cpu) {
10849 rq = cpu_rq(cpu); 10848 rq = cpu_rq(cpu);
10850 req = &per_cpu(rcu_migration_req, cpu); 10849 req = &per_cpu(rcu_migration_req, cpu);
10851 init_completion(&req->done); 10850 init_completion(&req->done);
10852 req->task = NULL; 10851 req->task = NULL;
10853 req->dest_cpu = RCU_MIGRATION_NEED_QS; 10852 req->dest_cpu = RCU_MIGRATION_NEED_QS;
10854 spin_lock_irqsave(&rq->lock, flags); 10853 spin_lock_irqsave(&rq->lock, flags);
10855 list_add(&req->list, &rq->migration_queue); 10854 list_add(&req->list, &rq->migration_queue);
10856 spin_unlock_irqrestore(&rq->lock, flags); 10855 spin_unlock_irqrestore(&rq->lock, flags);
10857 wake_up_process(rq->migration_thread); 10856 wake_up_process(rq->migration_thread);
10858 } 10857 }
10859 for_each_online_cpu(cpu) { 10858 for_each_online_cpu(cpu) {
10860 rcu_expedited_state = cpu; 10859 rcu_expedited_state = cpu;
10861 req = &per_cpu(rcu_migration_req, cpu); 10860 req = &per_cpu(rcu_migration_req, cpu);
10862 rq = cpu_rq(cpu); 10861 rq = cpu_rq(cpu);
10863 wait_for_completion(&req->done); 10862 wait_for_completion(&req->done);
10864 spin_lock_irqsave(&rq->lock, flags); 10863 spin_lock_irqsave(&rq->lock, flags);
10865 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) 10864 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
10866 need_full_sync = 1; 10865 need_full_sync = 1;
10867 req->dest_cpu = RCU_MIGRATION_IDLE; 10866 req->dest_cpu = RCU_MIGRATION_IDLE;
10868 spin_unlock_irqrestore(&rq->lock, flags); 10867 spin_unlock_irqrestore(&rq->lock, flags);
10869 } 10868 }
10870 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 10869 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10871 mutex_unlock(&rcu_sched_expedited_mutex); 10870 mutex_unlock(&rcu_sched_expedited_mutex);
10872 put_online_cpus(); 10871 put_online_cpus();
10873 if (need_full_sync) 10872 if (need_full_sync)
10874 synchronize_sched(); 10873 synchronize_sched();
10875 } 10874 }
10876 EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 10875 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10877 10876
10878 #endif /* #else #ifndef CONFIG_SMP */ 10877 #endif /* #else #ifndef CONFIG_SMP */
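
The function above gives callers a faster-completing alternative to synchronize_sched() by pushing a quiescent state through each CPU's migration thread. A minimal caller sketch, assuming a hypothetical RCU-protected list (struct foo, foo_lock and foo_retire() are invented for illustration), showing the usual unlink, wait, then free pattern:

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

/* Hypothetical element of an RCU-protected list. */
struct foo {
        struct list_head        list;
        int                     key;
};

static LIST_HEAD(foo_list);             /* walked under rcu_read_lock() */
static DEFINE_SPINLOCK(foo_lock);       /* serialises updaters */

static void foo_retire(struct foo *f)
{
        spin_lock(&foo_lock);
        list_del_rcu(&f->list);
        spin_unlock(&foo_lock);

        /*
         * Expedited grace period: all pre-existing preempt-disabled
         * readers have finished when this returns.  Per the comment
         * above, no lock taken by a CPU-hotplug notifier may be held
         * here.
         */
        synchronize_sched_expedited();

        kfree(f);
}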
10879 10878
kernel/slow-work.c
1 /* Worker thread pool for slow items, such as filesystem lookups or mkdirs 1 /* Worker thread pool for slow items, such as filesystem lookups or mkdirs
2 * 2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com) 4 * Written by David Howells (dhowells@redhat.com)
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence 7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 * 10 *
11 * See Documentation/slow-work.txt 11 * See Documentation/slow-work.txt
12 */ 12 */
13 13
14 #include <linux/module.h> 14 #include <linux/module.h>
15 #include <linux/slow-work.h> 15 #include <linux/slow-work.h>
16 #include <linux/kthread.h> 16 #include <linux/kthread.h>
17 #include <linux/freezer.h> 17 #include <linux/freezer.h>
18 #include <linux/wait.h> 18 #include <linux/wait.h>
19 19
20 #define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of 20 #define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of
21 * things to do */ 21 * things to do */
22 #define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after 22 #define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
23 * OOM */ 23 * OOM */
24 24
25 static void slow_work_cull_timeout(unsigned long); 25 static void slow_work_cull_timeout(unsigned long);
26 static void slow_work_oom_timeout(unsigned long); 26 static void slow_work_oom_timeout(unsigned long);
27 27
28 #ifdef CONFIG_SYSCTL 28 #ifdef CONFIG_SYSCTL
29 static int slow_work_min_threads_sysctl(struct ctl_table *, int, 29 static int slow_work_min_threads_sysctl(struct ctl_table *, int,
30 void __user *, size_t *, loff_t *); 30 void __user *, size_t *, loff_t *);
31 31
32 static int slow_work_max_threads_sysctl(struct ctl_table *, int , 32 static int slow_work_max_threads_sysctl(struct ctl_table *, int ,
33 void __user *, size_t *, loff_t *); 33 void __user *, size_t *, loff_t *);
34 #endif 34 #endif
35 35
36 /* 36 /*
37 * The pool of threads has at least min threads in it as long as someone is 37 * The pool of threads has at least min threads in it as long as someone is
38 * using the facility, and may have as many as max. 38 * using the facility, and may have as many as max.
39 * 39 *
40 * A portion of the pool may be processing very slow operations. 40 * A portion of the pool may be processing very slow operations.
41 */ 41 */
42 static unsigned slow_work_min_threads = 2; 42 static unsigned slow_work_min_threads = 2;
43 static unsigned slow_work_max_threads = 4; 43 static unsigned slow_work_max_threads = 4;
44 static unsigned vslow_work_proportion = 50; /* % of threads that may process 44 static unsigned vslow_work_proportion = 50; /* % of threads that may process
45 * very slow work */ 45 * very slow work */
46 46
47 #ifdef CONFIG_SYSCTL 47 #ifdef CONFIG_SYSCTL
48 static const int slow_work_min_min_threads = 2; 48 static const int slow_work_min_min_threads = 2;
49 static int slow_work_max_max_threads = 255; 49 static int slow_work_max_max_threads = 255;
50 static const int slow_work_min_vslow = 1; 50 static const int slow_work_min_vslow = 1;
51 static const int slow_work_max_vslow = 99; 51 static const int slow_work_max_vslow = 99;
52 52
53 ctl_table slow_work_sysctls[] = { 53 ctl_table slow_work_sysctls[] = {
54 { 54 {
55 .ctl_name = CTL_UNNUMBERED,
56 .procname = "min-threads", 55 .procname = "min-threads",
57 .data = &slow_work_min_threads, 56 .data = &slow_work_min_threads,
58 .maxlen = sizeof(unsigned), 57 .maxlen = sizeof(unsigned),
59 .mode = 0644, 58 .mode = 0644,
60 .proc_handler = slow_work_min_threads_sysctl, 59 .proc_handler = slow_work_min_threads_sysctl,
61 .extra1 = (void *) &slow_work_min_min_threads, 60 .extra1 = (void *) &slow_work_min_min_threads,
62 .extra2 = &slow_work_max_threads, 61 .extra2 = &slow_work_max_threads,
63 }, 62 },
64 { 63 {
65 .ctl_name = CTL_UNNUMBERED,
66 .procname = "max-threads", 64 .procname = "max-threads",
67 .data = &slow_work_max_threads, 65 .data = &slow_work_max_threads,
68 .maxlen = sizeof(unsigned), 66 .maxlen = sizeof(unsigned),
69 .mode = 0644, 67 .mode = 0644,
70 .proc_handler = slow_work_max_threads_sysctl, 68 .proc_handler = slow_work_max_threads_sysctl,
71 .extra1 = &slow_work_min_threads, 69 .extra1 = &slow_work_min_threads,
72 .extra2 = (void *) &slow_work_max_max_threads, 70 .extra2 = (void *) &slow_work_max_max_threads,
73 }, 71 },
74 { 72 {
75 .ctl_name = CTL_UNNUMBERED,
76 .procname = "vslow-percentage", 73 .procname = "vslow-percentage",
77 .data = &vslow_work_proportion, 74 .data = &vslow_work_proportion,
78 .maxlen = sizeof(unsigned), 75 .maxlen = sizeof(unsigned),
79 .mode = 0644, 76 .mode = 0644,
80 .proc_handler = &proc_dointvec_minmax, 77 .proc_handler = &proc_dointvec_minmax,
81 .extra1 = (void *) &slow_work_min_vslow, 78 .extra1 = (void *) &slow_work_min_vslow,
82 .extra2 = (void *) &slow_work_max_vslow, 79 .extra2 = (void *) &slow_work_max_vslow,
83 }, 80 },
84 { .ctl_name = 0 } 81 {}
85 }; 82 };
86 #endif 83 #endif
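
As the removals above show, the .ctl_name member disappears from each entry and the table sentinel shrinks from { .ctl_name = 0 } to {}; the tunables themselves are reached through /proc/sys. A small user-space sketch, assuming the table is registered under kernel/slow-work (the path is an assumption of this example, not something the diff states), reading the current pool limits:

/* Read the slow-work pool limits via /proc/sys.  Paths assume the
 * table above is registered as kernel/slow-work. */
#include <stdio.h>

static long read_sysctl(const char *path)
{
        FILE *f = fopen(path, "r");
        long val = -1;

        if (!f)
                return -1;
        if (fscanf(f, "%ld", &val) != 1)
                val = -1;
        fclose(f);
        return val;
}

int main(void)
{
        printf("min-threads: %ld\n",
               read_sysctl("/proc/sys/kernel/slow-work/min-threads"));
        printf("max-threads: %ld\n",
               read_sysctl("/proc/sys/kernel/slow-work/max-threads"));
        printf("vslow-percentage: %ld\n",
               read_sysctl("/proc/sys/kernel/slow-work/vslow-percentage"));
        return 0;
}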
87 84
88 /* 85 /*
89 * The active state of the thread pool 86 * The active state of the thread pool
90 */ 87 */
91 static atomic_t slow_work_thread_count; 88 static atomic_t slow_work_thread_count;
92 static atomic_t vslow_work_executing_count; 89 static atomic_t vslow_work_executing_count;
93 90
94 static bool slow_work_may_not_start_new_thread; 91 static bool slow_work_may_not_start_new_thread;
95 static bool slow_work_cull; /* cull a thread due to lack of activity */ 92 static bool slow_work_cull; /* cull a thread due to lack of activity */
96 static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0); 93 static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
97 static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0); 94 static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
98 static struct slow_work slow_work_new_thread; /* new thread starter */ 95 static struct slow_work slow_work_new_thread; /* new thread starter */
99 96
100 /* 97 /*
101 * The queues of work items and the lock governing access to them. These are 98 * The queues of work items and the lock governing access to them. These are
102 * shared between all the CPUs. It doesn't make sense to have per-CPU queues 99 * shared between all the CPUs. It doesn't make sense to have per-CPU queues
103 * as the number of threads bears no relation to the number of CPUs. 100 * as the number of threads bears no relation to the number of CPUs.
104 * 101 *
105 * There are two queues of work items: one for slow work items, and one for 102 * There are two queues of work items: one for slow work items, and one for
106 * very slow work items. 103 * very slow work items.
107 */ 104 */
108 static LIST_HEAD(slow_work_queue); 105 static LIST_HEAD(slow_work_queue);
109 static LIST_HEAD(vslow_work_queue); 106 static LIST_HEAD(vslow_work_queue);
110 static DEFINE_SPINLOCK(slow_work_queue_lock); 107 static DEFINE_SPINLOCK(slow_work_queue_lock);
111 108
112 /* 109 /*
113 * The thread controls. A variable used to signal to the threads that they 110 * The thread controls. A variable used to signal to the threads that they
114 * should exit when the queue is empty, a waitqueue used by the threads to wait 111 * should exit when the queue is empty, a waitqueue used by the threads to wait
115 * for signals, and a completion set by the last thread to exit. 112 * for signals, and a completion set by the last thread to exit.
116 */ 113 */
117 static bool slow_work_threads_should_exit; 114 static bool slow_work_threads_should_exit;
118 static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq); 115 static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq);
119 static DECLARE_COMPLETION(slow_work_last_thread_exited); 116 static DECLARE_COMPLETION(slow_work_last_thread_exited);
120 117
121 /* 118 /*
122 * The number of users of the thread pool and its lock. Whilst this is zero we 119 * The number of users of the thread pool and its lock. Whilst this is zero we
123 * have no threads hanging around, and when this reaches zero, we wait for all 120 * have no threads hanging around, and when this reaches zero, we wait for all
124 * active or queued work items to complete and kill all the threads we do have. 121 * active or queued work items to complete and kill all the threads we do have.
125 */ 122 */
126 static int slow_work_user_count; 123 static int slow_work_user_count;
127 static DEFINE_MUTEX(slow_work_user_lock); 124 static DEFINE_MUTEX(slow_work_user_lock);
128 125
129 /* 126 /*
130 * Calculate the maximum number of active threads in the pool that are 127 * Calculate the maximum number of active threads in the pool that are
131 * permitted to process very slow work items. 128 * permitted to process very slow work items.
132 * 129 *
133 * The answer is rounded up to at least 1, but may not equal or exceed the 130 * The answer is rounded up to at least 1, but may not equal or exceed the
134 * maximum number of the threads in the pool. This means we always have at 131 * maximum number of the threads in the pool. This means we always have at
135 * least one thread that can process slow work items, and we always have at 132 * least one thread that can process slow work items, and we always have at
136 * least one thread that won't get tied up doing so. 133 * least one thread that won't get tied up doing so.
137 */ 134 */
138 static unsigned slow_work_calc_vsmax(void) 135 static unsigned slow_work_calc_vsmax(void)
139 { 136 {
140 unsigned vsmax; 137 unsigned vsmax;
141 138
142 vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion; 139 vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion;
143 vsmax /= 100; 140 vsmax /= 100;
144 vsmax = max(vsmax, 1U); 141 vsmax = max(vsmax, 1U);
145 return min(vsmax, slow_work_max_threads - 1); 142 return min(vsmax, slow_work_max_threads - 1);
146 } 143 }
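
The clamping described in the comment above is easy to verify by hand; the following stand-alone transcription of the same arithmetic (sample inputs chosen around the defaults of 4 maximum threads and 50%) prints 1, 2 and 3:

/* Mirror of slow_work_calc_vsmax(): the share of the pool allowed to
 * run very slow items, clamped to the range [1, max_threads - 1]. */
#include <stdio.h>

static unsigned calc_vsmax(unsigned thread_count, unsigned proportion,
                           unsigned max_threads)
{
        unsigned vsmax = thread_count * proportion / 100;

        if (vsmax < 1)
                vsmax = 1;
        if (vsmax > max_threads - 1)
                vsmax = max_threads - 1;
        return vsmax;
}

int main(void)
{
        /* 2 threads at 50% -> 1; 4 threads at 50% -> 2; a large pool
         * is still capped at max_threads - 1 = 3. */
        printf("%u %u %u\n",
               calc_vsmax(2, 50, 4),
               calc_vsmax(4, 50, 4),
               calc_vsmax(255, 50, 4));
        return 0;
}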
147 144
148 /* 145 /*
149 * Attempt to execute stuff queued on a slow thread. Return true if we managed 146 * Attempt to execute stuff queued on a slow thread. Return true if we managed
150 * it, false if there was nothing to do. 147 * it, false if there was nothing to do.
151 */ 148 */
152 static bool slow_work_execute(void) 149 static bool slow_work_execute(void)
153 { 150 {
154 struct slow_work *work = NULL; 151 struct slow_work *work = NULL;
155 unsigned vsmax; 152 unsigned vsmax;
156 bool very_slow; 153 bool very_slow;
157 154
158 vsmax = slow_work_calc_vsmax(); 155 vsmax = slow_work_calc_vsmax();
159 156
160 /* see if we can schedule a new thread to be started if we're not 157 /* see if we can schedule a new thread to be started if we're not
161 * keeping up with the work */ 158 * keeping up with the work */
162 if (!waitqueue_active(&slow_work_thread_wq) && 159 if (!waitqueue_active(&slow_work_thread_wq) &&
163 (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) && 160 (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) &&
164 atomic_read(&slow_work_thread_count) < slow_work_max_threads && 161 atomic_read(&slow_work_thread_count) < slow_work_max_threads &&
165 !slow_work_may_not_start_new_thread) 162 !slow_work_may_not_start_new_thread)
166 slow_work_enqueue(&slow_work_new_thread); 163 slow_work_enqueue(&slow_work_new_thread);
167 164
168 /* find something to execute */ 165 /* find something to execute */
169 spin_lock_irq(&slow_work_queue_lock); 166 spin_lock_irq(&slow_work_queue_lock);
170 if (!list_empty(&vslow_work_queue) && 167 if (!list_empty(&vslow_work_queue) &&
171 atomic_read(&vslow_work_executing_count) < vsmax) { 168 atomic_read(&vslow_work_executing_count) < vsmax) {
172 work = list_entry(vslow_work_queue.next, 169 work = list_entry(vslow_work_queue.next,
173 struct slow_work, link); 170 struct slow_work, link);
174 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) 171 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
175 BUG(); 172 BUG();
176 list_del_init(&work->link); 173 list_del_init(&work->link);
177 atomic_inc(&vslow_work_executing_count); 174 atomic_inc(&vslow_work_executing_count);
178 very_slow = true; 175 very_slow = true;
179 } else if (!list_empty(&slow_work_queue)) { 176 } else if (!list_empty(&slow_work_queue)) {
180 work = list_entry(slow_work_queue.next, 177 work = list_entry(slow_work_queue.next,
181 struct slow_work, link); 178 struct slow_work, link);
182 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) 179 if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
183 BUG(); 180 BUG();
184 list_del_init(&work->link); 181 list_del_init(&work->link);
185 very_slow = false; 182 very_slow = false;
186 } else { 183 } else {
187 very_slow = false; /* avoid the compiler warning */ 184 very_slow = false; /* avoid the compiler warning */
188 } 185 }
189 spin_unlock_irq(&slow_work_queue_lock); 186 spin_unlock_irq(&slow_work_queue_lock);
190 187
191 if (!work) 188 if (!work)
192 return false; 189 return false;
193 190
194 if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) 191 if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
195 BUG(); 192 BUG();
196 193
197 work->ops->execute(work); 194 work->ops->execute(work);
198 195
199 if (very_slow) 196 if (very_slow)
200 atomic_dec(&vslow_work_executing_count); 197 atomic_dec(&vslow_work_executing_count);
201 clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); 198 clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
202 199
203 /* if someone tried to enqueue the item whilst we were executing it, 200 /* if someone tried to enqueue the item whilst we were executing it,
204 * then it'll be left unenqueued to avoid multiple threads trying to 201 * then it'll be left unenqueued to avoid multiple threads trying to
205 * execute it simultaneously 202 * execute it simultaneously
206 * 203 *
207 * there is, however, a race between us testing the pending flag and 204 * there is, however, a race between us testing the pending flag and
208 * getting the spinlock, and between the enqueuer setting the pending 205 * getting the spinlock, and between the enqueuer setting the pending
209 * flag and getting the spinlock, so we use a deferral bit to tell us 206 * flag and getting the spinlock, so we use a deferral bit to tell us
210 * if the enqueuer got there first 207 * if the enqueuer got there first
211 */ 208 */
212 if (test_bit(SLOW_WORK_PENDING, &work->flags)) { 209 if (test_bit(SLOW_WORK_PENDING, &work->flags)) {
213 spin_lock_irq(&slow_work_queue_lock); 210 spin_lock_irq(&slow_work_queue_lock);
214 211
215 if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) && 212 if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) &&
216 test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) 213 test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags))
217 goto auto_requeue; 214 goto auto_requeue;
218 215
219 spin_unlock_irq(&slow_work_queue_lock); 216 spin_unlock_irq(&slow_work_queue_lock);
220 } 217 }
221 218
222 work->ops->put_ref(work); 219 work->ops->put_ref(work);
223 return true; 220 return true;
224 221
225 auto_requeue: 222 auto_requeue:
226 /* we must complete the enqueue operation 223 /* we must complete the enqueue operation
227 * - we transfer our ref on the item back to the appropriate queue 224 * - we transfer our ref on the item back to the appropriate queue
228 * - don't wake another thread up as we're awake already 225 * - don't wake another thread up as we're awake already
229 */ 226 */
230 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) 227 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
231 list_add_tail(&work->link, &vslow_work_queue); 228 list_add_tail(&work->link, &vslow_work_queue);
232 else 229 else
233 list_add_tail(&work->link, &slow_work_queue); 230 list_add_tail(&work->link, &slow_work_queue);
234 spin_unlock_irq(&slow_work_queue_lock); 231 spin_unlock_irq(&slow_work_queue_lock);
235 return true; 232 return true;
236 } 233 }
237 234
238 /** 235 /**
239 * slow_work_enqueue - Schedule a slow work item for processing 236 * slow_work_enqueue - Schedule a slow work item for processing
240 * @work: The work item to queue 237 * @work: The work item to queue
241 * 238 *
242 * Schedule a slow work item for processing. If the item is already undergoing 239 * Schedule a slow work item for processing. If the item is already undergoing
243 * execution, this guarantees not to re-enter the execution routine until the 240 * execution, this guarantees not to re-enter the execution routine until the
244 * first execution finishes. 241 * first execution finishes.
245 * 242 *
246 * The item is pinned by this function as it retains a reference to it, managed 243 * The item is pinned by this function as it retains a reference to it, managed
247 * through the item operations. The item is unpinned once it has been 244 * through the item operations. The item is unpinned once it has been
248 * executed. 245 * executed.
249 * 246 *
250 * An item may hog the thread that is running it for a relatively large amount 247 * An item may hog the thread that is running it for a relatively large amount
251 * of time, sufficient, for example, to perform several lookup, mkdir, create 248 * of time, sufficient, for example, to perform several lookup, mkdir, create
252 * and setxattr operations. It may sleep on I/O and may sleep to obtain locks. 249 * and setxattr operations. It may sleep on I/O and may sleep to obtain locks.
253 * 250 *
254 * Conversely, if a number of items are awaiting processing, it may take some 251 * Conversely, if a number of items are awaiting processing, it may take some
255 * time before any given item is given attention. The number of threads in the 252 * time before any given item is given attention. The number of threads in the
256 * pool may be increased to deal with demand, but only up to a limit. 253 * pool may be increased to deal with demand, but only up to a limit.
257 * 254 *
258 * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in 255 * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in
259 * the very slow queue, from which only a portion of the threads will be 256 * the very slow queue, from which only a portion of the threads will be
260 * allowed to pick items to execute. This ensures that very slow items won't 257 * allowed to pick items to execute. This ensures that very slow items won't
261 * overly block ones that are just ordinarily slow. 258 * overly block ones that are just ordinarily slow.
262 * 259 *
263 * Returns 0 if successful, -EAGAIN if not. 260 * Returns 0 if successful, -EAGAIN if not.
264 */ 261 */
265 int slow_work_enqueue(struct slow_work *work) 262 int slow_work_enqueue(struct slow_work *work)
266 { 263 {
267 unsigned long flags; 264 unsigned long flags;
268 265
269 BUG_ON(slow_work_user_count <= 0); 266 BUG_ON(slow_work_user_count <= 0);
270 BUG_ON(!work); 267 BUG_ON(!work);
271 BUG_ON(!work->ops); 268 BUG_ON(!work->ops);
272 BUG_ON(!work->ops->get_ref); 269 BUG_ON(!work->ops->get_ref);
273 270
274 /* when honouring an enqueue request, we only promise that we will run 271 /* when honouring an enqueue request, we only promise that we will run
275 * the work function in the future; we do not promise to run it once 272 * the work function in the future; we do not promise to run it once
276 * per enqueue request 273 * per enqueue request
277 * 274 *
278 * we use the PENDING bit to merge together repeat requests without 275 * we use the PENDING bit to merge together repeat requests without
279 * having to disable IRQs and take the spinlock, whilst still 276 * having to disable IRQs and take the spinlock, whilst still
280 * maintaining our promise 277 * maintaining our promise
281 */ 278 */
282 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { 279 if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
283 spin_lock_irqsave(&slow_work_queue_lock, flags); 280 spin_lock_irqsave(&slow_work_queue_lock, flags);
284 281
285 /* we promise that we will not attempt to execute the work 282 /* we promise that we will not attempt to execute the work
286 * function in more than one thread simultaneously 283 * function in more than one thread simultaneously
287 * 284 *
288 * this, however, leaves us with a problem if we're asked to 285 * this, however, leaves us with a problem if we're asked to
289 * enqueue the work whilst someone is executing the work 286 * enqueue the work whilst someone is executing the work
290 * function as simply queueing the work immediately means that 287 * function as simply queueing the work immediately means that
291 * another thread may try executing it whilst it is already 288 * another thread may try executing it whilst it is already
292 * under execution 289 * under execution
293 * 290 *
294 * to deal with this, we set the ENQ_DEFERRED bit instead of 291 * to deal with this, we set the ENQ_DEFERRED bit instead of
295 * enqueueing, and the thread currently executing the work 292 * enqueueing, and the thread currently executing the work
296 * function will enqueue the work item when the work function 293 * function will enqueue the work item when the work function
297 * returns and it has cleared the EXECUTING bit 294 * returns and it has cleared the EXECUTING bit
298 */ 295 */
299 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { 296 if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
300 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); 297 set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
301 } else { 298 } else {
302 if (work->ops->get_ref(work) < 0) 299 if (work->ops->get_ref(work) < 0)
303 goto cant_get_ref; 300 goto cant_get_ref;
304 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) 301 if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
305 list_add_tail(&work->link, &vslow_work_queue); 302 list_add_tail(&work->link, &vslow_work_queue);
306 else 303 else
307 list_add_tail(&work->link, &slow_work_queue); 304 list_add_tail(&work->link, &slow_work_queue);
308 wake_up(&slow_work_thread_wq); 305 wake_up(&slow_work_thread_wq);
309 } 306 }
310 307
311 spin_unlock_irqrestore(&slow_work_queue_lock, flags); 308 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
312 } 309 }
313 return 0; 310 return 0;
314 311
315 cant_get_ref: 312 cant_get_ref:
316 spin_unlock_irqrestore(&slow_work_queue_lock, flags); 313 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
317 return -EAGAIN; 314 return -EAGAIN;
318 } 315 }
319 EXPORT_SYMBOL(slow_work_enqueue); 316 EXPORT_SYMBOL(slow_work_enqueue);
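
Putting the item-side API together: a hypothetical user (my_item, my_ops and the helpers below are invented for illustration) supplies get_ref/put_ref/execute operations, initialises the item once, and enqueues it; repeat enqueues while the item is pending or executing are merged as the comments above describe. A sketch, assuming slow_work_register_user() has already been called:

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/slow-work.h>

/* Hypothetical slow-work user: one refcounted work item. */
struct my_item {
        struct slow_work        work;
        struct kref             ref;
};

static int my_get_ref(struct slow_work *work)
{
        kref_get(&container_of(work, struct my_item, work)->ref);
        return 0;                       /* 0 means the reference was obtained */
}

static void my_release(struct kref *ref)
{
        kfree(container_of(ref, struct my_item, ref));
}

static void my_put_ref(struct slow_work *work)
{
        kref_put(&container_of(work, struct my_item, work)->ref, my_release);
}

static void my_execute(struct slow_work *work)
{
        /* may sleep, do I/O and take mutexes, as the comments permit */
}

static const struct slow_work_ops my_ops = {
        .get_ref        = my_get_ref,
        .put_ref        = my_put_ref,
        .execute        = my_execute,
};

static struct my_item *my_item_alloc(void)
{
        struct my_item *item = kzalloc(sizeof(*item), GFP_KERNEL);

        if (item) {
                kref_init(&item->ref);
                slow_work_init(&item->work, &my_ops);
        }
        return item;
}

static int my_kick(struct my_item *item)
{
        return slow_work_enqueue(&item->work);  /* 0, or -EAGAIN if get_ref fails */
}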
320 317
321 /* 318 /*
322 * Schedule a cull of the thread pool at some time in the near future 319 * Schedule a cull of the thread pool at some time in the near future
323 */ 320 */
324 static void slow_work_schedule_cull(void) 321 static void slow_work_schedule_cull(void)
325 { 322 {
326 mod_timer(&slow_work_cull_timer, 323 mod_timer(&slow_work_cull_timer,
327 round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT)); 324 round_jiffies(jiffies + SLOW_WORK_CULL_TIMEOUT));
328 } 325 }
329 326
330 /* 327 /*
331 * Worker thread culling algorithm 328 * Worker thread culling algorithm
332 */ 329 */
333 static bool slow_work_cull_thread(void) 330 static bool slow_work_cull_thread(void)
334 { 331 {
335 unsigned long flags; 332 unsigned long flags;
336 bool do_cull = false; 333 bool do_cull = false;
337 334
338 spin_lock_irqsave(&slow_work_queue_lock, flags); 335 spin_lock_irqsave(&slow_work_queue_lock, flags);
339 336
340 if (slow_work_cull) { 337 if (slow_work_cull) {
341 slow_work_cull = false; 338 slow_work_cull = false;
342 339
343 if (list_empty(&slow_work_queue) && 340 if (list_empty(&slow_work_queue) &&
344 list_empty(&vslow_work_queue) && 341 list_empty(&vslow_work_queue) &&
345 atomic_read(&slow_work_thread_count) > 342 atomic_read(&slow_work_thread_count) >
346 slow_work_min_threads) { 343 slow_work_min_threads) {
347 slow_work_schedule_cull(); 344 slow_work_schedule_cull();
348 do_cull = true; 345 do_cull = true;
349 } 346 }
350 } 347 }
351 348
352 spin_unlock_irqrestore(&slow_work_queue_lock, flags); 349 spin_unlock_irqrestore(&slow_work_queue_lock, flags);
353 return do_cull; 350 return do_cull;
354 } 351 }
355 352
356 /* 353 /*
357 * Determine if there is slow work available for dispatch 354 * Determine if there is slow work available for dispatch
358 */ 355 */
359 static inline bool slow_work_available(int vsmax) 356 static inline bool slow_work_available(int vsmax)
360 { 357 {
361 return !list_empty(&slow_work_queue) || 358 return !list_empty(&slow_work_queue) ||
362 (!list_empty(&vslow_work_queue) && 359 (!list_empty(&vslow_work_queue) &&
363 atomic_read(&vslow_work_executing_count) < vsmax); 360 atomic_read(&vslow_work_executing_count) < vsmax);
364 } 361 }
365 362
366 /* 363 /*
367 * Worker thread dispatcher 364 * Worker thread dispatcher
368 */ 365 */
369 static int slow_work_thread(void *_data) 366 static int slow_work_thread(void *_data)
370 { 367 {
371 int vsmax; 368 int vsmax;
372 369
373 DEFINE_WAIT(wait); 370 DEFINE_WAIT(wait);
374 371
375 set_freezable(); 372 set_freezable();
376 set_user_nice(current, -5); 373 set_user_nice(current, -5);
377 374
378 for (;;) { 375 for (;;) {
379 vsmax = vslow_work_proportion; 376 vsmax = vslow_work_proportion;
380 vsmax *= atomic_read(&slow_work_thread_count); 377 vsmax *= atomic_read(&slow_work_thread_count);
381 vsmax /= 100; 378 vsmax /= 100;
382 379
383 prepare_to_wait_exclusive(&slow_work_thread_wq, &wait, 380 prepare_to_wait_exclusive(&slow_work_thread_wq, &wait,
384 TASK_INTERRUPTIBLE); 381 TASK_INTERRUPTIBLE);
385 if (!freezing(current) && 382 if (!freezing(current) &&
386 !slow_work_threads_should_exit && 383 !slow_work_threads_should_exit &&
387 !slow_work_available(vsmax) && 384 !slow_work_available(vsmax) &&
388 !slow_work_cull) 385 !slow_work_cull)
389 schedule(); 386 schedule();
390 finish_wait(&slow_work_thread_wq, &wait); 387 finish_wait(&slow_work_thread_wq, &wait);
391 388
392 try_to_freeze(); 389 try_to_freeze();
393 390
394 vsmax = vslow_work_proportion; 391 vsmax = vslow_work_proportion;
395 vsmax *= atomic_read(&slow_work_thread_count); 392 vsmax *= atomic_read(&slow_work_thread_count);
396 vsmax /= 100; 393 vsmax /= 100;
397 394
398 if (slow_work_available(vsmax) && slow_work_execute()) { 395 if (slow_work_available(vsmax) && slow_work_execute()) {
399 cond_resched(); 396 cond_resched();
400 if (list_empty(&slow_work_queue) && 397 if (list_empty(&slow_work_queue) &&
401 list_empty(&vslow_work_queue) && 398 list_empty(&vslow_work_queue) &&
402 atomic_read(&slow_work_thread_count) > 399 atomic_read(&slow_work_thread_count) >
403 slow_work_min_threads) 400 slow_work_min_threads)
404 slow_work_schedule_cull(); 401 slow_work_schedule_cull();
405 continue; 402 continue;
406 } 403 }
407 404
408 if (slow_work_threads_should_exit) 405 if (slow_work_threads_should_exit)
409 break; 406 break;
410 407
411 if (slow_work_cull && slow_work_cull_thread()) 408 if (slow_work_cull && slow_work_cull_thread())
412 break; 409 break;
413 } 410 }
414 411
415 if (atomic_dec_and_test(&slow_work_thread_count)) 412 if (atomic_dec_and_test(&slow_work_thread_count))
416 complete_and_exit(&slow_work_last_thread_exited, 0); 413 complete_and_exit(&slow_work_last_thread_exited, 0);
417 return 0; 414 return 0;
418 } 415 }
419 416
420 /* 417 /*
421 * Handle thread cull timer expiration 418 * Handle thread cull timer expiration
422 */ 419 */
423 static void slow_work_cull_timeout(unsigned long data) 420 static void slow_work_cull_timeout(unsigned long data)
424 { 421 {
425 slow_work_cull = true; 422 slow_work_cull = true;
426 wake_up(&slow_work_thread_wq); 423 wake_up(&slow_work_thread_wq);
427 } 424 }
428 425
429 /* 426 /*
430 * Get a reference on slow work thread starter 427 * Get a reference on slow work thread starter
431 */ 428 */
432 static int slow_work_new_thread_get_ref(struct slow_work *work) 429 static int slow_work_new_thread_get_ref(struct slow_work *work)
433 { 430 {
434 return 0; 431 return 0;
435 } 432 }
436 433
437 /* 434 /*
438 * Drop a reference on slow work thread starter 435 * Drop a reference on slow work thread starter
439 */ 436 */
440 static void slow_work_new_thread_put_ref(struct slow_work *work) 437 static void slow_work_new_thread_put_ref(struct slow_work *work)
441 { 438 {
442 } 439 }
443 440
444 /* 441 /*
445 * Start a new slow work thread 442 * Start a new slow work thread
446 */ 443 */
447 static void slow_work_new_thread_execute(struct slow_work *work) 444 static void slow_work_new_thread_execute(struct slow_work *work)
448 { 445 {
449 struct task_struct *p; 446 struct task_struct *p;
450 447
451 if (slow_work_threads_should_exit) 448 if (slow_work_threads_should_exit)
452 return; 449 return;
453 450
454 if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads) 451 if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads)
455 return; 452 return;
456 453
457 if (!mutex_trylock(&slow_work_user_lock)) 454 if (!mutex_trylock(&slow_work_user_lock))
458 return; 455 return;
459 456
460 slow_work_may_not_start_new_thread = true; 457 slow_work_may_not_start_new_thread = true;
461 atomic_inc(&slow_work_thread_count); 458 atomic_inc(&slow_work_thread_count);
462 p = kthread_run(slow_work_thread, NULL, "kslowd"); 459 p = kthread_run(slow_work_thread, NULL, "kslowd");
463 if (IS_ERR(p)) { 460 if (IS_ERR(p)) {
464 printk(KERN_DEBUG "Slow work thread pool: OOM\n"); 461 printk(KERN_DEBUG "Slow work thread pool: OOM\n");
465 if (atomic_dec_and_test(&slow_work_thread_count)) 462 if (atomic_dec_and_test(&slow_work_thread_count))
466 BUG(); /* we're running on a slow work thread... */ 463 BUG(); /* we're running on a slow work thread... */
467 mod_timer(&slow_work_oom_timer, 464 mod_timer(&slow_work_oom_timer,
468 round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT)); 465 round_jiffies(jiffies + SLOW_WORK_OOM_TIMEOUT));
469 } else { 466 } else {
470 /* ratelimit the starting of new threads */ 467 /* ratelimit the starting of new threads */
471 mod_timer(&slow_work_oom_timer, jiffies + 1); 468 mod_timer(&slow_work_oom_timer, jiffies + 1);
472 } 469 }
473 470
474 mutex_unlock(&slow_work_user_lock); 471 mutex_unlock(&slow_work_user_lock);
475 } 472 }
476 473
477 static const struct slow_work_ops slow_work_new_thread_ops = { 474 static const struct slow_work_ops slow_work_new_thread_ops = {
478 .get_ref = slow_work_new_thread_get_ref, 475 .get_ref = slow_work_new_thread_get_ref,
479 .put_ref = slow_work_new_thread_put_ref, 476 .put_ref = slow_work_new_thread_put_ref,
480 .execute = slow_work_new_thread_execute, 477 .execute = slow_work_new_thread_execute,
481 }; 478 };
482 479
483 /* 480 /*
484 * post-OOM new thread start suppression expiration 481 * post-OOM new thread start suppression expiration
485 */ 482 */
486 static void slow_work_oom_timeout(unsigned long data) 483 static void slow_work_oom_timeout(unsigned long data)
487 { 484 {
488 slow_work_may_not_start_new_thread = false; 485 slow_work_may_not_start_new_thread = false;
489 } 486 }
490 487
491 #ifdef CONFIG_SYSCTL 488 #ifdef CONFIG_SYSCTL
492 /* 489 /*
493 * Handle adjustment of the minimum number of threads 490 * Handle adjustment of the minimum number of threads
494 */ 491 */
495 static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, 492 static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
496 void __user *buffer, 493 void __user *buffer,
497 size_t *lenp, loff_t *ppos) 494 size_t *lenp, loff_t *ppos)
498 { 495 {
499 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 496 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
500 int n; 497 int n;
501 498
502 if (ret == 0) { 499 if (ret == 0) {
503 mutex_lock(&slow_work_user_lock); 500 mutex_lock(&slow_work_user_lock);
504 if (slow_work_user_count > 0) { 501 if (slow_work_user_count > 0) {
505 /* see if we need to start or stop threads */ 502 /* see if we need to start or stop threads */
506 n = atomic_read(&slow_work_thread_count) - 503 n = atomic_read(&slow_work_thread_count) -
507 slow_work_min_threads; 504 slow_work_min_threads;
508 505
509 if (n < 0 && !slow_work_may_not_start_new_thread) 506 if (n < 0 && !slow_work_may_not_start_new_thread)
510 slow_work_enqueue(&slow_work_new_thread); 507 slow_work_enqueue(&slow_work_new_thread);
511 else if (n > 0) 508 else if (n > 0)
512 slow_work_schedule_cull(); 509 slow_work_schedule_cull();
513 } 510 }
514 mutex_unlock(&slow_work_user_lock); 511 mutex_unlock(&slow_work_user_lock);
515 } 512 }
516 513
517 return ret; 514 return ret;
518 } 515 }
519 516
520 /* 517 /*
521 * Handle adjustment of the maximum number of threads 518 * Handle adjustment of the maximum number of threads
522 */ 519 */
523 static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, 520 static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
524 void __user *buffer, 521 void __user *buffer,
525 size_t *lenp, loff_t *ppos) 522 size_t *lenp, loff_t *ppos)
526 { 523 {
527 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 524 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
528 int n; 525 int n;
529 526
530 if (ret == 0) { 527 if (ret == 0) {
531 mutex_lock(&slow_work_user_lock); 528 mutex_lock(&slow_work_user_lock);
532 if (slow_work_user_count > 0) { 529 if (slow_work_user_count > 0) {
533 /* see if we need to stop threads */ 530 /* see if we need to stop threads */
534 n = slow_work_max_threads - 531 n = slow_work_max_threads -
535 atomic_read(&slow_work_thread_count); 532 atomic_read(&slow_work_thread_count);
536 533
537 if (n < 0) 534 if (n < 0)
538 slow_work_schedule_cull(); 535 slow_work_schedule_cull();
539 } 536 }
540 mutex_unlock(&slow_work_user_lock); 537 mutex_unlock(&slow_work_user_lock);
541 } 538 }
542 539
543 return ret; 540 return ret;
544 } 541 }
545 #endif /* CONFIG_SYSCTL */ 542 #endif /* CONFIG_SYSCTL */
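
Because both handlers funnel through proc_dointvec_minmax() and then start or cull threads as required, tuning the pool is a plain write to the corresponding /proc/sys file. A small user-space example (same registration-path assumption as above; the write needs sufficient privilege):

/* Raise the slow-work pool ceiling; the path and the need for
 * privilege are assumptions of this example. */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/kernel/slow-work/max-threads", "w");

        if (!f) {
                perror("max-threads");
                return 1;
        }
        fprintf(f, "8\n");
        if (fclose(f) != 0) {
                perror("max-threads");
                return 1;
        }
        return 0;
}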
546 543
547 /** 544 /**
548 * slow_work_register_user - Register a user of the facility 545 * slow_work_register_user - Register a user of the facility
549 * 546 *
550 * Register a user of the facility, starting up the initial threads if there 547 * Register a user of the facility, starting up the initial threads if there
551 * aren't any other users at this point. This will return 0 if successful, or 548 * aren't any other users at this point. This will return 0 if successful, or
552 * an error if not. 549 * an error if not.
553 */ 550 */
554 int slow_work_register_user(void) 551 int slow_work_register_user(void)
555 { 552 {
556 struct task_struct *p; 553 struct task_struct *p;
557 int loop; 554 int loop;
558 555
559 mutex_lock(&slow_work_user_lock); 556 mutex_lock(&slow_work_user_lock);
560 557
561 if (slow_work_user_count == 0) { 558 if (slow_work_user_count == 0) {
562 printk(KERN_NOTICE "Slow work thread pool: Starting up\n"); 559 printk(KERN_NOTICE "Slow work thread pool: Starting up\n");
563 init_completion(&slow_work_last_thread_exited); 560 init_completion(&slow_work_last_thread_exited);
564 561
565 slow_work_threads_should_exit = false; 562 slow_work_threads_should_exit = false;
566 slow_work_init(&slow_work_new_thread, 563 slow_work_init(&slow_work_new_thread,
567 &slow_work_new_thread_ops); 564 &slow_work_new_thread_ops);
568 slow_work_may_not_start_new_thread = false; 565 slow_work_may_not_start_new_thread = false;
569 slow_work_cull = false; 566 slow_work_cull = false;
570 567
571 /* start the minimum number of threads */ 568 /* start the minimum number of threads */
572 for (loop = 0; loop < slow_work_min_threads; loop++) { 569 for (loop = 0; loop < slow_work_min_threads; loop++) {
573 atomic_inc(&slow_work_thread_count); 570 atomic_inc(&slow_work_thread_count);
574 p = kthread_run(slow_work_thread, NULL, "kslowd"); 571 p = kthread_run(slow_work_thread, NULL, "kslowd");
575 if (IS_ERR(p)) 572 if (IS_ERR(p))
576 goto error; 573 goto error;
577 } 574 }
578 printk(KERN_NOTICE "Slow work thread pool: Ready\n"); 575 printk(KERN_NOTICE "Slow work thread pool: Ready\n");
579 } 576 }
580 577
581 slow_work_user_count++; 578 slow_work_user_count++;
582 mutex_unlock(&slow_work_user_lock); 579 mutex_unlock(&slow_work_user_lock);
583 return 0; 580 return 0;
584 581
585 error: 582 error:
586 if (atomic_dec_and_test(&slow_work_thread_count)) 583 if (atomic_dec_and_test(&slow_work_thread_count))
587 complete(&slow_work_last_thread_exited); 584 complete(&slow_work_last_thread_exited);
588 if (loop > 0) { 585 if (loop > 0) {
589 printk(KERN_ERR "Slow work thread pool:" 586 printk(KERN_ERR "Slow work thread pool:"
590 " Aborting startup on ENOMEM\n"); 587 " Aborting startup on ENOMEM\n");
591 slow_work_threads_should_exit = true; 588 slow_work_threads_should_exit = true;
592 wake_up_all(&slow_work_thread_wq); 589 wake_up_all(&slow_work_thread_wq);
593 wait_for_completion(&slow_work_last_thread_exited); 590 wait_for_completion(&slow_work_last_thread_exited);
594 printk(KERN_ERR "Slow work thread pool: Aborted\n"); 591 printk(KERN_ERR "Slow work thread pool: Aborted\n");
595 } 592 }
596 mutex_unlock(&slow_work_user_lock); 593 mutex_unlock(&slow_work_user_lock);
597 return PTR_ERR(p); 594 return PTR_ERR(p);
598 } 595 }
599 EXPORT_SYMBOL(slow_work_register_user); 596 EXPORT_SYMBOL(slow_work_register_user);
600 597
601 /** 598 /**
602 * slow_work_unregister_user - Unregister a user of the facility 599 * slow_work_unregister_user - Unregister a user of the facility
603 * 600 *
604 * Unregister a user of the facility, killing all the threads if this was the 601 * Unregister a user of the facility, killing all the threads if this was the
605 * last one. 602 * last one.
606 */ 603 */
607 void slow_work_unregister_user(void) 604 void slow_work_unregister_user(void)
608 { 605 {
609 mutex_lock(&slow_work_user_lock); 606 mutex_lock(&slow_work_user_lock);
610 607
611 BUG_ON(slow_work_user_count <= 0); 608 BUG_ON(slow_work_user_count <= 0);
612 609
613 slow_work_user_count--; 610 slow_work_user_count--;
614 if (slow_work_user_count == 0) { 611 if (slow_work_user_count == 0) {
615 printk(KERN_NOTICE "Slow work thread pool: Shutting down\n"); 612 printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
616 slow_work_threads_should_exit = true; 613 slow_work_threads_should_exit = true;
617 del_timer_sync(&slow_work_cull_timer); 614 del_timer_sync(&slow_work_cull_timer);
618 del_timer_sync(&slow_work_oom_timer); 615 del_timer_sync(&slow_work_oom_timer);
619 wake_up_all(&slow_work_thread_wq); 616 wake_up_all(&slow_work_thread_wq);
620 wait_for_completion(&slow_work_last_thread_exited); 617 wait_for_completion(&slow_work_last_thread_exited);
621 printk(KERN_NOTICE "Slow work thread pool:" 618 printk(KERN_NOTICE "Slow work thread pool:"
622 " Shut down complete\n"); 619 " Shut down complete\n");
623 } 620 }
624 621
625 mutex_unlock(&slow_work_user_lock); 622 mutex_unlock(&slow_work_user_lock);
626 } 623 }
627 EXPORT_SYMBOL(slow_work_unregister_user); 624 EXPORT_SYMBOL(slow_work_unregister_user);
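
A client of the facility normally brackets its use with the two calls above; a hypothetical module skeleton (names invented) might look like:

#include <linux/module.h>
#include <linux/slow-work.h>

/* Hypothetical client module bracketing its use of the facility. */
static int __init my_client_init(void)
{
        return slow_work_register_user();
}

static void __exit my_client_exit(void)
{
        /* all of this module's items must have finished executing */
        slow_work_unregister_user();
}

module_init(my_client_init);
module_exit(my_client_exit);
MODULE_LICENSE("GPL");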
628 625
629 /* 626 /*
630 * Initialise the slow work facility 627 * Initialise the slow work facility
631 */ 628 */
632 static int __init init_slow_work(void) 629 static int __init init_slow_work(void)
633 { 630 {
634 unsigned nr_cpus = num_possible_cpus(); 631 unsigned nr_cpus = num_possible_cpus();
635 632
636 if (slow_work_max_threads < nr_cpus) 633 if (slow_work_max_threads < nr_cpus)
637 slow_work_max_threads = nr_cpus; 634 slow_work_max_threads = nr_cpus;
638 #ifdef CONFIG_SYSCTL 635 #ifdef CONFIG_SYSCTL
639 if (slow_work_max_max_threads < nr_cpus * 2) 636 if (slow_work_max_max_threads < nr_cpus * 2)
640 slow_work_max_max_threads = nr_cpus * 2; 637 slow_work_max_max_threads = nr_cpus * 2;
641 #endif 638 #endif
642 return 0; 639 return 0;
643 } 640 }
644 641
645 subsys_initcall(init_slow_work); 642 subsys_initcall(init_slow_work);
646 643
kernel/utsname_sysctl.c
1 /* 1 /*
2 * Copyright (C) 2007 2 * Copyright (C) 2007
3 * 3 *
4 * Author: Eric Biederman <ebiederm@xmision.com> 4 * Author: Eric Biederman <ebiederm@xmision.com>
5 * 5 *
6 * This program is free software; you can redistribute it and/or 6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as 7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation, version 2 of the 8 * published by the Free Software Foundation, version 2 of the
9 * License. 9 * License.
10 */ 10 */
11 11
12 #include <linux/module.h> 12 #include <linux/module.h>
13 #include <linux/uts.h> 13 #include <linux/uts.h>
14 #include <linux/utsname.h> 14 #include <linux/utsname.h>
15 #include <linux/sysctl.h> 15 #include <linux/sysctl.h>
16 16
17 static void *get_uts(ctl_table *table, int write) 17 static void *get_uts(ctl_table *table, int write)
18 { 18 {
19 char *which = table->data; 19 char *which = table->data;
20 struct uts_namespace *uts_ns; 20 struct uts_namespace *uts_ns;
21 21
22 uts_ns = current->nsproxy->uts_ns; 22 uts_ns = current->nsproxy->uts_ns;
23 which = (which - (char *)&init_uts_ns) + (char *)uts_ns; 23 which = (which - (char *)&init_uts_ns) + (char *)uts_ns;
24 24
25 if (!write) 25 if (!write)
26 down_read(&uts_sem); 26 down_read(&uts_sem);
27 else 27 else
28 down_write(&uts_sem); 28 down_write(&uts_sem);
29 return which; 29 return which;
30 } 30 }
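
get_uts() relies on a small pointer trick: table->data points at a field inside init_uts_ns, subtracting the base of init_uts_ns yields that field's offset, and adding the offset to the current task's uts namespace gives the per-namespace copy. A stand-alone user-space illustration of the same rebasing idiom (struct name_block is invented for the demonstration):

/* Rebase a pointer to a field of one struct instance onto another
 * instance of the same type, as get_uts() does with init_uts_ns. */
#include <stdio.h>

struct name_block {
        char sysname[16];
        char nodename[16];
};

int main(void)
{
        struct name_block init_ns = { "Linux", "default" };
        struct name_block my_ns   = { "Linux", "container-42" };

        char *which = init_ns.nodename;         /* like table->data */

        /* offset within init_ns, re-applied to my_ns */
        which = (which - (char *)&init_ns) + (char *)&my_ns;

        printf("%s\n", which);                  /* prints "container-42" */
        return 0;
}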
31 31
32 static void put_uts(ctl_table *table, int write, void *which) 32 static void put_uts(ctl_table *table, int write, void *which)
33 { 33 {
34 if (!write) 34 if (!write)
35 up_read(&uts_sem); 35 up_read(&uts_sem);
36 else 36 else
37 up_write(&uts_sem); 37 up_write(&uts_sem);
38 } 38 }
39 39
40 #ifdef CONFIG_PROC_SYSCTL 40 #ifdef CONFIG_PROC_SYSCTL
41 /* 41 /*
42 * Special case of dostring for the UTS structure. This has locks 42 * Special case of dostring for the UTS structure. This has locks
43 * to observe. Should this be in kernel/sys.c ???? 43 * to observe. Should this be in kernel/sys.c ????
44 */ 44 */
45 static int proc_do_uts_string(ctl_table *table, int write, 45 static int proc_do_uts_string(ctl_table *table, int write,
46 void __user *buffer, size_t *lenp, loff_t *ppos) 46 void __user *buffer, size_t *lenp, loff_t *ppos)
47 { 47 {
48 struct ctl_table uts_table; 48 struct ctl_table uts_table;
49 int r; 49 int r;
50 memcpy(&uts_table, table, sizeof(uts_table)); 50 memcpy(&uts_table, table, sizeof(uts_table));
51 uts_table.data = get_uts(table, write); 51 uts_table.data = get_uts(table, write);
52 r = proc_dostring(&uts_table,write,buffer,lenp, ppos); 52 r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
53 put_uts(table, write, uts_table.data); 53 put_uts(table, write, uts_table.data);
54 return r; 54 return r;
55 } 55 }
56 #else 56 #else
57 #define proc_do_uts_string NULL 57 #define proc_do_uts_string NULL
58 #endif 58 #endif
59 59
60
61 #ifdef CONFIG_SYSCTL_SYSCALL
62 /* The generic string strategy routine: */
63 static int sysctl_uts_string(ctl_table *table,
64 void __user *oldval, size_t __user *oldlenp,
65 void __user *newval, size_t newlen)
66 {
67 struct ctl_table uts_table;
68 int r, write;
69 write = newval && newlen;
70 memcpy(&uts_table, table, sizeof(uts_table));
71 uts_table.data = get_uts(table, write);
72 r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen);
73 put_uts(table, write, uts_table.data);
74 return r;
75 }
76 #else
77 #define sysctl_uts_string NULL
78 #endif
79
80 static struct ctl_table uts_kern_table[] = { 60 static struct ctl_table uts_kern_table[] = {
81 { 61 {
82 .ctl_name = KERN_OSTYPE,
83 .procname = "ostype", 62 .procname = "ostype",
84 .data = init_uts_ns.name.sysname, 63 .data = init_uts_ns.name.sysname,
85 .maxlen = sizeof(init_uts_ns.name.sysname), 64 .maxlen = sizeof(init_uts_ns.name.sysname),
86 .mode = 0444, 65 .mode = 0444,
87 .proc_handler = proc_do_uts_string, 66 .proc_handler = proc_do_uts_string,
88 .strategy = sysctl_uts_string,
89 }, 67 },
90 { 68 {
91 .ctl_name = KERN_OSRELEASE,
92 .procname = "osrelease", 69 .procname = "osrelease",
93 .data = init_uts_ns.name.release, 70 .data = init_uts_ns.name.release,
94 .maxlen = sizeof(init_uts_ns.name.release), 71 .maxlen = sizeof(init_uts_ns.name.release),
95 .mode = 0444, 72 .mode = 0444,
96 .proc_handler = proc_do_uts_string, 73 .proc_handler = proc_do_uts_string,
97 .strategy = sysctl_uts_string,
98 }, 74 },
99 { 75 {
100 .ctl_name = KERN_VERSION,
101 .procname = "version", 76 .procname = "version",
102 .data = init_uts_ns.name.version, 77 .data = init_uts_ns.name.version,
103 .maxlen = sizeof(init_uts_ns.name.version), 78 .maxlen = sizeof(init_uts_ns.name.version),
104 .mode = 0444, 79 .mode = 0444,
105 .proc_handler = proc_do_uts_string, 80 .proc_handler = proc_do_uts_string,
106 .strategy = sysctl_uts_string,
107 }, 81 },
108 { 82 {
109 .ctl_name = KERN_NODENAME,
110 .procname = "hostname", 83 .procname = "hostname",
111 .data = init_uts_ns.name.nodename, 84 .data = init_uts_ns.name.nodename,
112 .maxlen = sizeof(init_uts_ns.name.nodename), 85 .maxlen = sizeof(init_uts_ns.name.nodename),
113 .mode = 0644, 86 .mode = 0644,
114 .proc_handler = proc_do_uts_string, 87 .proc_handler = proc_do_uts_string,
115 .strategy = sysctl_uts_string,
116 }, 88 },
117 { 89 {
118 .ctl_name = KERN_DOMAINNAME,
119 .procname = "domainname", 90 .procname = "domainname",
120 .data = init_uts_ns.name.domainname, 91 .data = init_uts_ns.name.domainname,
121 .maxlen = sizeof(init_uts_ns.name.domainname), 92 .maxlen = sizeof(init_uts_ns.name.domainname),
122 .mode = 0644, 93 .mode = 0644,
123 .proc_handler = proc_do_uts_string, 94 .proc_handler = proc_do_uts_string,
124 .strategy = sysctl_uts_string,
125 }, 95 },
126 {} 96 {}
127 }; 97 };
128 98
129 static struct ctl_table uts_root_table[] = { 99 static struct ctl_table uts_root_table[] = {
130 { 100 {
131 .ctl_name = CTL_KERN,
132 .procname = "kernel", 101 .procname = "kernel",
133 .mode = 0555, 102 .mode = 0555,
134 .child = uts_kern_table, 103 .child = uts_kern_table,
135 }, 104 },
136 {} 105 {}
137 }; 106 };
138 107
139 static int __init utsname_sysctl_init(void) 108 static int __init utsname_sysctl_init(void)
140 { 109 {
141 register_sysctl_table(uts_root_table); 110 register_sysctl_table(uts_root_table);
142 return 0; 111 return 0;
143 } 112 }
144 113
145 __initcall(utsname_sysctl_init); 114 __initcall(utsname_sysctl_init);
146 115
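
With .ctl_name and .strategy gone from the tables above, the UTS strings are reached by path alone; a small runnable check that reads each entry registered under kernel/:

/* Read the UTS strings exposed by uts_kern_table under /proc/sys/kernel/. */
#include <stdio.h>

static void show(const char *name)
{
        char path[64], buf[128];
        FILE *f;

        snprintf(path, sizeof(path), "/proc/sys/kernel/%s", name);
        f = fopen(path, "r");
        if (!f)
                return;
        if (fgets(buf, sizeof(buf), f))
                printf("%s: %s", name, buf);
        fclose(f);
}

int main(void)
{
        show("ostype");
        show("osrelease");
        show("version");
        show("hostname");
        show("domainname");
        return 0;
}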