Commit 4301065920b0cbde3986519582347e883b166f3e

Authored by Peter Williams
Committed by Ingo Molnar
1 parent f1a438d813

sched: simplify move_tasks()

The move_tasks() function currently multiplexes two distinct
capabilities:

1. attempt to move a specified amount of weighted load from one run
queue to another; and
2. attempt to move a specified number of tasks from one run queue to
another.

The first of these capabilities is used in two places, load_balance()
and load_balance_newidle(), and in both cases the return value of
move_tasks() is used purely to decide whether any tasks/load were moved;
the actual number of tasks moved is not taken into account.

The second capability is used in exactly one place,
active_load_balance(), to attempt to move exactly one task; as before,
the return value is used only as an indicator of success or failure.
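
For reference, the shape of the old interface described above can be
sketched as follows.  This is a standalone illustration written for this
changelog, not the pre-patch source; the parameter names and types are
assumptions.

struct rq;
struct sched_domain;

/*
 * Old, multiplexed form (assumed signature): one function takes both a
 * task-count cap and a weighted-load cap, and its "number of tasks moved"
 * return value is only ever tested against zero by its callers.
 */
int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
               unsigned long max_nr_move, unsigned long max_load_move,
               struct sched_domain *sd, int idle, int *all_pinned);

/*
 * load_balance()/load_balance_newidle(): pass the load target and an
 * effectively unlimited task count, then only check for a non-zero result.
 * active_load_balance(): pass max_nr_move == 1 with an unlimited load
 * target, again treating the result as a simple success/failure flag.
 */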

This multiplexing of move_tasks() was introduced by me as part of the
smpnice patches and was motivated by the fact that the alternative, one
function to move a specified load and one to move a single task, would
have led to two functions of roughly the same complexity as the old
move_tasks() (or the new balance_tasks()).  However, the modular design
of the new CFS scheduler allows a simpler solution, which this patch
adopts by:

1. adding a new function, move_one_task(), to be used by
active_load_balance(); and
2. making move_tasks() a single-purpose function that tries to move a
specified weighted load and returns 1 for success and 0 for failure
(see the sketch below).
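
The split might look roughly like the sketch below.  It is hand-written
from the description in this changelog rather than copied from the patch;
in particular the parameter lists, the sched_class_highest/->next
iteration and the ULONG_MAX "no limit" sentinel are assumptions.

#include <limits.h>

struct rq;
struct sched_domain;
enum cpu_idle_type { CPU_IDLE, CPU_NOT_IDLE, CPU_NEWLY_IDLE, CPU_MAX_IDLE_TYPES };

struct sched_class {
        struct sched_class *next;
        unsigned long (*load_balance)(struct rq *this_rq, int this_cpu,
                                      struct rq *busiest,
                                      unsigned long max_nr_move,
                                      unsigned long max_load_move,
                                      struct sched_domain *sd,
                                      enum cpu_idle_type idle,
                                      int *all_pinned);
};

extern struct sched_class *sched_class_highest;

/* Try to move up to max_load_move weighted load; 1 on success, 0 on failure. */
int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
               unsigned long max_load_move, struct sched_domain *sd,
               enum cpu_idle_type idle, int *all_pinned)
{
        unsigned long total_load_moved = 0;
        struct sched_class *class = sched_class_highest;

        do {
                total_load_moved +=
                        class->load_balance(this_rq, this_cpu, busiest,
                                            ULONG_MAX,
                                            max_load_move - total_load_moved,
                                            sd, idle, all_pinned);
                class = class->next;
        } while (class && max_load_move > total_load_moved);

        return total_load_moved > 0;
}

/* Try to move exactly one task, for active_load_balance(). */
int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
                  struct sched_domain *sd, enum cpu_idle_type idle)
{
        struct sched_class *class;

        for (class = sched_class_highest; class; class = class->next)
                if (class->load_balance(this_rq, this_cpu, busiest,
                                        1, ULONG_MAX, sd, idle, NULL) > 0)
                        return 1;

        return 0;
}

Because each class reports back how much weighted load it actually moved
(next paragraph), move_tasks() can simply accumulate the per-class results
until the requested amount has been reached.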

One consequence of these changes is that neither move_one_task() nor
the new move_tasks() cares how many tasks sched_class.load_balance()
moves, and this enables its interface to be simplified by returning the
amount of load moved as its result and removing the load_moved pointer
from the argument list.  This helps simplify the new move_tasks() and
slightly reduces the amount of work done in each of
sched_class.load_balance()'s implementations.
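
Expressed as function-pointer types, the interface change amounts to
something like the following.  Again, this is a reconstruction from the
text above, not the actual header diff; the typedef names are purely for
illustration and the exact argument list is an assumption.

struct rq;
struct sched_domain;

/* Before: the amount of weighted load moved came back via an out-parameter. */
typedef int (*load_balance_old_t)(struct rq *this_rq, int this_cpu,
                                  struct rq *busiest,
                                  unsigned long max_nr_move,
                                  unsigned long max_load_move,
                                  struct sched_domain *sd, int idle,
                                  int *all_pinned,
                                  unsigned long *load_moved);

/* After: the moved weighted load is simply the return value. */
typedef unsigned long (*load_balance_new_t)(struct rq *this_rq, int this_cpu,
                                            struct rq *busiest,
                                            unsigned long max_nr_move,
                                            unsigned long max_load_move,
                                            struct sched_domain *sd, int idle,
                                            int *all_pinned);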

Further simplifications, e.g. changes to balance_tasks(), are possible
but are (slightly) complicated by the special needs of
load_balance_fair(), so I've left them to a later patch (if this one
gets accepted).

NB: Since move_tasks() gets called with two run queue locks held, even
small reductions in overhead are worthwhile.

[ mingo@elte.hu ]

this change also reduces code size nicely:

   text    data     bss     dec     hex filename
   39216    3618      24   42858    a76a sched.o.before
   39173    3618      24   42815    a73f sched.o.after

Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Showing 5 changed files with 58 additions and 49 deletions

include/linux/sched.h
1 #ifndef _LINUX_SCHED_H 1 #ifndef _LINUX_SCHED_H
2 #define _LINUX_SCHED_H 2 #define _LINUX_SCHED_H
3 3
4 #include <linux/auxvec.h> /* For AT_VECTOR_SIZE */ 4 #include <linux/auxvec.h> /* For AT_VECTOR_SIZE */
5 5
6 /* 6 /*
7 * cloning flags: 7 * cloning flags:
8 */ 8 */
9 #define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ 9 #define CSIGNAL 0x000000ff /* signal mask to be sent at exit */
10 #define CLONE_VM 0x00000100 /* set if VM shared between processes */ 10 #define CLONE_VM 0x00000100 /* set if VM shared between processes */
11 #define CLONE_FS 0x00000200 /* set if fs info shared between processes */ 11 #define CLONE_FS 0x00000200 /* set if fs info shared between processes */
12 #define CLONE_FILES 0x00000400 /* set if open files shared between processes */ 12 #define CLONE_FILES 0x00000400 /* set if open files shared between processes */
13 #define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ 13 #define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
14 #define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ 14 #define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
15 #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ 15 #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
16 #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ 16 #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
17 #define CLONE_THREAD 0x00010000 /* Same thread group? */ 17 #define CLONE_THREAD 0x00010000 /* Same thread group? */
18 #define CLONE_NEWNS 0x00020000 /* New namespace group? */ 18 #define CLONE_NEWNS 0x00020000 /* New namespace group? */
19 #define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ 19 #define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */
20 #define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ 20 #define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */
21 #define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ 21 #define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */
22 #define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */ 22 #define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */
23 #define CLONE_DETACHED 0x00400000 /* Unused, ignored */ 23 #define CLONE_DETACHED 0x00400000 /* Unused, ignored */
24 #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ 24 #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
25 #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ 25 #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
26 #define CLONE_STOPPED 0x02000000 /* Start in stopped state */ 26 #define CLONE_STOPPED 0x02000000 /* Start in stopped state */
27 #define CLONE_NEWUTS 0x04000000 /* New utsname group? */ 27 #define CLONE_NEWUTS 0x04000000 /* New utsname group? */
28 #define CLONE_NEWIPC 0x08000000 /* New ipcs */ 28 #define CLONE_NEWIPC 0x08000000 /* New ipcs */
29 #define CLONE_NEWUSER 0x10000000 /* New user namespace */ 29 #define CLONE_NEWUSER 0x10000000 /* New user namespace */
30 30
31 /* 31 /*
32 * Scheduling policies 32 * Scheduling policies
33 */ 33 */
34 #define SCHED_NORMAL 0 34 #define SCHED_NORMAL 0
35 #define SCHED_FIFO 1 35 #define SCHED_FIFO 1
36 #define SCHED_RR 2 36 #define SCHED_RR 2
37 #define SCHED_BATCH 3 37 #define SCHED_BATCH 3
38 /* SCHED_ISO: reserved but not implemented yet */ 38 /* SCHED_ISO: reserved but not implemented yet */
39 #define SCHED_IDLE 5 39 #define SCHED_IDLE 5
40 40
41 #ifdef __KERNEL__ 41 #ifdef __KERNEL__
42 42
43 struct sched_param { 43 struct sched_param {
44 int sched_priority; 44 int sched_priority;
45 }; 45 };
46 46
47 #include <asm/param.h> /* for HZ */ 47 #include <asm/param.h> /* for HZ */
48 48
49 #include <linux/capability.h> 49 #include <linux/capability.h>
50 #include <linux/threads.h> 50 #include <linux/threads.h>
51 #include <linux/kernel.h> 51 #include <linux/kernel.h>
52 #include <linux/types.h> 52 #include <linux/types.h>
53 #include <linux/timex.h> 53 #include <linux/timex.h>
54 #include <linux/jiffies.h> 54 #include <linux/jiffies.h>
55 #include <linux/rbtree.h> 55 #include <linux/rbtree.h>
56 #include <linux/thread_info.h> 56 #include <linux/thread_info.h>
57 #include <linux/cpumask.h> 57 #include <linux/cpumask.h>
58 #include <linux/errno.h> 58 #include <linux/errno.h>
59 #include <linux/nodemask.h> 59 #include <linux/nodemask.h>
60 60
61 #include <asm/system.h> 61 #include <asm/system.h>
62 #include <asm/semaphore.h> 62 #include <asm/semaphore.h>
63 #include <asm/page.h> 63 #include <asm/page.h>
64 #include <asm/ptrace.h> 64 #include <asm/ptrace.h>
65 #include <asm/mmu.h> 65 #include <asm/mmu.h>
66 #include <asm/cputime.h> 66 #include <asm/cputime.h>
67 67
68 #include <linux/smp.h> 68 #include <linux/smp.h>
69 #include <linux/sem.h> 69 #include <linux/sem.h>
70 #include <linux/signal.h> 70 #include <linux/signal.h>
71 #include <linux/securebits.h> 71 #include <linux/securebits.h>
72 #include <linux/fs_struct.h> 72 #include <linux/fs_struct.h>
73 #include <linux/compiler.h> 73 #include <linux/compiler.h>
74 #include <linux/completion.h> 74 #include <linux/completion.h>
75 #include <linux/pid.h> 75 #include <linux/pid.h>
76 #include <linux/percpu.h> 76 #include <linux/percpu.h>
77 #include <linux/topology.h> 77 #include <linux/topology.h>
78 #include <linux/seccomp.h> 78 #include <linux/seccomp.h>
79 #include <linux/rcupdate.h> 79 #include <linux/rcupdate.h>
80 #include <linux/futex.h> 80 #include <linux/futex.h>
81 #include <linux/rtmutex.h> 81 #include <linux/rtmutex.h>
82 82
83 #include <linux/time.h> 83 #include <linux/time.h>
84 #include <linux/param.h> 84 #include <linux/param.h>
85 #include <linux/resource.h> 85 #include <linux/resource.h>
86 #include <linux/timer.h> 86 #include <linux/timer.h>
87 #include <linux/hrtimer.h> 87 #include <linux/hrtimer.h>
88 #include <linux/task_io_accounting.h> 88 #include <linux/task_io_accounting.h>
89 89
90 #include <asm/processor.h> 90 #include <asm/processor.h>
91 91
92 struct exec_domain; 92 struct exec_domain;
93 struct futex_pi_state; 93 struct futex_pi_state;
94 struct bio; 94 struct bio;
95 95
96 /* 96 /*
97 * List of flags we want to share for kernel threads, 97 * List of flags we want to share for kernel threads,
98 * if only because they are not used by them anyway. 98 * if only because they are not used by them anyway.
99 */ 99 */
100 #define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND) 100 #define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND)
101 101
102 /* 102 /*
103 * These are the constant used to fake the fixed-point load-average 103 * These are the constant used to fake the fixed-point load-average
104 * counting. Some notes: 104 * counting. Some notes:
105 * - 11 bit fractions expand to 22 bits by the multiplies: this gives 105 * - 11 bit fractions expand to 22 bits by the multiplies: this gives
106 * a load-average precision of 10 bits integer + 11 bits fractional 106 * a load-average precision of 10 bits integer + 11 bits fractional
107 * - if you want to count load-averages more often, you need more 107 * - if you want to count load-averages more often, you need more
108 * precision, or rounding will get you. With 2-second counting freq, 108 * precision, or rounding will get you. With 2-second counting freq,
109 * the EXP_n values would be 1981, 2034 and 2043 if still using only 109 * the EXP_n values would be 1981, 2034 and 2043 if still using only
110 * 11 bit fractions. 110 * 11 bit fractions.
111 */ 111 */
112 extern unsigned long avenrun[]; /* Load averages */ 112 extern unsigned long avenrun[]; /* Load averages */
113 113
114 #define FSHIFT 11 /* nr of bits of precision */ 114 #define FSHIFT 11 /* nr of bits of precision */
115 #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ 115 #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
116 #define LOAD_FREQ (5*HZ) /* 5 sec intervals */ 116 #define LOAD_FREQ (5*HZ) /* 5 sec intervals */
117 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */ 117 #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
118 #define EXP_5 2014 /* 1/exp(5sec/5min) */ 118 #define EXP_5 2014 /* 1/exp(5sec/5min) */
119 #define EXP_15 2037 /* 1/exp(5sec/15min) */ 119 #define EXP_15 2037 /* 1/exp(5sec/15min) */
120 120
121 #define CALC_LOAD(load,exp,n) \ 121 #define CALC_LOAD(load,exp,n) \
122 load *= exp; \ 122 load *= exp; \
123 load += n*(FIXED_1-exp); \ 123 load += n*(FIXED_1-exp); \
124 load >>= FSHIFT; 124 load >>= FSHIFT;
125 125
126 extern unsigned long total_forks; 126 extern unsigned long total_forks;
127 extern int nr_threads; 127 extern int nr_threads;
128 DECLARE_PER_CPU(unsigned long, process_counts); 128 DECLARE_PER_CPU(unsigned long, process_counts);
129 extern int nr_processes(void); 129 extern int nr_processes(void);
130 extern unsigned long nr_running(void); 130 extern unsigned long nr_running(void);
131 extern unsigned long nr_uninterruptible(void); 131 extern unsigned long nr_uninterruptible(void);
132 extern unsigned long nr_active(void); 132 extern unsigned long nr_active(void);
133 extern unsigned long nr_iowait(void); 133 extern unsigned long nr_iowait(void);
134 extern unsigned long weighted_cpuload(const int cpu); 134 extern unsigned long weighted_cpuload(const int cpu);
135 135
136 struct seq_file; 136 struct seq_file;
137 struct cfs_rq; 137 struct cfs_rq;
138 #ifdef CONFIG_SCHED_DEBUG 138 #ifdef CONFIG_SCHED_DEBUG
139 extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); 139 extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
140 extern void proc_sched_set_task(struct task_struct *p); 140 extern void proc_sched_set_task(struct task_struct *p);
141 extern void 141 extern void
142 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now); 142 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now);
143 #else 143 #else
144 static inline void 144 static inline void
145 proc_sched_show_task(struct task_struct *p, struct seq_file *m) 145 proc_sched_show_task(struct task_struct *p, struct seq_file *m)
146 { 146 {
147 } 147 }
148 static inline void proc_sched_set_task(struct task_struct *p) 148 static inline void proc_sched_set_task(struct task_struct *p)
149 { 149 {
150 } 150 }
151 static inline void 151 static inline void
152 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now) 152 print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now)
153 { 153 {
154 } 154 }
155 #endif 155 #endif
156 156
157 /* 157 /*
158 * Task state bitmask. NOTE! These bits are also 158 * Task state bitmask. NOTE! These bits are also
159 * encoded in fs/proc/array.c: get_task_state(). 159 * encoded in fs/proc/array.c: get_task_state().
160 * 160 *
161 * We have two separate sets of flags: task->state 161 * We have two separate sets of flags: task->state
162 * is about runnability, while task->exit_state are 162 * is about runnability, while task->exit_state are
163 * about the task exiting. Confusing, but this way 163 * about the task exiting. Confusing, but this way
164 * modifying one set can't modify the other one by 164 * modifying one set can't modify the other one by
165 * mistake. 165 * mistake.
166 */ 166 */
167 #define TASK_RUNNING 0 167 #define TASK_RUNNING 0
168 #define TASK_INTERRUPTIBLE 1 168 #define TASK_INTERRUPTIBLE 1
169 #define TASK_UNINTERRUPTIBLE 2 169 #define TASK_UNINTERRUPTIBLE 2
170 #define TASK_STOPPED 4 170 #define TASK_STOPPED 4
171 #define TASK_TRACED 8 171 #define TASK_TRACED 8
172 /* in tsk->exit_state */ 172 /* in tsk->exit_state */
173 #define EXIT_ZOMBIE 16 173 #define EXIT_ZOMBIE 16
174 #define EXIT_DEAD 32 174 #define EXIT_DEAD 32
175 /* in tsk->state again */ 175 /* in tsk->state again */
176 #define TASK_NONINTERACTIVE 64 176 #define TASK_NONINTERACTIVE 64
177 #define TASK_DEAD 128 177 #define TASK_DEAD 128
178 178
179 #define __set_task_state(tsk, state_value) \ 179 #define __set_task_state(tsk, state_value) \
180 do { (tsk)->state = (state_value); } while (0) 180 do { (tsk)->state = (state_value); } while (0)
181 #define set_task_state(tsk, state_value) \ 181 #define set_task_state(tsk, state_value) \
182 set_mb((tsk)->state, (state_value)) 182 set_mb((tsk)->state, (state_value))
183 183
184 /* 184 /*
185 * set_current_state() includes a barrier so that the write of current->state 185 * set_current_state() includes a barrier so that the write of current->state
186 * is correctly serialised wrt the caller's subsequent test of whether to 186 * is correctly serialised wrt the caller's subsequent test of whether to
187 * actually sleep: 187 * actually sleep:
188 * 188 *
189 * set_current_state(TASK_UNINTERRUPTIBLE); 189 * set_current_state(TASK_UNINTERRUPTIBLE);
190 * if (do_i_need_to_sleep()) 190 * if (do_i_need_to_sleep())
191 * schedule(); 191 * schedule();
192 * 192 *
193 * If the caller does not need such serialisation then use __set_current_state() 193 * If the caller does not need such serialisation then use __set_current_state()
194 */ 194 */
195 #define __set_current_state(state_value) \ 195 #define __set_current_state(state_value) \
196 do { current->state = (state_value); } while (0) 196 do { current->state = (state_value); } while (0)
197 #define set_current_state(state_value) \ 197 #define set_current_state(state_value) \
198 set_mb(current->state, (state_value)) 198 set_mb(current->state, (state_value))
199 199
200 /* Task command name length */ 200 /* Task command name length */
201 #define TASK_COMM_LEN 16 201 #define TASK_COMM_LEN 16
202 202
203 #include <linux/spinlock.h> 203 #include <linux/spinlock.h>
204 204
205 /* 205 /*
206 * This serializes "schedule()" and also protects 206 * This serializes "schedule()" and also protects
207 * the run-queue from deletions/modifications (but 207 * the run-queue from deletions/modifications (but
208 * _adding_ to the beginning of the run-queue has 208 * _adding_ to the beginning of the run-queue has
209 * a separate lock). 209 * a separate lock).
210 */ 210 */
211 extern rwlock_t tasklist_lock; 211 extern rwlock_t tasklist_lock;
212 extern spinlock_t mmlist_lock; 212 extern spinlock_t mmlist_lock;
213 213
214 struct task_struct; 214 struct task_struct;
215 215
216 extern void sched_init(void); 216 extern void sched_init(void);
217 extern void sched_init_smp(void); 217 extern void sched_init_smp(void);
218 extern void init_idle(struct task_struct *idle, int cpu); 218 extern void init_idle(struct task_struct *idle, int cpu);
219 extern void init_idle_bootup_task(struct task_struct *idle); 219 extern void init_idle_bootup_task(struct task_struct *idle);
220 220
221 extern cpumask_t nohz_cpu_mask; 221 extern cpumask_t nohz_cpu_mask;
222 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) 222 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
223 extern int select_nohz_load_balancer(int cpu); 223 extern int select_nohz_load_balancer(int cpu);
224 #else 224 #else
225 static inline int select_nohz_load_balancer(int cpu) 225 static inline int select_nohz_load_balancer(int cpu)
226 { 226 {
227 return 0; 227 return 0;
228 } 228 }
229 #endif 229 #endif
230 230
231 /* 231 /*
232 * Only dump TASK_* tasks. (0 for all tasks) 232 * Only dump TASK_* tasks. (0 for all tasks)
233 */ 233 */
234 extern void show_state_filter(unsigned long state_filter); 234 extern void show_state_filter(unsigned long state_filter);
235 235
236 static inline void show_state(void) 236 static inline void show_state(void)
237 { 237 {
238 show_state_filter(0); 238 show_state_filter(0);
239 } 239 }
240 240
241 extern void show_regs(struct pt_regs *); 241 extern void show_regs(struct pt_regs *);
242 242
243 /* 243 /*
244 * TASK is a pointer to the task whose backtrace we want to see (or NULL for current 244 * TASK is a pointer to the task whose backtrace we want to see (or NULL for current
245 * task), SP is the stack pointer of the first frame that should be shown in the back 245 * task), SP is the stack pointer of the first frame that should be shown in the back
246 * trace (or NULL if the entire call-chain of the task should be shown). 246 * trace (or NULL if the entire call-chain of the task should be shown).
247 */ 247 */
248 extern void show_stack(struct task_struct *task, unsigned long *sp); 248 extern void show_stack(struct task_struct *task, unsigned long *sp);
249 249
250 void io_schedule(void); 250 void io_schedule(void);
251 long io_schedule_timeout(long timeout); 251 long io_schedule_timeout(long timeout);
252 252
253 extern void cpu_init (void); 253 extern void cpu_init (void);
254 extern void trap_init(void); 254 extern void trap_init(void);
255 extern void update_process_times(int user); 255 extern void update_process_times(int user);
256 extern void scheduler_tick(void); 256 extern void scheduler_tick(void);
257 257
258 #ifdef CONFIG_DETECT_SOFTLOCKUP 258 #ifdef CONFIG_DETECT_SOFTLOCKUP
259 extern void softlockup_tick(void); 259 extern void softlockup_tick(void);
260 extern void spawn_softlockup_task(void); 260 extern void spawn_softlockup_task(void);
261 extern void touch_softlockup_watchdog(void); 261 extern void touch_softlockup_watchdog(void);
262 extern void touch_all_softlockup_watchdogs(void); 262 extern void touch_all_softlockup_watchdogs(void);
263 #else 263 #else
264 static inline void softlockup_tick(void) 264 static inline void softlockup_tick(void)
265 { 265 {
266 } 266 }
267 static inline void spawn_softlockup_task(void) 267 static inline void spawn_softlockup_task(void)
268 { 268 {
269 } 269 }
270 static inline void touch_softlockup_watchdog(void) 270 static inline void touch_softlockup_watchdog(void)
271 { 271 {
272 } 272 }
273 static inline void touch_all_softlockup_watchdogs(void) 273 static inline void touch_all_softlockup_watchdogs(void)
274 { 274 {
275 } 275 }
276 #endif 276 #endif
277 277
278 278
279 /* Attach to any functions which should be ignored in wchan output. */ 279 /* Attach to any functions which should be ignored in wchan output. */
280 #define __sched __attribute__((__section__(".sched.text"))) 280 #define __sched __attribute__((__section__(".sched.text")))
281 /* Is this address in the __sched functions? */ 281 /* Is this address in the __sched functions? */
282 extern int in_sched_functions(unsigned long addr); 282 extern int in_sched_functions(unsigned long addr);
283 283
284 #define MAX_SCHEDULE_TIMEOUT LONG_MAX 284 #define MAX_SCHEDULE_TIMEOUT LONG_MAX
285 extern signed long FASTCALL(schedule_timeout(signed long timeout)); 285 extern signed long FASTCALL(schedule_timeout(signed long timeout));
286 extern signed long schedule_timeout_interruptible(signed long timeout); 286 extern signed long schedule_timeout_interruptible(signed long timeout);
287 extern signed long schedule_timeout_uninterruptible(signed long timeout); 287 extern signed long schedule_timeout_uninterruptible(signed long timeout);
288 asmlinkage void schedule(void); 288 asmlinkage void schedule(void);
289 289
290 struct nsproxy; 290 struct nsproxy;
291 struct user_namespace; 291 struct user_namespace;
292 292
293 /* Maximum number of active map areas.. This is a random (large) number */ 293 /* Maximum number of active map areas.. This is a random (large) number */
294 #define DEFAULT_MAX_MAP_COUNT 65536 294 #define DEFAULT_MAX_MAP_COUNT 65536
295 295
296 extern int sysctl_max_map_count; 296 extern int sysctl_max_map_count;
297 297
298 #include <linux/aio.h> 298 #include <linux/aio.h>
299 299
300 extern unsigned long 300 extern unsigned long
301 arch_get_unmapped_area(struct file *, unsigned long, unsigned long, 301 arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
302 unsigned long, unsigned long); 302 unsigned long, unsigned long);
303 extern unsigned long 303 extern unsigned long
304 arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, 304 arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
305 unsigned long len, unsigned long pgoff, 305 unsigned long len, unsigned long pgoff,
306 unsigned long flags); 306 unsigned long flags);
307 extern void arch_unmap_area(struct mm_struct *, unsigned long); 307 extern void arch_unmap_area(struct mm_struct *, unsigned long);
308 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); 308 extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
309 309
310 #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS 310 #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
311 /* 311 /*
312 * The mm counters are not protected by its page_table_lock, 312 * The mm counters are not protected by its page_table_lock,
313 * so must be incremented atomically. 313 * so must be incremented atomically.
314 */ 314 */
315 #define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value) 315 #define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value)
316 #define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member)) 316 #define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member))
317 #define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member) 317 #define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member)
318 #define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member) 318 #define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member)
319 #define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member) 319 #define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member)
320 typedef atomic_long_t mm_counter_t; 320 typedef atomic_long_t mm_counter_t;
321 321
322 #else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ 322 #else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
323 /* 323 /*
324 * The mm counters are protected by its page_table_lock, 324 * The mm counters are protected by its page_table_lock,
325 * so can be incremented directly. 325 * so can be incremented directly.
326 */ 326 */
327 #define set_mm_counter(mm, member, value) (mm)->_##member = (value) 327 #define set_mm_counter(mm, member, value) (mm)->_##member = (value)
328 #define get_mm_counter(mm, member) ((mm)->_##member) 328 #define get_mm_counter(mm, member) ((mm)->_##member)
329 #define add_mm_counter(mm, member, value) (mm)->_##member += (value) 329 #define add_mm_counter(mm, member, value) (mm)->_##member += (value)
330 #define inc_mm_counter(mm, member) (mm)->_##member++ 330 #define inc_mm_counter(mm, member) (mm)->_##member++
331 #define dec_mm_counter(mm, member) (mm)->_##member-- 331 #define dec_mm_counter(mm, member) (mm)->_##member--
332 typedef unsigned long mm_counter_t; 332 typedef unsigned long mm_counter_t;
333 333
334 #endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ 334 #endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
335 335
336 #define get_mm_rss(mm) \ 336 #define get_mm_rss(mm) \
337 (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss)) 337 (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))
338 #define update_hiwater_rss(mm) do { \ 338 #define update_hiwater_rss(mm) do { \
339 unsigned long _rss = get_mm_rss(mm); \ 339 unsigned long _rss = get_mm_rss(mm); \
340 if ((mm)->hiwater_rss < _rss) \ 340 if ((mm)->hiwater_rss < _rss) \
341 (mm)->hiwater_rss = _rss; \ 341 (mm)->hiwater_rss = _rss; \
342 } while (0) 342 } while (0)
343 #define update_hiwater_vm(mm) do { \ 343 #define update_hiwater_vm(mm) do { \
344 if ((mm)->hiwater_vm < (mm)->total_vm) \ 344 if ((mm)->hiwater_vm < (mm)->total_vm) \
345 (mm)->hiwater_vm = (mm)->total_vm; \ 345 (mm)->hiwater_vm = (mm)->total_vm; \
346 } while (0) 346 } while (0)
347 347
348 extern void set_dumpable(struct mm_struct *mm, int value); 348 extern void set_dumpable(struct mm_struct *mm, int value);
349 extern int get_dumpable(struct mm_struct *mm); 349 extern int get_dumpable(struct mm_struct *mm);
350 350
351 /* mm flags */ 351 /* mm flags */
352 /* dumpable bits */ 352 /* dumpable bits */
353 #define MMF_DUMPABLE 0 /* core dump is permitted */ 353 #define MMF_DUMPABLE 0 /* core dump is permitted */
354 #define MMF_DUMP_SECURELY 1 /* core file is readable only by root */ 354 #define MMF_DUMP_SECURELY 1 /* core file is readable only by root */
355 #define MMF_DUMPABLE_BITS 2 355 #define MMF_DUMPABLE_BITS 2
356 356
357 /* coredump filter bits */ 357 /* coredump filter bits */
358 #define MMF_DUMP_ANON_PRIVATE 2 358 #define MMF_DUMP_ANON_PRIVATE 2
359 #define MMF_DUMP_ANON_SHARED 3 359 #define MMF_DUMP_ANON_SHARED 3
360 #define MMF_DUMP_MAPPED_PRIVATE 4 360 #define MMF_DUMP_MAPPED_PRIVATE 4
361 #define MMF_DUMP_MAPPED_SHARED 5 361 #define MMF_DUMP_MAPPED_SHARED 5
362 #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS 362 #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS
363 #define MMF_DUMP_FILTER_BITS 4 363 #define MMF_DUMP_FILTER_BITS 4
364 #define MMF_DUMP_FILTER_MASK \ 364 #define MMF_DUMP_FILTER_MASK \
365 (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) 365 (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
366 #define MMF_DUMP_FILTER_DEFAULT \ 366 #define MMF_DUMP_FILTER_DEFAULT \
367 ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED)) 367 ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED))
368 368
369 struct mm_struct { 369 struct mm_struct {
370 struct vm_area_struct * mmap; /* list of VMAs */ 370 struct vm_area_struct * mmap; /* list of VMAs */
371 struct rb_root mm_rb; 371 struct rb_root mm_rb;
372 struct vm_area_struct * mmap_cache; /* last find_vma result */ 372 struct vm_area_struct * mmap_cache; /* last find_vma result */
373 unsigned long (*get_unmapped_area) (struct file *filp, 373 unsigned long (*get_unmapped_area) (struct file *filp,
374 unsigned long addr, unsigned long len, 374 unsigned long addr, unsigned long len,
375 unsigned long pgoff, unsigned long flags); 375 unsigned long pgoff, unsigned long flags);
376 void (*unmap_area) (struct mm_struct *mm, unsigned long addr); 376 void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
377 unsigned long mmap_base; /* base of mmap area */ 377 unsigned long mmap_base; /* base of mmap area */
378 unsigned long task_size; /* size of task vm space */ 378 unsigned long task_size; /* size of task vm space */
379 unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */ 379 unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */
380 unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */ 380 unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */
381 pgd_t * pgd; 381 pgd_t * pgd;
382 atomic_t mm_users; /* How many users with user space? */ 382 atomic_t mm_users; /* How many users with user space? */
383 atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ 383 atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */
384 int map_count; /* number of VMAs */ 384 int map_count; /* number of VMAs */
385 struct rw_semaphore mmap_sem; 385 struct rw_semaphore mmap_sem;
386 spinlock_t page_table_lock; /* Protects page tables and some counters */ 386 spinlock_t page_table_lock; /* Protects page tables and some counters */
387 387
388 struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung 388 struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung
389 * together off init_mm.mmlist, and are protected 389 * together off init_mm.mmlist, and are protected
390 * by mmlist_lock 390 * by mmlist_lock
391 */ 391 */
392 392
393 /* Special counters, in some configurations protected by the 393 /* Special counters, in some configurations protected by the
394 * page_table_lock, in other configurations by being atomic. 394 * page_table_lock, in other configurations by being atomic.
395 */ 395 */
396 mm_counter_t _file_rss; 396 mm_counter_t _file_rss;
397 mm_counter_t _anon_rss; 397 mm_counter_t _anon_rss;
398 398
399 unsigned long hiwater_rss; /* High-watermark of RSS usage */ 399 unsigned long hiwater_rss; /* High-watermark of RSS usage */
400 unsigned long hiwater_vm; /* High-water virtual memory usage */ 400 unsigned long hiwater_vm; /* High-water virtual memory usage */
401 401
402 unsigned long total_vm, locked_vm, shared_vm, exec_vm; 402 unsigned long total_vm, locked_vm, shared_vm, exec_vm;
403 unsigned long stack_vm, reserved_vm, def_flags, nr_ptes; 403 unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
404 unsigned long start_code, end_code, start_data, end_data; 404 unsigned long start_code, end_code, start_data, end_data;
405 unsigned long start_brk, brk, start_stack; 405 unsigned long start_brk, brk, start_stack;
406 unsigned long arg_start, arg_end, env_start, env_end; 406 unsigned long arg_start, arg_end, env_start, env_end;
407 407
408 unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ 408 unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
409 409
410 cpumask_t cpu_vm_mask; 410 cpumask_t cpu_vm_mask;
411 411
412 /* Architecture-specific MM context */ 412 /* Architecture-specific MM context */
413 mm_context_t context; 413 mm_context_t context;
414 414
415 /* Swap token stuff */ 415 /* Swap token stuff */
416 /* 416 /*
417 * Last value of global fault stamp as seen by this process. 417 * Last value of global fault stamp as seen by this process.
418 * In other words, this value gives an indication of how long 418 * In other words, this value gives an indication of how long
419 * it has been since this task got the token. 419 * it has been since this task got the token.
420 * Look at mm/thrash.c 420 * Look at mm/thrash.c
421 */ 421 */
422 unsigned int faultstamp; 422 unsigned int faultstamp;
423 unsigned int token_priority; 423 unsigned int token_priority;
424 unsigned int last_interval; 424 unsigned int last_interval;
425 425
426 unsigned long flags; /* Must use atomic bitops to access the bits */ 426 unsigned long flags; /* Must use atomic bitops to access the bits */
427 427
428 /* coredumping support */ 428 /* coredumping support */
429 int core_waiters; 429 int core_waiters;
430 struct completion *core_startup_done, core_done; 430 struct completion *core_startup_done, core_done;
431 431
432 /* aio bits */ 432 /* aio bits */
433 rwlock_t ioctx_list_lock; 433 rwlock_t ioctx_list_lock;
434 struct kioctx *ioctx_list; 434 struct kioctx *ioctx_list;
435 }; 435 };
436 436
437 struct sighand_struct { 437 struct sighand_struct {
438 atomic_t count; 438 atomic_t count;
439 struct k_sigaction action[_NSIG]; 439 struct k_sigaction action[_NSIG];
440 spinlock_t siglock; 440 spinlock_t siglock;
441 struct list_head signalfd_list; 441 struct list_head signalfd_list;
442 }; 442 };
443 443
444 struct pacct_struct { 444 struct pacct_struct {
445 int ac_flag; 445 int ac_flag;
446 long ac_exitcode; 446 long ac_exitcode;
447 unsigned long ac_mem; 447 unsigned long ac_mem;
448 cputime_t ac_utime, ac_stime; 448 cputime_t ac_utime, ac_stime;
449 unsigned long ac_minflt, ac_majflt; 449 unsigned long ac_minflt, ac_majflt;
450 }; 450 };
451 451
452 /* 452 /*
453 * NOTE! "signal_struct" does not have it's own 453 * NOTE! "signal_struct" does not have it's own
454 * locking, because a shared signal_struct always 454 * locking, because a shared signal_struct always
455 * implies a shared sighand_struct, so locking 455 * implies a shared sighand_struct, so locking
456 * sighand_struct is always a proper superset of 456 * sighand_struct is always a proper superset of
457 * the locking of signal_struct. 457 * the locking of signal_struct.
458 */ 458 */
459 struct signal_struct { 459 struct signal_struct {
460 atomic_t count; 460 atomic_t count;
461 atomic_t live; 461 atomic_t live;
462 462
463 wait_queue_head_t wait_chldexit; /* for wait4() */ 463 wait_queue_head_t wait_chldexit; /* for wait4() */
464 464
465 /* current thread group signal load-balancing target: */ 465 /* current thread group signal load-balancing target: */
466 struct task_struct *curr_target; 466 struct task_struct *curr_target;
467 467
468 /* shared signal handling: */ 468 /* shared signal handling: */
469 struct sigpending shared_pending; 469 struct sigpending shared_pending;
470 470
471 /* thread group exit support */ 471 /* thread group exit support */
472 int group_exit_code; 472 int group_exit_code;
473 /* overloaded: 473 /* overloaded:
474 * - notify group_exit_task when ->count is equal to notify_count 474 * - notify group_exit_task when ->count is equal to notify_count
475 * - everyone except group_exit_task is stopped during signal delivery 475 * - everyone except group_exit_task is stopped during signal delivery
476 * of fatal signals, group_exit_task processes the signal. 476 * of fatal signals, group_exit_task processes the signal.
477 */ 477 */
478 struct task_struct *group_exit_task; 478 struct task_struct *group_exit_task;
479 int notify_count; 479 int notify_count;
480 480
481 /* thread group stop support, overloads group_exit_code too */ 481 /* thread group stop support, overloads group_exit_code too */
482 int group_stop_count; 482 int group_stop_count;
483 unsigned int flags; /* see SIGNAL_* flags below */ 483 unsigned int flags; /* see SIGNAL_* flags below */
484 484
485 /* POSIX.1b Interval Timers */ 485 /* POSIX.1b Interval Timers */
486 struct list_head posix_timers; 486 struct list_head posix_timers;
487 487
488 /* ITIMER_REAL timer for the process */ 488 /* ITIMER_REAL timer for the process */
489 struct hrtimer real_timer; 489 struct hrtimer real_timer;
490 struct task_struct *tsk; 490 struct task_struct *tsk;
491 ktime_t it_real_incr; 491 ktime_t it_real_incr;
492 492
493 /* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */ 493 /* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */
494 cputime_t it_prof_expires, it_virt_expires; 494 cputime_t it_prof_expires, it_virt_expires;
495 cputime_t it_prof_incr, it_virt_incr; 495 cputime_t it_prof_incr, it_virt_incr;
496 496
497 /* job control IDs */ 497 /* job control IDs */
498 pid_t pgrp; 498 pid_t pgrp;
499 struct pid *tty_old_pgrp; 499 struct pid *tty_old_pgrp;
500 500
501 union { 501 union {
502 pid_t session __deprecated; 502 pid_t session __deprecated;
503 pid_t __session; 503 pid_t __session;
504 }; 504 };
505 505
506 /* boolean value for session group leader */ 506 /* boolean value for session group leader */
507 int leader; 507 int leader;
508 508
509 struct tty_struct *tty; /* NULL if no tty */ 509 struct tty_struct *tty; /* NULL if no tty */
510 510
511 /* 511 /*
512 * Cumulative resource counters for dead threads in the group, 512 * Cumulative resource counters for dead threads in the group,
513 * and for reaped dead child processes forked by this group. 513 * and for reaped dead child processes forked by this group.
514 * Live threads maintain their own counters and add to these 514 * Live threads maintain their own counters and add to these
515 * in __exit_signal, except for the group leader. 515 * in __exit_signal, except for the group leader.
516 */ 516 */
517 cputime_t utime, stime, cutime, cstime; 517 cputime_t utime, stime, cutime, cstime;
518 unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; 518 unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
519 unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; 519 unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
520 unsigned long inblock, oublock, cinblock, coublock; 520 unsigned long inblock, oublock, cinblock, coublock;
521 521
522 /* 522 /*
523 * Cumulative ns of scheduled CPU time for dead threads in the 523 * Cumulative ns of scheduled CPU time for dead threads in the
524 * group, not including a zombie group leader. (This only differs 524 * group, not including a zombie group leader. (This only differs
525 * from jiffies_to_ns(utime + stime) if sched_clock uses something 525 * from jiffies_to_ns(utime + stime) if sched_clock uses something
526 * other than jiffies.) 526 * other than jiffies.)
527 */ 527 */
528 unsigned long long sum_sched_runtime; 528 unsigned long long sum_sched_runtime;
529 529
530 /* 530 /*
531 * We don't bother to synchronize most readers of this at all, 531 * We don't bother to synchronize most readers of this at all,
532 * because there is no reader checking a limit that actually needs 532 * because there is no reader checking a limit that actually needs
533 * to get both rlim_cur and rlim_max atomically, and either one 533 * to get both rlim_cur and rlim_max atomically, and either one
534 * alone is a single word that can safely be read normally. 534 * alone is a single word that can safely be read normally.
535 * getrlimit/setrlimit use task_lock(current->group_leader) to 535 * getrlimit/setrlimit use task_lock(current->group_leader) to
536 * protect this instead of the siglock, because they really 536 * protect this instead of the siglock, because they really
537 * have no need to disable irqs. 537 * have no need to disable irqs.
538 */ 538 */
539 struct rlimit rlim[RLIM_NLIMITS]; 539 struct rlimit rlim[RLIM_NLIMITS];
540 540
541 struct list_head cpu_timers[3]; 541 struct list_head cpu_timers[3];
542 542
543 /* keep the process-shared keyrings here so that they do the right 543 /* keep the process-shared keyrings here so that they do the right
544 * thing in threads created with CLONE_THREAD */ 544 * thing in threads created with CLONE_THREAD */
545 #ifdef CONFIG_KEYS 545 #ifdef CONFIG_KEYS
546 struct key *session_keyring; /* keyring inherited over fork */ 546 struct key *session_keyring; /* keyring inherited over fork */
547 struct key *process_keyring; /* keyring private to this process */ 547 struct key *process_keyring; /* keyring private to this process */
548 #endif 548 #endif
549 #ifdef CONFIG_BSD_PROCESS_ACCT 549 #ifdef CONFIG_BSD_PROCESS_ACCT
550 struct pacct_struct pacct; /* per-process accounting information */ 550 struct pacct_struct pacct; /* per-process accounting information */
551 #endif 551 #endif
552 #ifdef CONFIG_TASKSTATS 552 #ifdef CONFIG_TASKSTATS
553 struct taskstats *stats; 553 struct taskstats *stats;
554 #endif 554 #endif
555 #ifdef CONFIG_AUDIT 555 #ifdef CONFIG_AUDIT
556 unsigned audit_tty; 556 unsigned audit_tty;
557 struct tty_audit_buf *tty_audit_buf; 557 struct tty_audit_buf *tty_audit_buf;
558 #endif 558 #endif
559 }; 559 };
560 560
561 /* Context switch must be unlocked if interrupts are to be enabled */ 561 /* Context switch must be unlocked if interrupts are to be enabled */
562 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 562 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
563 # define __ARCH_WANT_UNLOCKED_CTXSW 563 # define __ARCH_WANT_UNLOCKED_CTXSW
564 #endif 564 #endif
565 565
566 /* 566 /*
567 * Bits in flags field of signal_struct. 567 * Bits in flags field of signal_struct.
568 */ 568 */
569 #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ 569 #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */
570 #define SIGNAL_STOP_DEQUEUED 0x00000002 /* stop signal dequeued */ 570 #define SIGNAL_STOP_DEQUEUED 0x00000002 /* stop signal dequeued */
571 #define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */ 571 #define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */
572 #define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */ 572 #define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */
573 573
574 /* 574 /*
575 * Some day this will be a full-fledged user tracking system.. 575 * Some day this will be a full-fledged user tracking system..
576 */ 576 */
577 struct user_struct { 577 struct user_struct {
578 atomic_t __count; /* reference count */ 578 atomic_t __count; /* reference count */
579 atomic_t processes; /* How many processes does this user have? */ 579 atomic_t processes; /* How many processes does this user have? */
580 atomic_t files; /* How many open files does this user have? */ 580 atomic_t files; /* How many open files does this user have? */
581 atomic_t sigpending; /* How many pending signals does this user have? */ 581 atomic_t sigpending; /* How many pending signals does this user have? */
582 #ifdef CONFIG_INOTIFY_USER 582 #ifdef CONFIG_INOTIFY_USER
583 atomic_t inotify_watches; /* How many inotify watches does this user have? */ 583 atomic_t inotify_watches; /* How many inotify watches does this user have? */
584 atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ 584 atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
585 #endif 585 #endif
586 /* protected by mq_lock */ 586 /* protected by mq_lock */
587 unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ 587 unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
588 unsigned long locked_shm; /* How many pages of mlocked shm ? */ 588 unsigned long locked_shm; /* How many pages of mlocked shm ? */
589 589
590 #ifdef CONFIG_KEYS 590 #ifdef CONFIG_KEYS
591 struct key *uid_keyring; /* UID specific keyring */ 591 struct key *uid_keyring; /* UID specific keyring */
592 struct key *session_keyring; /* UID's default session keyring */ 592 struct key *session_keyring; /* UID's default session keyring */
593 #endif 593 #endif
594 594
595 /* Hash table maintenance information */ 595 /* Hash table maintenance information */
596 struct list_head uidhash_list; 596 struct list_head uidhash_list;
597 uid_t uid; 597 uid_t uid;
598 }; 598 };
599 599
600 extern struct user_struct *find_user(uid_t); 600 extern struct user_struct *find_user(uid_t);
601 601
602 extern struct user_struct root_user; 602 extern struct user_struct root_user;
603 #define INIT_USER (&root_user) 603 #define INIT_USER (&root_user)
604 604
605 struct backing_dev_info; 605 struct backing_dev_info;
606 struct reclaim_state; 606 struct reclaim_state;
607 607
608 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 608 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
609 struct sched_info { 609 struct sched_info {
610 /* cumulative counters */ 610 /* cumulative counters */
611 unsigned long pcnt; /* # of times run on this cpu */ 611 unsigned long pcnt; /* # of times run on this cpu */
612 unsigned long long cpu_time, /* time spent on the cpu */ 612 unsigned long long cpu_time, /* time spent on the cpu */
613 run_delay; /* time spent waiting on a runqueue */ 613 run_delay; /* time spent waiting on a runqueue */
614 614
615 /* timestamps */ 615 /* timestamps */
616 unsigned long long last_arrival,/* when we last ran on a cpu */ 616 unsigned long long last_arrival,/* when we last ran on a cpu */
617 last_queued; /* when we were last queued to run */ 617 last_queued; /* when we were last queued to run */
618 }; 618 };
619 #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ 619 #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
620 620
621 #ifdef CONFIG_SCHEDSTATS 621 #ifdef CONFIG_SCHEDSTATS
622 extern const struct file_operations proc_schedstat_operations; 622 extern const struct file_operations proc_schedstat_operations;
623 #endif /* CONFIG_SCHEDSTATS */ 623 #endif /* CONFIG_SCHEDSTATS */
624 624
625 #ifdef CONFIG_TASK_DELAY_ACCT 625 #ifdef CONFIG_TASK_DELAY_ACCT
626 struct task_delay_info { 626 struct task_delay_info {
627 spinlock_t lock; 627 spinlock_t lock;
628 unsigned int flags; /* Private per-task flags */ 628 unsigned int flags; /* Private per-task flags */
629 629
630 /* For each stat XXX, add following, aligned appropriately 630 /* For each stat XXX, add following, aligned appropriately
631 * 631 *
632 * struct timespec XXX_start, XXX_end; 632 * struct timespec XXX_start, XXX_end;
633 * u64 XXX_delay; 633 * u64 XXX_delay;
634 * u32 XXX_count; 634 * u32 XXX_count;
635 * 635 *
636 * Atomicity of updates to XXX_delay, XXX_count protected by 636 * Atomicity of updates to XXX_delay, XXX_count protected by
637 * single lock above (split into XXX_lock if contention is an issue). 637 * single lock above (split into XXX_lock if contention is an issue).
638 */ 638 */
639 639
640 /* 640 /*
641 * XXX_count is incremented on every XXX operation, the delay 641 * XXX_count is incremented on every XXX operation, the delay
642 * associated with the operation is added to XXX_delay. 642 * associated with the operation is added to XXX_delay.
643 * XXX_delay contains the accumulated delay time in nanoseconds. 643 * XXX_delay contains the accumulated delay time in nanoseconds.
644 */ 644 */
645 struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */ 645 struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */
646 u64 blkio_delay; /* wait for sync block io completion */ 646 u64 blkio_delay; /* wait for sync block io completion */
647 u64 swapin_delay; /* wait for swapin block io completion */ 647 u64 swapin_delay; /* wait for swapin block io completion */
648 u32 blkio_count; /* total count of the number of sync block */ 648 u32 blkio_count; /* total count of the number of sync block */
649 /* io operations performed */ 649 /* io operations performed */
650 u32 swapin_count; /* total count of the number of swapin block */ 650 u32 swapin_count; /* total count of the number of swapin block */
651 /* io operations performed */ 651 /* io operations performed */
652 }; 652 };
653 #endif /* CONFIG_TASK_DELAY_ACCT */ 653 #endif /* CONFIG_TASK_DELAY_ACCT */
654 654
655 static inline int sched_info_on(void) 655 static inline int sched_info_on(void)
656 { 656 {
657 #ifdef CONFIG_SCHEDSTATS 657 #ifdef CONFIG_SCHEDSTATS
658 return 1; 658 return 1;
659 #elif defined(CONFIG_TASK_DELAY_ACCT) 659 #elif defined(CONFIG_TASK_DELAY_ACCT)
660 extern int delayacct_on; 660 extern int delayacct_on;
661 return delayacct_on; 661 return delayacct_on;
662 #else 662 #else
663 return 0; 663 return 0;
664 #endif 664 #endif
665 } 665 }
666 666
667 enum cpu_idle_type { 667 enum cpu_idle_type {
668 CPU_IDLE, 668 CPU_IDLE,
669 CPU_NOT_IDLE, 669 CPU_NOT_IDLE,
670 CPU_NEWLY_IDLE, 670 CPU_NEWLY_IDLE,
671 CPU_MAX_IDLE_TYPES 671 CPU_MAX_IDLE_TYPES
672 }; 672 };
673 673
674 /* 674 /*
675 * sched-domains (multiprocessor balancing) declarations: 675 * sched-domains (multiprocessor balancing) declarations:
676 */ 676 */
677 677
678 /* 678 /*
679 * Increase resolution of nice-level calculations: 679 * Increase resolution of nice-level calculations:
680 */ 680 */
681 #define SCHED_LOAD_SHIFT 10 681 #define SCHED_LOAD_SHIFT 10
682 #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) 682 #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
683 683
684 #define SCHED_LOAD_SCALE_FUZZ (SCHED_LOAD_SCALE >> 1) 684 #define SCHED_LOAD_SCALE_FUZZ (SCHED_LOAD_SCALE >> 1)
685 685
686 #ifdef CONFIG_SMP 686 #ifdef CONFIG_SMP
687 #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ 687 #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */
688 #define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ 688 #define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */
689 #define SD_BALANCE_EXEC 4 /* Balance on exec */ 689 #define SD_BALANCE_EXEC 4 /* Balance on exec */
690 #define SD_BALANCE_FORK 8 /* Balance on fork, clone */ 690 #define SD_BALANCE_FORK 8 /* Balance on fork, clone */
691 #define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */ 691 #define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */
692 #define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */ 692 #define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */
693 #define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */ 693 #define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */
694 #define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */ 694 #define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */
695 #define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */ 695 #define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */
696 #define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */ 696 #define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */
697 #define SD_SERIALIZE 1024 /* Only a single load balancing instance */ 697 #define SD_SERIALIZE 1024 /* Only a single load balancing instance */
698 698
699 #define BALANCE_FOR_MC_POWER \ 699 #define BALANCE_FOR_MC_POWER \
700 (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0) 700 (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
701 701
702 #define BALANCE_FOR_PKG_POWER \ 702 #define BALANCE_FOR_PKG_POWER \
703 ((sched_mc_power_savings || sched_smt_power_savings) ? \ 703 ((sched_mc_power_savings || sched_smt_power_savings) ? \
704 SD_POWERSAVINGS_BALANCE : 0) 704 SD_POWERSAVINGS_BALANCE : 0)
705 705
706 #define test_sd_parent(sd, flag) ((sd->parent && \ 706 #define test_sd_parent(sd, flag) ((sd->parent && \
707 (sd->parent->flags & flag)) ? 1 : 0) 707 (sd->parent->flags & flag)) ? 1 : 0)
708 708
709 709
710 struct sched_group { 710 struct sched_group {
711 struct sched_group *next; /* Must be a circular list */ 711 struct sched_group *next; /* Must be a circular list */
712 cpumask_t cpumask; 712 cpumask_t cpumask;
713 713
714 /* 714 /*
715 * CPU power of this group, SCHED_LOAD_SCALE being max power for a 715 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
716 * single CPU. This is read only (except for setup, hotplug CPU). 716 * single CPU. This is read only (except for setup, hotplug CPU).
717 * Note : Never change cpu_power without recompute its reciprocal 717 * Note : Never change cpu_power without recompute its reciprocal
718 */ 718 */
719 unsigned int __cpu_power; 719 unsigned int __cpu_power;
720 /* 720 /*
721 * reciprocal value of cpu_power to avoid expensive divides 721 * reciprocal value of cpu_power to avoid expensive divides
722 * (see include/linux/reciprocal_div.h) 722 * (see include/linux/reciprocal_div.h)
723 */ 723 */
724 u32 reciprocal_cpu_power; 724 u32 reciprocal_cpu_power;
725 }; 725 };
726 726
727 struct sched_domain { 727 struct sched_domain {
728 /* These fields must be setup */ 728 /* These fields must be setup */
729 struct sched_domain *parent; /* top domain must be null terminated */ 729 struct sched_domain *parent; /* top domain must be null terminated */
730 struct sched_domain *child; /* bottom domain must be null terminated */ 730 struct sched_domain *child; /* bottom domain must be null terminated */
731 struct sched_group *groups; /* the balancing groups of the domain */ 731 struct sched_group *groups; /* the balancing groups of the domain */
732 cpumask_t span; /* span of all CPUs in this domain */ 732 cpumask_t span; /* span of all CPUs in this domain */
733 unsigned long min_interval; /* Minimum balance interval ms */ 733 unsigned long min_interval; /* Minimum balance interval ms */
734 unsigned long max_interval; /* Maximum balance interval ms */ 734 unsigned long max_interval; /* Maximum balance interval ms */
735 unsigned int busy_factor; /* less balancing by factor if busy */ 735 unsigned int busy_factor; /* less balancing by factor if busy */
736 unsigned int imbalance_pct; /* No balance until over watermark */ 736 unsigned int imbalance_pct; /* No balance until over watermark */
737 unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ 737 unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
738 unsigned int busy_idx; 738 unsigned int busy_idx;
739 unsigned int idle_idx; 739 unsigned int idle_idx;
740 unsigned int newidle_idx; 740 unsigned int newidle_idx;
741 unsigned int wake_idx; 741 unsigned int wake_idx;
742 unsigned int forkexec_idx; 742 unsigned int forkexec_idx;
743 int flags; /* See SD_* */ 743 int flags; /* See SD_* */
744 744
745 /* Runtime fields. */ 745 /* Runtime fields. */
746 unsigned long last_balance; /* init to jiffies. units in jiffies */ 746 unsigned long last_balance; /* init to jiffies. units in jiffies */
747 unsigned int balance_interval; /* initialise to 1. units in ms. */ 747 unsigned int balance_interval; /* initialise to 1. units in ms. */
748 unsigned int nr_balance_failed; /* initialise to 0 */ 748 unsigned int nr_balance_failed; /* initialise to 0 */
749 749
750 #ifdef CONFIG_SCHEDSTATS 750 #ifdef CONFIG_SCHEDSTATS
751 /* load_balance() stats */ 751 /* load_balance() stats */
752 unsigned long lb_cnt[CPU_MAX_IDLE_TYPES]; 752 unsigned long lb_cnt[CPU_MAX_IDLE_TYPES];
753 unsigned long lb_failed[CPU_MAX_IDLE_TYPES]; 753 unsigned long lb_failed[CPU_MAX_IDLE_TYPES];
754 unsigned long lb_balanced[CPU_MAX_IDLE_TYPES]; 754 unsigned long lb_balanced[CPU_MAX_IDLE_TYPES];
755 unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES]; 755 unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES];
756 unsigned long lb_gained[CPU_MAX_IDLE_TYPES]; 756 unsigned long lb_gained[CPU_MAX_IDLE_TYPES];
757 unsigned long lb_hot_gained[CPU_MAX_IDLE_TYPES]; 757 unsigned long lb_hot_gained[CPU_MAX_IDLE_TYPES];
758 unsigned long lb_nobusyg[CPU_MAX_IDLE_TYPES]; 758 unsigned long lb_nobusyg[CPU_MAX_IDLE_TYPES];
759 unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES]; 759 unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES];
760 760
761 /* Active load balancing */ 761 /* Active load balancing */
762 unsigned long alb_cnt; 762 unsigned long alb_cnt;
763 unsigned long alb_failed; 763 unsigned long alb_failed;
764 unsigned long alb_pushed; 764 unsigned long alb_pushed;
765 765
766 /* SD_BALANCE_EXEC stats */ 766 /* SD_BALANCE_EXEC stats */
767 unsigned long sbe_cnt; 767 unsigned long sbe_cnt;
768 unsigned long sbe_balanced; 768 unsigned long sbe_balanced;
769 unsigned long sbe_pushed; 769 unsigned long sbe_pushed;
770 770
771 /* SD_BALANCE_FORK stats */ 771 /* SD_BALANCE_FORK stats */
772 unsigned long sbf_cnt; 772 unsigned long sbf_cnt;
773 unsigned long sbf_balanced; 773 unsigned long sbf_balanced;
774 unsigned long sbf_pushed; 774 unsigned long sbf_pushed;
775 775
776 /* try_to_wake_up() stats */ 776 /* try_to_wake_up() stats */
777 unsigned long ttwu_wake_remote; 777 unsigned long ttwu_wake_remote;
778 unsigned long ttwu_move_affine; 778 unsigned long ttwu_move_affine;
779 unsigned long ttwu_move_balance; 779 unsigned long ttwu_move_balance;
780 #endif 780 #endif
781 }; 781 };
782 782
783 extern int partition_sched_domains(cpumask_t *partition1, 783 extern int partition_sched_domains(cpumask_t *partition1,
784 cpumask_t *partition2); 784 cpumask_t *partition2);
785 785
786 #endif /* CONFIG_SMP */ 786 #endif /* CONFIG_SMP */
787 787
788 /* 788 /*
789 * A runqueue laden with a single nice 0 task scores a weighted_cpuload of 789 * A runqueue laden with a single nice 0 task scores a weighted_cpuload of
790 * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a 790 * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a
791 * task of nice 0 or enough lower priority tasks to bring up the 791 * task of nice 0 or enough lower priority tasks to bring up the
792 * weighted_cpuload 792 * weighted_cpuload
793 */ 793 */
794 static inline int above_background_load(void) 794 static inline int above_background_load(void)
795 { 795 {
796 unsigned long cpu; 796 unsigned long cpu;
797 797
798 for_each_online_cpu(cpu) { 798 for_each_online_cpu(cpu) {
799 if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE) 799 if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE)
800 return 1; 800 return 1;
801 } 801 }
802 return 0; 802 return 0;
803 } 803 }
804 804
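A minimal usage sketch of the helper above (not part of this diff): background housekeeping can poll above_background_load() and back off whenever any online CPU carries at least one nice-0 task's worth of weighted load. The function names maybe_do_background_work() and do_background_work() are made up for illustration.

/* Sketch: a hypothetical caller backing off while the system is loaded. */
static void maybe_do_background_work(void)
{
        if (above_background_load())
                return;                 /* some CPU has >= one nice-0 task of load */

        do_background_work();           /* hypothetical helper */
}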
805 struct io_context; /* See blkdev.h */ 805 struct io_context; /* See blkdev.h */
806 struct cpuset; 806 struct cpuset;
807 807
808 #define NGROUPS_SMALL 32 808 #define NGROUPS_SMALL 32
809 #define NGROUPS_PER_BLOCK ((int)(PAGE_SIZE / sizeof(gid_t))) 809 #define NGROUPS_PER_BLOCK ((int)(PAGE_SIZE / sizeof(gid_t)))
810 struct group_info { 810 struct group_info {
811 int ngroups; 811 int ngroups;
812 atomic_t usage; 812 atomic_t usage;
813 gid_t small_block[NGROUPS_SMALL]; 813 gid_t small_block[NGROUPS_SMALL];
814 int nblocks; 814 int nblocks;
815 gid_t *blocks[0]; 815 gid_t *blocks[0];
816 }; 816 };
817 817
818 /* 818 /*
819 * get_group_info() must be called with the owning task locked (via task_lock()) 819 * get_group_info() must be called with the owning task locked (via task_lock())
820 * when task != current. The reason being that the vast majority of callers are 820 * when task != current. The reason being that the vast majority of callers are
821 * looking at current->group_info, which can not be changed except by the 821 * looking at current->group_info, which can not be changed except by the
822 * current task. Changing current->group_info requires the task lock, too. 822 * current task. Changing current->group_info requires the task lock, too.
823 */ 823 */
824 #define get_group_info(group_info) do { \ 824 #define get_group_info(group_info) do { \
825 atomic_inc(&(group_info)->usage); \ 825 atomic_inc(&(group_info)->usage); \
826 } while (0) 826 } while (0)
827 827
828 #define put_group_info(group_info) do { \ 828 #define put_group_info(group_info) do { \
829 if (atomic_dec_and_test(&(group_info)->usage)) \ 829 if (atomic_dec_and_test(&(group_info)->usage)) \
830 groups_free(group_info); \ 830 groups_free(group_info); \
831 } while (0) 831 } while (0)
832 832
833 extern struct group_info *groups_alloc(int gidsetsize); 833 extern struct group_info *groups_alloc(int gidsetsize);
834 extern void groups_free(struct group_info *group_info); 834 extern void groups_free(struct group_info *group_info);
835 extern int set_current_groups(struct group_info *group_info); 835 extern int set_current_groups(struct group_info *group_info);
836 extern int groups_search(struct group_info *group_info, gid_t grp); 836 extern int groups_search(struct group_info *group_info, gid_t grp);
837 /* access the groups "array" with this macro */ 837 /* access the groups "array" with this macro */
838 #define GROUP_AT(gi, i) \ 838 #define GROUP_AT(gi, i) \
839 ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK]) 839 ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK])
840 840
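A hedged sketch of the locking rule spelled out in the get_group_info() comment, for a task other than current: take task_lock(), pin the list with get_group_info(), drop the lock, walk the entries via GROUP_AT(), and release with put_group_info(). The function task_in_group() is a made-up name; task_lock()/task_unlock() are the usual alloc_lock helpers.

/* Sketch: safely walk another task's supplementary groups. */
static int task_in_group(struct task_struct *tsk, gid_t grp)
{
        struct group_info *gi;
        int i, found = 0;

        task_lock(tsk);                 /* required when tsk != current */
        gi = tsk->group_info;
        get_group_info(gi);             /* pin it across the unlock */
        task_unlock(tsk);

        for (i = 0; i < gi->ngroups; i++) {
                if (GROUP_AT(gi, i) == grp) {
                        found = 1;
                        break;
                }
        }

        put_group_info(gi);             /* may free it */
        return found;
}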
841 #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK 841 #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
842 extern void prefetch_stack(struct task_struct *t); 842 extern void prefetch_stack(struct task_struct *t);
843 #else 843 #else
844 static inline void prefetch_stack(struct task_struct *t) { } 844 static inline void prefetch_stack(struct task_struct *t) { }
845 #endif 845 #endif
846 846
847 struct audit_context; /* See audit.c */ 847 struct audit_context; /* See audit.c */
848 struct mempolicy; 848 struct mempolicy;
849 struct pipe_inode_info; 849 struct pipe_inode_info;
850 struct uts_namespace; 850 struct uts_namespace;
851 851
852 struct rq; 852 struct rq;
853 struct sched_domain; 853 struct sched_domain;
854 854
855 struct sched_class { 855 struct sched_class {
856 struct sched_class *next; 856 struct sched_class *next;
857 857
858 void (*enqueue_task) (struct rq *rq, struct task_struct *p, 858 void (*enqueue_task) (struct rq *rq, struct task_struct *p,
859 int wakeup, u64 now); 859 int wakeup, u64 now);
860 void (*dequeue_task) (struct rq *rq, struct task_struct *p, 860 void (*dequeue_task) (struct rq *rq, struct task_struct *p,
861 int sleep, u64 now); 861 int sleep, u64 now);
862 void (*yield_task) (struct rq *rq, struct task_struct *p); 862 void (*yield_task) (struct rq *rq, struct task_struct *p);
863 863
864 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); 864 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
865 865
866 struct task_struct * (*pick_next_task) (struct rq *rq, u64 now); 866 struct task_struct * (*pick_next_task) (struct rq *rq, u64 now);
867 void (*put_prev_task) (struct rq *rq, struct task_struct *p, u64 now); 867 void (*put_prev_task) (struct rq *rq, struct task_struct *p, u64 now);
868 868
869 int (*load_balance) (struct rq *this_rq, int this_cpu, 869 unsigned long (*load_balance) (struct rq *this_rq, int this_cpu,
870 struct rq *busiest, 870 struct rq *busiest,
871 unsigned long max_nr_move, unsigned long max_load_move, 871 unsigned long max_nr_move, unsigned long max_load_move,
872 struct sched_domain *sd, enum cpu_idle_type idle, 872 struct sched_domain *sd, enum cpu_idle_type idle,
873 int *all_pinned, unsigned long *total_load_moved); 873 int *all_pinned);
874 874
875 void (*set_curr_task) (struct rq *rq); 875 void (*set_curr_task) (struct rq *rq);
876 void (*task_tick) (struct rq *rq, struct task_struct *p); 876 void (*task_tick) (struct rq *rq, struct task_struct *p);
877 void (*task_new) (struct rq *rq, struct task_struct *p, u64 now); 877 void (*task_new) (struct rq *rq, struct task_struct *p, u64 now);
878 }; 878 };
879 879
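The change in this hunk is the load_balance() hook above: its return type becomes unsigned long (the weighted load actually moved) and the total_load_moved pointer disappears from its argument list. A hedged skeleton of an implementation against the new prototype follows; the function name and the elided body are placeholders, not code from this patch.

/* Sketch of an implementation of the reworked hook. */
static unsigned long
load_balance_dummy(struct rq *this_rq, int this_cpu, struct rq *busiest,
                   unsigned long max_nr_move, unsigned long max_load_move,
                   struct sched_domain *sd, enum cpu_idle_type idle,
                   int *all_pinned)
{
        unsigned long load_moved = 0;

        /*
         * ... migrate tasks from busiest to this_rq, adding each moved
         * task's se.load.weight to load_moved, until max_nr_move or
         * max_load_move is reached ...
         */

        return load_moved;      /* the weighted load moved, no task count */
}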
880 struct load_weight { 880 struct load_weight {
881 unsigned long weight, inv_weight; 881 unsigned long weight, inv_weight;
882 }; 882 };
883 883
884 /* 884 /*
885 * CFS stats for a schedulable entity (task, task-group etc) 885 * CFS stats for a schedulable entity (task, task-group etc)
886 * 886 *
887 * Current field usage histogram: 887 * Current field usage histogram:
888 * 888 *
889 * 4 se->block_start 889 * 4 se->block_start
890 * 4 se->run_node 890 * 4 se->run_node
891 * 4 se->sleep_start 891 * 4 se->sleep_start
892 * 4 se->sleep_start_fair 892 * 4 se->sleep_start_fair
893 * 6 se->load.weight 893 * 6 se->load.weight
894 * 7 se->delta_fair 894 * 7 se->delta_fair
895 * 15 se->wait_runtime 895 * 15 se->wait_runtime
896 */ 896 */
897 struct sched_entity { 897 struct sched_entity {
898 long wait_runtime; 898 long wait_runtime;
899 unsigned long delta_fair_run; 899 unsigned long delta_fair_run;
900 unsigned long delta_fair_sleep; 900 unsigned long delta_fair_sleep;
901 unsigned long delta_exec; 901 unsigned long delta_exec;
902 s64 fair_key; 902 s64 fair_key;
903 struct load_weight load; /* for load-balancing */ 903 struct load_weight load; /* for load-balancing */
904 struct rb_node run_node; 904 struct rb_node run_node;
905 unsigned int on_rq; 905 unsigned int on_rq;
906 906
907 u64 exec_start; 907 u64 exec_start;
908 u64 sum_exec_runtime; 908 u64 sum_exec_runtime;
909 u64 wait_start_fair; 909 u64 wait_start_fair;
910 u64 sleep_start_fair; 910 u64 sleep_start_fair;
911 911
912 #ifdef CONFIG_SCHEDSTATS 912 #ifdef CONFIG_SCHEDSTATS
913 u64 wait_start; 913 u64 wait_start;
914 u64 wait_max; 914 u64 wait_max;
915 s64 sum_wait_runtime; 915 s64 sum_wait_runtime;
916 916
917 u64 sleep_start; 917 u64 sleep_start;
918 u64 sleep_max; 918 u64 sleep_max;
919 s64 sum_sleep_runtime; 919 s64 sum_sleep_runtime;
920 920
921 u64 block_start; 921 u64 block_start;
922 u64 block_max; 922 u64 block_max;
923 u64 exec_max; 923 u64 exec_max;
924 924
925 unsigned long wait_runtime_overruns; 925 unsigned long wait_runtime_overruns;
926 unsigned long wait_runtime_underruns; 926 unsigned long wait_runtime_underruns;
927 #endif 927 #endif
928 928
929 #ifdef CONFIG_FAIR_GROUP_SCHED 929 #ifdef CONFIG_FAIR_GROUP_SCHED
930 struct sched_entity *parent; 930 struct sched_entity *parent;
931 /* rq on which this entity is (to be) queued: */ 931 /* rq on which this entity is (to be) queued: */
932 struct cfs_rq *cfs_rq; 932 struct cfs_rq *cfs_rq;
933 /* rq "owned" by this entity/group: */ 933 /* rq "owned" by this entity/group: */
934 struct cfs_rq *my_q; 934 struct cfs_rq *my_q;
935 #endif 935 #endif
936 }; 936 };
937 937
938 struct task_struct { 938 struct task_struct {
939 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ 939 volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
940 void *stack; 940 void *stack;
941 atomic_t usage; 941 atomic_t usage;
942 unsigned int flags; /* per process flags, defined below */ 942 unsigned int flags; /* per process flags, defined below */
943 unsigned int ptrace; 943 unsigned int ptrace;
944 944
945 int lock_depth; /* BKL lock depth */ 945 int lock_depth; /* BKL lock depth */
946 946
947 #ifdef CONFIG_SMP 947 #ifdef CONFIG_SMP
948 #ifdef __ARCH_WANT_UNLOCKED_CTXSW 948 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
949 int oncpu; 949 int oncpu;
950 #endif 950 #endif
951 #endif 951 #endif
952 952
953 int prio, static_prio, normal_prio; 953 int prio, static_prio, normal_prio;
954 struct list_head run_list; 954 struct list_head run_list;
955 struct sched_class *sched_class; 955 struct sched_class *sched_class;
956 struct sched_entity se; 956 struct sched_entity se;
957 957
958 #ifdef CONFIG_PREEMPT_NOTIFIERS 958 #ifdef CONFIG_PREEMPT_NOTIFIERS
959 /* list of struct preempt_notifier: */ 959 /* list of struct preempt_notifier: */
960 struct hlist_head preempt_notifiers; 960 struct hlist_head preempt_notifiers;
961 #endif 961 #endif
962 962
963 unsigned short ioprio; 963 unsigned short ioprio;
964 #ifdef CONFIG_BLK_DEV_IO_TRACE 964 #ifdef CONFIG_BLK_DEV_IO_TRACE
965 unsigned int btrace_seq; 965 unsigned int btrace_seq;
966 #endif 966 #endif
967 967
968 unsigned int policy; 968 unsigned int policy;
969 cpumask_t cpus_allowed; 969 cpumask_t cpus_allowed;
970 unsigned int time_slice; 970 unsigned int time_slice;
971 971
972 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 972 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
973 struct sched_info sched_info; 973 struct sched_info sched_info;
974 #endif 974 #endif
975 975
976 struct list_head tasks; 976 struct list_head tasks;
977 /* 977 /*
978 * ptrace_list/ptrace_children forms the list of my children 978 * ptrace_list/ptrace_children forms the list of my children
979 * that were stolen by a ptracer. 979 * that were stolen by a ptracer.
980 */ 980 */
981 struct list_head ptrace_children; 981 struct list_head ptrace_children;
982 struct list_head ptrace_list; 982 struct list_head ptrace_list;
983 983
984 struct mm_struct *mm, *active_mm; 984 struct mm_struct *mm, *active_mm;
985 985
986 /* task state */ 986 /* task state */
987 struct linux_binfmt *binfmt; 987 struct linux_binfmt *binfmt;
988 int exit_state; 988 int exit_state;
989 int exit_code, exit_signal; 989 int exit_code, exit_signal;
990 int pdeath_signal; /* The signal sent when the parent dies */ 990 int pdeath_signal; /* The signal sent when the parent dies */
991 /* ??? */ 991 /* ??? */
992 unsigned int personality; 992 unsigned int personality;
993 unsigned did_exec:1; 993 unsigned did_exec:1;
994 pid_t pid; 994 pid_t pid;
995 pid_t tgid; 995 pid_t tgid;
996 996
997 #ifdef CONFIG_CC_STACKPROTECTOR 997 #ifdef CONFIG_CC_STACKPROTECTOR
998 /* Canary value for the -fstack-protector gcc feature */ 998 /* Canary value for the -fstack-protector gcc feature */
999 unsigned long stack_canary; 999 unsigned long stack_canary;
1000 #endif 1000 #endif
1001 /* 1001 /*
1002 * pointers to (original) parent process, youngest child, younger sibling, 1002 * pointers to (original) parent process, youngest child, younger sibling,
1003 * older sibling, respectively. (p->father can be replaced with 1003 * older sibling, respectively. (p->father can be replaced with
1004 * p->parent->pid) 1004 * p->parent->pid)
1005 */ 1005 */
1006 struct task_struct *real_parent; /* real parent process (when being debugged) */ 1006 struct task_struct *real_parent; /* real parent process (when being debugged) */
1007 struct task_struct *parent; /* parent process */ 1007 struct task_struct *parent; /* parent process */
1008 /* 1008 /*
1009 * children/sibling forms the list of my children plus the 1009 * children/sibling forms the list of my children plus the
1010 * tasks I'm ptracing. 1010 * tasks I'm ptracing.
1011 */ 1011 */
1012 struct list_head children; /* list of my children */ 1012 struct list_head children; /* list of my children */
1013 struct list_head sibling; /* linkage in my parent's children list */ 1013 struct list_head sibling; /* linkage in my parent's children list */
1014 struct task_struct *group_leader; /* threadgroup leader */ 1014 struct task_struct *group_leader; /* threadgroup leader */
1015 1015
1016 /* PID/PID hash table linkage. */ 1016 /* PID/PID hash table linkage. */
1017 struct pid_link pids[PIDTYPE_MAX]; 1017 struct pid_link pids[PIDTYPE_MAX];
1018 struct list_head thread_group; 1018 struct list_head thread_group;
1019 1019
1020 struct completion *vfork_done; /* for vfork() */ 1020 struct completion *vfork_done; /* for vfork() */
1021 int __user *set_child_tid; /* CLONE_CHILD_SETTID */ 1021 int __user *set_child_tid; /* CLONE_CHILD_SETTID */
1022 int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ 1022 int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
1023 1023
1024 unsigned int rt_priority; 1024 unsigned int rt_priority;
1025 cputime_t utime, stime; 1025 cputime_t utime, stime;
1026 unsigned long nvcsw, nivcsw; /* context switch counts */ 1026 unsigned long nvcsw, nivcsw; /* context switch counts */
1027 struct timespec start_time; /* monotonic time */ 1027 struct timespec start_time; /* monotonic time */
1028 struct timespec real_start_time; /* boot based time */ 1028 struct timespec real_start_time; /* boot based time */
1029 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ 1029 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
1030 unsigned long min_flt, maj_flt; 1030 unsigned long min_flt, maj_flt;
1031 1031
1032 cputime_t it_prof_expires, it_virt_expires; 1032 cputime_t it_prof_expires, it_virt_expires;
1033 unsigned long long it_sched_expires; 1033 unsigned long long it_sched_expires;
1034 struct list_head cpu_timers[3]; 1034 struct list_head cpu_timers[3];
1035 1035
1036 /* process credentials */ 1036 /* process credentials */
1037 uid_t uid,euid,suid,fsuid; 1037 uid_t uid,euid,suid,fsuid;
1038 gid_t gid,egid,sgid,fsgid; 1038 gid_t gid,egid,sgid,fsgid;
1039 struct group_info *group_info; 1039 struct group_info *group_info;
1040 kernel_cap_t cap_effective, cap_inheritable, cap_permitted; 1040 kernel_cap_t cap_effective, cap_inheritable, cap_permitted;
1041 unsigned keep_capabilities:1; 1041 unsigned keep_capabilities:1;
1042 struct user_struct *user; 1042 struct user_struct *user;
1043 #ifdef CONFIG_KEYS 1043 #ifdef CONFIG_KEYS
1044 struct key *request_key_auth; /* assumed request_key authority */ 1044 struct key *request_key_auth; /* assumed request_key authority */
1045 struct key *thread_keyring; /* keyring private to this thread */ 1045 struct key *thread_keyring; /* keyring private to this thread */
1046 unsigned char jit_keyring; /* default keyring to attach requested keys to */ 1046 unsigned char jit_keyring; /* default keyring to attach requested keys to */
1047 #endif 1047 #endif
1048 /* 1048 /*
1049 * fpu_counter contains the number of consecutive context switches 1049 * fpu_counter contains the number of consecutive context switches
1050 * that the FPU is used. If this is over a threshold, the lazy fpu 1050 * that the FPU is used. If this is over a threshold, the lazy fpu
1051 * saving becomes unlazy to save the trap. This is an unsigned char 1051 * saving becomes unlazy to save the trap. This is an unsigned char
1052 * so that after 256 times the counter wraps and the behavior turns 1052 * so that after 256 times the counter wraps and the behavior turns
1053 * lazy again; this to deal with bursty apps that only use FPU for 1053 * lazy again; this to deal with bursty apps that only use FPU for
1054 * a short time 1054 * a short time
1055 */ 1055 */
1056 unsigned char fpu_counter; 1056 unsigned char fpu_counter;
1057 int oomkilladj; /* OOM kill score adjustment (bit shift). */ 1057 int oomkilladj; /* OOM kill score adjustment (bit shift). */
1058 char comm[TASK_COMM_LEN]; /* executable name excluding path 1058 char comm[TASK_COMM_LEN]; /* executable name excluding path
1059 - access with [gs]et_task_comm (which lock 1059 - access with [gs]et_task_comm (which lock
1060 it with task_lock()) 1060 it with task_lock())
1061 - initialized normally by flush_old_exec */ 1061 - initialized normally by flush_old_exec */
1062 /* file system info */ 1062 /* file system info */
1063 int link_count, total_link_count; 1063 int link_count, total_link_count;
1064 #ifdef CONFIG_SYSVIPC 1064 #ifdef CONFIG_SYSVIPC
1065 /* ipc stuff */ 1065 /* ipc stuff */
1066 struct sysv_sem sysvsem; 1066 struct sysv_sem sysvsem;
1067 #endif 1067 #endif
1068 /* CPU-specific state of this task */ 1068 /* CPU-specific state of this task */
1069 struct thread_struct thread; 1069 struct thread_struct thread;
1070 /* filesystem information */ 1070 /* filesystem information */
1071 struct fs_struct *fs; 1071 struct fs_struct *fs;
1072 /* open file information */ 1072 /* open file information */
1073 struct files_struct *files; 1073 struct files_struct *files;
1074 /* namespaces */ 1074 /* namespaces */
1075 struct nsproxy *nsproxy; 1075 struct nsproxy *nsproxy;
1076 /* signal handlers */ 1076 /* signal handlers */
1077 struct signal_struct *signal; 1077 struct signal_struct *signal;
1078 struct sighand_struct *sighand; 1078 struct sighand_struct *sighand;
1079 1079
1080 sigset_t blocked, real_blocked; 1080 sigset_t blocked, real_blocked;
1081 sigset_t saved_sigmask; /* To be restored with TIF_RESTORE_SIGMASK */ 1081 sigset_t saved_sigmask; /* To be restored with TIF_RESTORE_SIGMASK */
1082 struct sigpending pending; 1082 struct sigpending pending;
1083 1083
1084 unsigned long sas_ss_sp; 1084 unsigned long sas_ss_sp;
1085 size_t sas_ss_size; 1085 size_t sas_ss_size;
1086 int (*notifier)(void *priv); 1086 int (*notifier)(void *priv);
1087 void *notifier_data; 1087 void *notifier_data;
1088 sigset_t *notifier_mask; 1088 sigset_t *notifier_mask;
1089 1089
1090 void *security; 1090 void *security;
1091 struct audit_context *audit_context; 1091 struct audit_context *audit_context;
1092 seccomp_t seccomp; 1092 seccomp_t seccomp;
1093 1093
1094 /* Thread group tracking */ 1094 /* Thread group tracking */
1095 u32 parent_exec_id; 1095 u32 parent_exec_id;
1096 u32 self_exec_id; 1096 u32 self_exec_id;
1097 /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */ 1097 /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
1098 spinlock_t alloc_lock; 1098 spinlock_t alloc_lock;
1099 1099
1100 /* Protection of the PI data structures: */ 1100 /* Protection of the PI data structures: */
1101 spinlock_t pi_lock; 1101 spinlock_t pi_lock;
1102 1102
1103 #ifdef CONFIG_RT_MUTEXES 1103 #ifdef CONFIG_RT_MUTEXES
1104 /* PI waiters blocked on a rt_mutex held by this task */ 1104 /* PI waiters blocked on a rt_mutex held by this task */
1105 struct plist_head pi_waiters; 1105 struct plist_head pi_waiters;
1106 /* Deadlock detection and priority inheritance handling */ 1106 /* Deadlock detection and priority inheritance handling */
1107 struct rt_mutex_waiter *pi_blocked_on; 1107 struct rt_mutex_waiter *pi_blocked_on;
1108 #endif 1108 #endif
1109 1109
1110 #ifdef CONFIG_DEBUG_MUTEXES 1110 #ifdef CONFIG_DEBUG_MUTEXES
1111 /* mutex deadlock detection */ 1111 /* mutex deadlock detection */
1112 struct mutex_waiter *blocked_on; 1112 struct mutex_waiter *blocked_on;
1113 #endif 1113 #endif
1114 #ifdef CONFIG_TRACE_IRQFLAGS 1114 #ifdef CONFIG_TRACE_IRQFLAGS
1115 unsigned int irq_events; 1115 unsigned int irq_events;
1116 int hardirqs_enabled; 1116 int hardirqs_enabled;
1117 unsigned long hardirq_enable_ip; 1117 unsigned long hardirq_enable_ip;
1118 unsigned int hardirq_enable_event; 1118 unsigned int hardirq_enable_event;
1119 unsigned long hardirq_disable_ip; 1119 unsigned long hardirq_disable_ip;
1120 unsigned int hardirq_disable_event; 1120 unsigned int hardirq_disable_event;
1121 int softirqs_enabled; 1121 int softirqs_enabled;
1122 unsigned long softirq_disable_ip; 1122 unsigned long softirq_disable_ip;
1123 unsigned int softirq_disable_event; 1123 unsigned int softirq_disable_event;
1124 unsigned long softirq_enable_ip; 1124 unsigned long softirq_enable_ip;
1125 unsigned int softirq_enable_event; 1125 unsigned int softirq_enable_event;
1126 int hardirq_context; 1126 int hardirq_context;
1127 int softirq_context; 1127 int softirq_context;
1128 #endif 1128 #endif
1129 #ifdef CONFIG_LOCKDEP 1129 #ifdef CONFIG_LOCKDEP
1130 # define MAX_LOCK_DEPTH 30UL 1130 # define MAX_LOCK_DEPTH 30UL
1131 u64 curr_chain_key; 1131 u64 curr_chain_key;
1132 int lockdep_depth; 1132 int lockdep_depth;
1133 struct held_lock held_locks[MAX_LOCK_DEPTH]; 1133 struct held_lock held_locks[MAX_LOCK_DEPTH];
1134 unsigned int lockdep_recursion; 1134 unsigned int lockdep_recursion;
1135 #endif 1135 #endif
1136 1136
1137 /* journalling filesystem info */ 1137 /* journalling filesystem info */
1138 void *journal_info; 1138 void *journal_info;
1139 1139
1140 /* stacked block device info */ 1140 /* stacked block device info */
1141 struct bio *bio_list, **bio_tail; 1141 struct bio *bio_list, **bio_tail;
1142 1142
1143 /* VM state */ 1143 /* VM state */
1144 struct reclaim_state *reclaim_state; 1144 struct reclaim_state *reclaim_state;
1145 1145
1146 struct backing_dev_info *backing_dev_info; 1146 struct backing_dev_info *backing_dev_info;
1147 1147
1148 struct io_context *io_context; 1148 struct io_context *io_context;
1149 1149
1150 unsigned long ptrace_message; 1150 unsigned long ptrace_message;
1151 siginfo_t *last_siginfo; /* For ptrace use. */ 1151 siginfo_t *last_siginfo; /* For ptrace use. */
1152 /* 1152 /*
1153 * current io wait handle: wait queue entry to use for io waits 1153 * current io wait handle: wait queue entry to use for io waits
1154 * If this thread is processing aio, this points at the waitqueue 1154 * If this thread is processing aio, this points at the waitqueue
1155 * inside the currently handled kiocb. It may be NULL (i.e. default 1155 * inside the currently handled kiocb. It may be NULL (i.e. default
1156 * to a stack based synchronous wait) if it's doing sync IO. 1156 * to a stack based synchronous wait) if it's doing sync IO.
1157 */ 1157 */
1158 wait_queue_t *io_wait; 1158 wait_queue_t *io_wait;
1159 #ifdef CONFIG_TASK_XACCT 1159 #ifdef CONFIG_TASK_XACCT
1160 /* i/o counters (bytes read/written, #syscalls) */ 1160 /* i/o counters (bytes read/written, #syscalls) */
1161 u64 rchar, wchar, syscr, syscw; 1161 u64 rchar, wchar, syscr, syscw;
1162 #endif 1162 #endif
1163 struct task_io_accounting ioac; 1163 struct task_io_accounting ioac;
1164 #if defined(CONFIG_TASK_XACCT) 1164 #if defined(CONFIG_TASK_XACCT)
1165 u64 acct_rss_mem1; /* accumulated rss usage */ 1165 u64 acct_rss_mem1; /* accumulated rss usage */
1166 u64 acct_vm_mem1; /* accumulated virtual memory usage */ 1166 u64 acct_vm_mem1; /* accumulated virtual memory usage */
1167 cputime_t acct_stimexpd;/* stime since last update */ 1167 cputime_t acct_stimexpd;/* stime since last update */
1168 #endif 1168 #endif
1169 #ifdef CONFIG_NUMA 1169 #ifdef CONFIG_NUMA
1170 struct mempolicy *mempolicy; 1170 struct mempolicy *mempolicy;
1171 short il_next; 1171 short il_next;
1172 #endif 1172 #endif
1173 #ifdef CONFIG_CPUSETS 1173 #ifdef CONFIG_CPUSETS
1174 struct cpuset *cpuset; 1174 struct cpuset *cpuset;
1175 nodemask_t mems_allowed; 1175 nodemask_t mems_allowed;
1176 int cpuset_mems_generation; 1176 int cpuset_mems_generation;
1177 int cpuset_mem_spread_rotor; 1177 int cpuset_mem_spread_rotor;
1178 #endif 1178 #endif
1179 struct robust_list_head __user *robust_list; 1179 struct robust_list_head __user *robust_list;
1180 #ifdef CONFIG_COMPAT 1180 #ifdef CONFIG_COMPAT
1181 struct compat_robust_list_head __user *compat_robust_list; 1181 struct compat_robust_list_head __user *compat_robust_list;
1182 #endif 1182 #endif
1183 struct list_head pi_state_list; 1183 struct list_head pi_state_list;
1184 struct futex_pi_state *pi_state_cache; 1184 struct futex_pi_state *pi_state_cache;
1185 1185
1186 atomic_t fs_excl; /* holding fs exclusive resources */ 1186 atomic_t fs_excl; /* holding fs exclusive resources */
1187 struct rcu_head rcu; 1187 struct rcu_head rcu;
1188 1188
1189 /* 1189 /*
1190 * cache last used pipe for splice 1190 * cache last used pipe for splice
1191 */ 1191 */
1192 struct pipe_inode_info *splice_pipe; 1192 struct pipe_inode_info *splice_pipe;
1193 #ifdef CONFIG_TASK_DELAY_ACCT 1193 #ifdef CONFIG_TASK_DELAY_ACCT
1194 struct task_delay_info *delays; 1194 struct task_delay_info *delays;
1195 #endif 1195 #endif
1196 #ifdef CONFIG_FAULT_INJECTION 1196 #ifdef CONFIG_FAULT_INJECTION
1197 int make_it_fail; 1197 int make_it_fail;
1198 #endif 1198 #endif
1199 }; 1199 };
1200 1200
1201 /* 1201 /*
1202 * Priority of a process goes from 0..MAX_PRIO-1, valid RT 1202 * Priority of a process goes from 0..MAX_PRIO-1, valid RT
1203 * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH 1203 * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
1204 * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority 1204 * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
1205 * values are inverted: lower p->prio value means higher priority. 1205 * values are inverted: lower p->prio value means higher priority.
1206 * 1206 *
1207 * The MAX_USER_RT_PRIO value allows the actual maximum 1207 * The MAX_USER_RT_PRIO value allows the actual maximum
1208 * RT priority to be separate from the value exported to 1208 * RT priority to be separate from the value exported to
1209 * user-space. This allows kernel threads to set their 1209 * user-space. This allows kernel threads to set their
1210 * priority to a value higher than any user task. Note: 1210 * priority to a value higher than any user task. Note:
1211 * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. 1211 * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
1212 */ 1212 */
1213 1213
1214 #define MAX_USER_RT_PRIO 100 1214 #define MAX_USER_RT_PRIO 100
1215 #define MAX_RT_PRIO MAX_USER_RT_PRIO 1215 #define MAX_RT_PRIO MAX_USER_RT_PRIO
1216 1216
1217 #define MAX_PRIO (MAX_RT_PRIO + 40) 1217 #define MAX_PRIO (MAX_RT_PRIO + 40)
1218 #define DEFAULT_PRIO (MAX_RT_PRIO + 20) 1218 #define DEFAULT_PRIO (MAX_RT_PRIO + 20)
1219 1219
1220 static inline int rt_prio(int prio) 1220 static inline int rt_prio(int prio)
1221 { 1221 {
1222 if (unlikely(prio < MAX_RT_PRIO)) 1222 if (unlikely(prio < MAX_RT_PRIO))
1223 return 1; 1223 return 1;
1224 return 0; 1224 return 0;
1225 } 1225 }
1226 1226
1227 static inline int rt_task(struct task_struct *p) 1227 static inline int rt_task(struct task_struct *p)
1228 { 1228 {
1229 return rt_prio(p->prio); 1229 return rt_prio(p->prio);
1230 } 1230 }
1231 1231
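A small worked illustration of the ranges described in the comment above (the helper below is illustrative, not part of this header): with MAX_RT_PRIO = 100 and MAX_PRIO = 140, nice values -20..19 land on p->prio 100..139, DEFAULT_PRIO (120) corresponds to nice 0, and anything below 100 is real-time.

/* Illustrative only: where a SCHED_NORMAL task's nice value lands. */
static inline int nice_to_prio_example(long nice)
{
        return MAX_RT_PRIO + 20 + nice;         /* nice -20..19 -> prio 100..139 */
}

/*
 * nice_to_prio_example(0)   == DEFAULT_PRIO (120)
 * nice_to_prio_example(-20) == MAX_RT_PRIO  (100), lowest non-RT prio value
 * rt_prio(99) == 1, rt_prio(100) == 0
 */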
1232 static inline pid_t process_group(struct task_struct *tsk) 1232 static inline pid_t process_group(struct task_struct *tsk)
1233 { 1233 {
1234 return tsk->signal->pgrp; 1234 return tsk->signal->pgrp;
1235 } 1235 }
1236 1236
1237 static inline pid_t signal_session(struct signal_struct *sig) 1237 static inline pid_t signal_session(struct signal_struct *sig)
1238 { 1238 {
1239 return sig->__session; 1239 return sig->__session;
1240 } 1240 }
1241 1241
1242 static inline pid_t process_session(struct task_struct *tsk) 1242 static inline pid_t process_session(struct task_struct *tsk)
1243 { 1243 {
1244 return signal_session(tsk->signal); 1244 return signal_session(tsk->signal);
1245 } 1245 }
1246 1246
1247 static inline void set_signal_session(struct signal_struct *sig, pid_t session) 1247 static inline void set_signal_session(struct signal_struct *sig, pid_t session)
1248 { 1248 {
1249 sig->__session = session; 1249 sig->__session = session;
1250 } 1250 }
1251 1251
1252 static inline struct pid *task_pid(struct task_struct *task) 1252 static inline struct pid *task_pid(struct task_struct *task)
1253 { 1253 {
1254 return task->pids[PIDTYPE_PID].pid; 1254 return task->pids[PIDTYPE_PID].pid;
1255 } 1255 }
1256 1256
1257 static inline struct pid *task_tgid(struct task_struct *task) 1257 static inline struct pid *task_tgid(struct task_struct *task)
1258 { 1258 {
1259 return task->group_leader->pids[PIDTYPE_PID].pid; 1259 return task->group_leader->pids[PIDTYPE_PID].pid;
1260 } 1260 }
1261 1261
1262 static inline struct pid *task_pgrp(struct task_struct *task) 1262 static inline struct pid *task_pgrp(struct task_struct *task)
1263 { 1263 {
1264 return task->group_leader->pids[PIDTYPE_PGID].pid; 1264 return task->group_leader->pids[PIDTYPE_PGID].pid;
1265 } 1265 }
1266 1266
1267 static inline struct pid *task_session(struct task_struct *task) 1267 static inline struct pid *task_session(struct task_struct *task)
1268 { 1268 {
1269 return task->group_leader->pids[PIDTYPE_SID].pid; 1269 return task->group_leader->pids[PIDTYPE_SID].pid;
1270 } 1270 }
1271 1271
1272 /** 1272 /**
1273 * pid_alive - check that a task structure is not stale 1273 * pid_alive - check that a task structure is not stale
1274 * @p: Task structure to be checked. 1274 * @p: Task structure to be checked.
1275 * 1275 *
1276 * Test if a process is not yet dead (at most zombie state) 1276 * Test if a process is not yet dead (at most zombie state)
1277 * If pid_alive fails, then pointers within the task structure 1277 * If pid_alive fails, then pointers within the task structure
1278 * can be stale and must not be dereferenced. 1278 * can be stale and must not be dereferenced.
1279 */ 1279 */
1280 static inline int pid_alive(struct task_struct *p) 1280 static inline int pid_alive(struct task_struct *p)
1281 { 1281 {
1282 return p->pids[PIDTYPE_PID].pid != NULL; 1282 return p->pids[PIDTYPE_PID].pid != NULL;
1283 } 1283 }
1284 1284
1285 /** 1285 /**
1286 * is_init - check if a task structure is init 1286 * is_init - check if a task structure is init
1287 * @tsk: Task structure to be checked. 1287 * @tsk: Task structure to be checked.
1288 * 1288 *
1289 * Check if a task structure is the first user space task the kernel created. 1289 * Check if a task structure is the first user space task the kernel created.
1290 */ 1290 */
1291 static inline int is_init(struct task_struct *tsk) 1291 static inline int is_init(struct task_struct *tsk)
1292 { 1292 {
1293 return tsk->pid == 1; 1293 return tsk->pid == 1;
1294 } 1294 }
1295 1295
1296 extern struct pid *cad_pid; 1296 extern struct pid *cad_pid;
1297 1297
1298 extern void free_task(struct task_struct *tsk); 1298 extern void free_task(struct task_struct *tsk);
1299 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) 1299 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
1300 1300
1301 extern void __put_task_struct(struct task_struct *t); 1301 extern void __put_task_struct(struct task_struct *t);
1302 1302
1303 static inline void put_task_struct(struct task_struct *t) 1303 static inline void put_task_struct(struct task_struct *t)
1304 { 1304 {
1305 if (atomic_dec_and_test(&t->usage)) 1305 if (atomic_dec_and_test(&t->usage))
1306 __put_task_struct(t); 1306 __put_task_struct(t);
1307 } 1307 }
1308 1308
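A minimal sketch of the reference pattern behind get_task_struct()/put_task_struct(): look the task up under the tasklist lock, bump its usage count before dropping the lock, and have the caller finish with put_task_struct(). The function name grab_task() is made up; find_task_by_pid() is the lookup declared further down in this header.

/* Sketch: pin a task_struct so it outlives the tasklist lock. */
static struct task_struct *grab_task(pid_t pid)
{
        struct task_struct *p;

        read_lock(&tasklist_lock);
        p = find_task_by_pid(pid);
        if (p)
                get_task_struct(p);     /* usage count, not PID liveness */
        read_unlock(&tasklist_lock);

        return p;                       /* caller ends with put_task_struct(p) */
}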
1309 /* 1309 /*
1310 * Per process flags 1310 * Per process flags
1311 */ 1311 */
1312 #define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */ 1312 #define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */
1313 /* Not implemented yet, only for 486*/ 1313 /* Not implemented yet, only for 486*/
1314 #define PF_STARTING 0x00000002 /* being created */ 1314 #define PF_STARTING 0x00000002 /* being created */
1315 #define PF_EXITING 0x00000004 /* getting shut down */ 1315 #define PF_EXITING 0x00000004 /* getting shut down */
1316 #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ 1316 #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
1317 #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ 1317 #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
1318 #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ 1318 #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
1319 #define PF_DUMPCORE 0x00000200 /* dumped core */ 1319 #define PF_DUMPCORE 0x00000200 /* dumped core */
1320 #define PF_SIGNALED 0x00000400 /* killed by a signal */ 1320 #define PF_SIGNALED 0x00000400 /* killed by a signal */
1321 #define PF_MEMALLOC 0x00000800 /* Allocating memory */ 1321 #define PF_MEMALLOC 0x00000800 /* Allocating memory */
1322 #define PF_FLUSHER 0x00001000 /* responsible for disk writeback */ 1322 #define PF_FLUSHER 0x00001000 /* responsible for disk writeback */
1323 #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ 1323 #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */
1324 #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ 1324 #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */
1325 #define PF_FROZEN 0x00010000 /* frozen for system suspend */ 1325 #define PF_FROZEN 0x00010000 /* frozen for system suspend */
1326 #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ 1326 #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */
1327 #define PF_KSWAPD 0x00040000 /* I am kswapd */ 1327 #define PF_KSWAPD 0x00040000 /* I am kswapd */
1328 #define PF_SWAPOFF 0x00080000 /* I am in swapoff */ 1328 #define PF_SWAPOFF 0x00080000 /* I am in swapoff */
1329 #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ 1329 #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
1330 #define PF_BORROWED_MM 0x00200000 /* I am a kthread doing use_mm */ 1330 #define PF_BORROWED_MM 0x00200000 /* I am a kthread doing use_mm */
1331 #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ 1331 #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */
1332 #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ 1332 #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
1333 #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ 1333 #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
1334 #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ 1334 #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
1335 #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ 1335 #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
1336 #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ 1336 #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
1337 #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ 1337 #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */
1338 1338
1339 /* 1339 /*
1340 * Only the _current_ task can read/write to tsk->flags, but other 1340 * Only the _current_ task can read/write to tsk->flags, but other
1341 * tasks can access tsk->flags in readonly mode for example 1341 * tasks can access tsk->flags in readonly mode for example
1342 * with tsk_used_math (like during threaded core dumping). 1342 * with tsk_used_math (like during threaded core dumping).
1343 * There is however an exception to this rule during ptrace 1343 * There is however an exception to this rule during ptrace
1344 * or during fork: the ptracer task is allowed to write to the 1344 * or during fork: the ptracer task is allowed to write to the
1345 * child->flags of its traced child (same goes for fork, the parent 1345 * child->flags of its traced child (same goes for fork, the parent
1346 * can write to the child->flags), because we're guaranteed the 1346 * can write to the child->flags), because we're guaranteed the
1347 * child is not running and in turn not changing child->flags 1347 * child is not running and in turn not changing child->flags
1348 * at the same time the parent does it. 1348 * at the same time the parent does it.
1349 */ 1349 */
1350 #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) 1350 #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0)
1351 #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) 1351 #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0)
1352 #define clear_used_math() clear_stopped_child_used_math(current) 1352 #define clear_used_math() clear_stopped_child_used_math(current)
1353 #define set_used_math() set_stopped_child_used_math(current) 1353 #define set_used_math() set_stopped_child_used_math(current)
1354 #define conditional_stopped_child_used_math(condition, child) \ 1354 #define conditional_stopped_child_used_math(condition, child) \
1355 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) 1355 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)
1356 #define conditional_used_math(condition) \ 1356 #define conditional_used_math(condition) \
1357 conditional_stopped_child_used_math(condition, current) 1357 conditional_stopped_child_used_math(condition, current)
1358 #define copy_to_stopped_child_used_math(child) \ 1358 #define copy_to_stopped_child_used_math(child) \
1359 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) 1359 do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)
1360 /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ 1360 /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
1361 #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) 1361 #define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
1362 #define used_math() tsk_used_math(current) 1362 #define used_math() tsk_used_math(current)
1363 1363
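A short sketch of the rule in the comment above: the owning task flips its own PF_USED_MATH bit through the current-task macros, while other tasks only ever read it. Both function names below are made up for illustration.

/* Sketch: only 'current' writes its own flags; others read. */
static void fpu_first_use(void)                 /* runs as 'current' */
{
        if (!used_math())
                set_used_math();                /* PF_USED_MATH on current */
}

static int task_needs_fpu_save(struct task_struct *p)
{
        return tsk_used_math(p) != 0;           /* read-only peek is fine */
}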
1364 #ifdef CONFIG_SMP 1364 #ifdef CONFIG_SMP
1365 extern int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask); 1365 extern int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask);
1366 #else 1366 #else
1367 static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) 1367 static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
1368 { 1368 {
1369 if (!cpu_isset(0, new_mask)) 1369 if (!cpu_isset(0, new_mask))
1370 return -EINVAL; 1370 return -EINVAL;
1371 return 0; 1371 return 0;
1372 } 1372 }
1373 #endif 1373 #endif
1374 1374
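A minimal usage sketch: pin a task to a single CPU, matching the UP stub above which only accepts masks containing CPU 0. The wrapper name pin_task_to_cpu() is made up; cpumask_of_cpu() is the single-CPU mask helper of this era from linux/cpumask.h.

/* Sketch: restrict a task to one CPU. */
static int pin_task_to_cpu(struct task_struct *p, unsigned int cpu)
{
        return set_cpus_allowed(p, cpumask_of_cpu(cpu));  /* 0 or -EINVAL */
}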
1375 extern unsigned long long sched_clock(void); 1375 extern unsigned long long sched_clock(void);
1376 1376
1377 /* 1377 /*
1378 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 1378 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
1379 * clock constructed from sched_clock(): 1379 * clock constructed from sched_clock():
1380 */ 1380 */
1381 extern unsigned long long cpu_clock(int cpu); 1381 extern unsigned long long cpu_clock(int cpu);
1382 1382
1383 extern unsigned long long 1383 extern unsigned long long
1384 task_sched_runtime(struct task_struct *task); 1384 task_sched_runtime(struct task_struct *task);
1385 1385
1386 /* sched_exec is called by processes performing an exec */ 1386 /* sched_exec is called by processes performing an exec */
1387 #ifdef CONFIG_SMP 1387 #ifdef CONFIG_SMP
1388 extern void sched_exec(void); 1388 extern void sched_exec(void);
1389 #else 1389 #else
1390 #define sched_exec() {} 1390 #define sched_exec() {}
1391 #endif 1391 #endif
1392 1392
1393 extern void sched_clock_unstable_event(void); 1393 extern void sched_clock_unstable_event(void);
1394 1394
1395 #ifdef CONFIG_HOTPLUG_CPU 1395 #ifdef CONFIG_HOTPLUG_CPU
1396 extern void idle_task_exit(void); 1396 extern void idle_task_exit(void);
1397 #else 1397 #else
1398 static inline void idle_task_exit(void) {} 1398 static inline void idle_task_exit(void) {}
1399 #endif 1399 #endif
1400 1400
1401 extern void sched_idle_next(void); 1401 extern void sched_idle_next(void);
1402 1402
1403 extern unsigned int sysctl_sched_granularity; 1403 extern unsigned int sysctl_sched_granularity;
1404 extern unsigned int sysctl_sched_wakeup_granularity; 1404 extern unsigned int sysctl_sched_wakeup_granularity;
1405 extern unsigned int sysctl_sched_batch_wakeup_granularity; 1405 extern unsigned int sysctl_sched_batch_wakeup_granularity;
1406 extern unsigned int sysctl_sched_stat_granularity; 1406 extern unsigned int sysctl_sched_stat_granularity;
1407 extern unsigned int sysctl_sched_runtime_limit; 1407 extern unsigned int sysctl_sched_runtime_limit;
1408 extern unsigned int sysctl_sched_child_runs_first; 1408 extern unsigned int sysctl_sched_child_runs_first;
1409 extern unsigned int sysctl_sched_features; 1409 extern unsigned int sysctl_sched_features;
1410 1410
1411 #ifdef CONFIG_RT_MUTEXES 1411 #ifdef CONFIG_RT_MUTEXES
1412 extern int rt_mutex_getprio(struct task_struct *p); 1412 extern int rt_mutex_getprio(struct task_struct *p);
1413 extern void rt_mutex_setprio(struct task_struct *p, int prio); 1413 extern void rt_mutex_setprio(struct task_struct *p, int prio);
1414 extern void rt_mutex_adjust_pi(struct task_struct *p); 1414 extern void rt_mutex_adjust_pi(struct task_struct *p);
1415 #else 1415 #else
1416 static inline int rt_mutex_getprio(struct task_struct *p) 1416 static inline int rt_mutex_getprio(struct task_struct *p)
1417 { 1417 {
1418 return p->normal_prio; 1418 return p->normal_prio;
1419 } 1419 }
1420 # define rt_mutex_adjust_pi(p) do { } while (0) 1420 # define rt_mutex_adjust_pi(p) do { } while (0)
1421 #endif 1421 #endif
1422 1422
1423 extern void set_user_nice(struct task_struct *p, long nice); 1423 extern void set_user_nice(struct task_struct *p, long nice);
1424 extern int task_prio(const struct task_struct *p); 1424 extern int task_prio(const struct task_struct *p);
1425 extern int task_nice(const struct task_struct *p); 1425 extern int task_nice(const struct task_struct *p);
1426 extern int can_nice(const struct task_struct *p, const int nice); 1426 extern int can_nice(const struct task_struct *p, const int nice);
1427 extern int task_curr(const struct task_struct *p); 1427 extern int task_curr(const struct task_struct *p);
1428 extern int idle_cpu(int cpu); 1428 extern int idle_cpu(int cpu);
1429 extern int sched_setscheduler(struct task_struct *, int, struct sched_param *); 1429 extern int sched_setscheduler(struct task_struct *, int, struct sched_param *);
1430 extern struct task_struct *idle_task(int cpu); 1430 extern struct task_struct *idle_task(int cpu);
1431 extern struct task_struct *curr_task(int cpu); 1431 extern struct task_struct *curr_task(int cpu);
1432 extern void set_curr_task(int cpu, struct task_struct *p); 1432 extern void set_curr_task(int cpu, struct task_struct *p);
1433 1433
1434 void yield(void); 1434 void yield(void);
1435 1435
1436 /* 1436 /*
1437 * The default (Linux) execution domain. 1437 * The default (Linux) execution domain.
1438 */ 1438 */
1439 extern struct exec_domain default_exec_domain; 1439 extern struct exec_domain default_exec_domain;
1440 1440
1441 union thread_union { 1441 union thread_union {
1442 struct thread_info thread_info; 1442 struct thread_info thread_info;
1443 unsigned long stack[THREAD_SIZE/sizeof(long)]; 1443 unsigned long stack[THREAD_SIZE/sizeof(long)];
1444 }; 1444 };
1445 1445
1446 #ifndef __HAVE_ARCH_KSTACK_END 1446 #ifndef __HAVE_ARCH_KSTACK_END
1447 static inline int kstack_end(void *addr) 1447 static inline int kstack_end(void *addr)
1448 { 1448 {
1449 /* Reliable end of stack detection: 1449 /* Reliable end of stack detection:
1450 * Some APM bios versions misalign the stack 1450 * Some APM bios versions misalign the stack
1451 */ 1451 */
1452 return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); 1452 return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*)));
1453 } 1453 }
1454 #endif 1454 #endif
1455 1455
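A hedged sketch of how stack walkers of this era use kstack_end(): scan machine words upward from a stack pointer until the helper reports the aligned end of the THREAD_SIZE stack. The printing is illustrative only.

/* Sketch: dump raw stack words up to the end of the kernel stack. */
static void dump_raw_stack(unsigned long *sp)
{
        while (!kstack_end(sp))
                printk(" %08lx", *sp++);
        printk("\n");
}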
1456 extern union thread_union init_thread_union; 1456 extern union thread_union init_thread_union;
1457 extern struct task_struct init_task; 1457 extern struct task_struct init_task;
1458 1458
1459 extern struct mm_struct init_mm; 1459 extern struct mm_struct init_mm;
1460 1460
1461 #define find_task_by_pid(nr) find_task_by_pid_type(PIDTYPE_PID, nr) 1461 #define find_task_by_pid(nr) find_task_by_pid_type(PIDTYPE_PID, nr)
1462 extern struct task_struct *find_task_by_pid_type(int type, int pid); 1462 extern struct task_struct *find_task_by_pid_type(int type, int pid);
1463 extern void __set_special_pids(pid_t session, pid_t pgrp); 1463 extern void __set_special_pids(pid_t session, pid_t pgrp);
1464 1464
1465 /* per-UID process charging. */ 1465 /* per-UID process charging. */
1466 extern struct user_struct * alloc_uid(struct user_namespace *, uid_t); 1466 extern struct user_struct * alloc_uid(struct user_namespace *, uid_t);
1467 static inline struct user_struct *get_uid(struct user_struct *u) 1467 static inline struct user_struct *get_uid(struct user_struct *u)
1468 { 1468 {
1469 atomic_inc(&u->__count); 1469 atomic_inc(&u->__count);
1470 return u; 1470 return u;
1471 } 1471 }
1472 extern void free_uid(struct user_struct *); 1472 extern void free_uid(struct user_struct *);
1473 extern void switch_uid(struct user_struct *); 1473 extern void switch_uid(struct user_struct *);
1474 1474
1475 #include <asm/current.h> 1475 #include <asm/current.h>
1476 1476
1477 extern void do_timer(unsigned long ticks); 1477 extern void do_timer(unsigned long ticks);
1478 1478
1479 extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state)); 1479 extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state));
1480 extern int FASTCALL(wake_up_process(struct task_struct * tsk)); 1480 extern int FASTCALL(wake_up_process(struct task_struct * tsk));
1481 extern void FASTCALL(wake_up_new_task(struct task_struct * tsk, 1481 extern void FASTCALL(wake_up_new_task(struct task_struct * tsk,
1482 unsigned long clone_flags)); 1482 unsigned long clone_flags));
1483 #ifdef CONFIG_SMP 1483 #ifdef CONFIG_SMP
1484 extern void kick_process(struct task_struct *tsk); 1484 extern void kick_process(struct task_struct *tsk);
1485 #else 1485 #else
1486 static inline void kick_process(struct task_struct *tsk) { } 1486 static inline void kick_process(struct task_struct *tsk) { }
1487 #endif 1487 #endif
1488 extern void sched_fork(struct task_struct *p, int clone_flags); 1488 extern void sched_fork(struct task_struct *p, int clone_flags);
1489 extern void sched_dead(struct task_struct *p); 1489 extern void sched_dead(struct task_struct *p);
1490 1490
1491 extern int in_group_p(gid_t); 1491 extern int in_group_p(gid_t);
1492 extern int in_egroup_p(gid_t); 1492 extern int in_egroup_p(gid_t);
1493 1493
1494 extern void proc_caches_init(void); 1494 extern void proc_caches_init(void);
1495 extern void flush_signals(struct task_struct *); 1495 extern void flush_signals(struct task_struct *);
1496 extern void ignore_signals(struct task_struct *); 1496 extern void ignore_signals(struct task_struct *);
1497 extern void flush_signal_handlers(struct task_struct *, int force_default); 1497 extern void flush_signal_handlers(struct task_struct *, int force_default);
1498 extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); 1498 extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
1499 1499
1500 static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) 1500 static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
1501 { 1501 {
1502 unsigned long flags; 1502 unsigned long flags;
1503 int ret; 1503 int ret;
1504 1504
1505 spin_lock_irqsave(&tsk->sighand->siglock, flags); 1505 spin_lock_irqsave(&tsk->sighand->siglock, flags);
1506 ret = dequeue_signal(tsk, mask, info); 1506 ret = dequeue_signal(tsk, mask, info);
1507 spin_unlock_irqrestore(&tsk->sighand->siglock, flags); 1507 spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
1508 1508
1509 return ret; 1509 return ret;
1510 } 1510 }
1511 1511
1512 extern void block_all_signals(int (*notifier)(void *priv), void *priv, 1512 extern void block_all_signals(int (*notifier)(void *priv), void *priv,
1513 sigset_t *mask); 1513 sigset_t *mask);
1514 extern void unblock_all_signals(void); 1514 extern void unblock_all_signals(void);
1515 extern void release_task(struct task_struct * p); 1515 extern void release_task(struct task_struct * p);
1516 extern int send_sig_info(int, struct siginfo *, struct task_struct *); 1516 extern int send_sig_info(int, struct siginfo *, struct task_struct *);
1517 extern int send_group_sig_info(int, struct siginfo *, struct task_struct *); 1517 extern int send_group_sig_info(int, struct siginfo *, struct task_struct *);
1518 extern int force_sigsegv(int, struct task_struct *); 1518 extern int force_sigsegv(int, struct task_struct *);
1519 extern int force_sig_info(int, struct siginfo *, struct task_struct *); 1519 extern int force_sig_info(int, struct siginfo *, struct task_struct *);
1520 extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); 1520 extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
1521 extern int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); 1521 extern int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
1522 extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid); 1522 extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
1523 extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_t, u32); 1523 extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_t, u32);
1524 extern int kill_pgrp(struct pid *pid, int sig, int priv); 1524 extern int kill_pgrp(struct pid *pid, int sig, int priv);
1525 extern int kill_pid(struct pid *pid, int sig, int priv); 1525 extern int kill_pid(struct pid *pid, int sig, int priv);
1526 extern int kill_proc_info(int, struct siginfo *, pid_t); 1526 extern int kill_proc_info(int, struct siginfo *, pid_t);
1527 extern void do_notify_parent(struct task_struct *, int); 1527 extern void do_notify_parent(struct task_struct *, int);
1528 extern void force_sig(int, struct task_struct *); 1528 extern void force_sig(int, struct task_struct *);
1529 extern void force_sig_specific(int, struct task_struct *); 1529 extern void force_sig_specific(int, struct task_struct *);
1530 extern int send_sig(int, struct task_struct *, int); 1530 extern int send_sig(int, struct task_struct *, int);
1531 extern void zap_other_threads(struct task_struct *p); 1531 extern void zap_other_threads(struct task_struct *p);
1532 extern int kill_proc(pid_t, int, int); 1532 extern int kill_proc(pid_t, int, int);
1533 extern struct sigqueue *sigqueue_alloc(void); 1533 extern struct sigqueue *sigqueue_alloc(void);
1534 extern void sigqueue_free(struct sigqueue *); 1534 extern void sigqueue_free(struct sigqueue *);
1535 extern int send_sigqueue(int, struct sigqueue *, struct task_struct *); 1535 extern int send_sigqueue(int, struct sigqueue *, struct task_struct *);
1536 extern int send_group_sigqueue(int, struct sigqueue *, struct task_struct *); 1536 extern int send_group_sigqueue(int, struct sigqueue *, struct task_struct *);
1537 extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); 1537 extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
1538 extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long); 1538 extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long);
1539 1539
1540 static inline int kill_cad_pid(int sig, int priv) 1540 static inline int kill_cad_pid(int sig, int priv)
1541 { 1541 {
1542 return kill_pid(cad_pid, sig, priv); 1542 return kill_pid(cad_pid, sig, priv);
1543 } 1543 }
1544 1544
1545 /* These can be the second arg to send_sig_info/send_group_sig_info. */ 1545 /* These can be the second arg to send_sig_info/send_group_sig_info. */
1546 #define SEND_SIG_NOINFO ((struct siginfo *) 0) 1546 #define SEND_SIG_NOINFO ((struct siginfo *) 0)
1547 #define SEND_SIG_PRIV ((struct siginfo *) 1) 1547 #define SEND_SIG_PRIV ((struct siginfo *) 1)
1548 #define SEND_SIG_FORCED ((struct siginfo *) 2) 1548 #define SEND_SIG_FORCED ((struct siginfo *) 2)
1549 1549
1550 static inline int is_si_special(const struct siginfo *info) 1550 static inline int is_si_special(const struct siginfo *info)
1551 { 1551 {
1552 return info <= SEND_SIG_FORCED; 1552 return info <= SEND_SIG_FORCED;
1553 } 1553 }
1554 1554
1555 /* True if we are on the alternate signal stack. */ 1555 /* True if we are on the alternate signal stack. */
1556 1556
1557 static inline int on_sig_stack(unsigned long sp) 1557 static inline int on_sig_stack(unsigned long sp)
1558 { 1558 {
1559 return (sp - current->sas_ss_sp < current->sas_ss_size); 1559 return (sp - current->sas_ss_sp < current->sas_ss_size);
1560 } 1560 }
1561 1561
1562 static inline int sas_ss_flags(unsigned long sp) 1562 static inline int sas_ss_flags(unsigned long sp)
1563 { 1563 {
1564 return (current->sas_ss_size == 0 ? SS_DISABLE 1564 return (current->sas_ss_size == 0 ? SS_DISABLE
1565 : on_sig_stack(sp) ? SS_ONSTACK : 0); 1565 : on_sig_stack(sp) ? SS_ONSTACK : 0);
1566 } 1566 }
1567 1567
1568 /* 1568 /*
1569 * Routines for handling mm_structs 1569 * Routines for handling mm_structs
1570 */ 1570 */
1571 extern struct mm_struct * mm_alloc(void); 1571 extern struct mm_struct * mm_alloc(void);
1572 1572
1573 /* mmdrop drops the mm and the page tables */ 1573 /* mmdrop drops the mm and the page tables */
1574 extern void FASTCALL(__mmdrop(struct mm_struct *)); 1574 extern void FASTCALL(__mmdrop(struct mm_struct *));
1575 static inline void mmdrop(struct mm_struct * mm) 1575 static inline void mmdrop(struct mm_struct * mm)
1576 { 1576 {
1577 if (unlikely(atomic_dec_and_test(&mm->mm_count))) 1577 if (unlikely(atomic_dec_and_test(&mm->mm_count)))
1578 __mmdrop(mm); 1578 __mmdrop(mm);
1579 } 1579 }
1580 1580
1581 /* mmput gets rid of the mappings and all user-space */ 1581 /* mmput gets rid of the mappings and all user-space */
1582 extern void mmput(struct mm_struct *); 1582 extern void mmput(struct mm_struct *);
1583 /* Grab a reference to a task's mm, if it is not already going away */ 1583 /* Grab a reference to a task's mm, if it is not already going away */
1584 extern struct mm_struct *get_task_mm(struct task_struct *task); 1584 extern struct mm_struct *get_task_mm(struct task_struct *task);
1585 /* Remove the current task's stale references to the old mm_struct */ 1585 /* Remove the current task's stale references to the old mm_struct */
1586 extern void mm_release(struct task_struct *, struct mm_struct *); 1586 extern void mm_release(struct task_struct *, struct mm_struct *);
1587 1587
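A minimal sketch of the mm reference pattern behind these helpers: get_task_mm() returns NULL if the task's address space is already going away, and every successful call must be paired with mmput(). The function name task_total_vm() is made up for illustration.

/* Sketch: borrow another task's mm safely. */
static unsigned long task_total_vm(struct task_struct *task)
{
        struct mm_struct *mm = get_task_mm(task);
        unsigned long total_vm = 0;

        if (mm) {
                total_vm = mm->total_vm;        /* pages mapped by the task */
                mmput(mm);                      /* drop the mm_users reference */
        }
        return total_vm;
}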
1588 extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); 1588 extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
1589 extern void flush_thread(void); 1589 extern void flush_thread(void);
1590 extern void exit_thread(void); 1590 extern void exit_thread(void);
1591 1591
1592 extern void exit_files(struct task_struct *); 1592 extern void exit_files(struct task_struct *);
1593 extern void __cleanup_signal(struct signal_struct *); 1593 extern void __cleanup_signal(struct signal_struct *);
1594 extern void __cleanup_sighand(struct sighand_struct *); 1594 extern void __cleanup_sighand(struct sighand_struct *);
1595 extern void exit_itimers(struct signal_struct *); 1595 extern void exit_itimers(struct signal_struct *);
1596 1596
1597 extern NORET_TYPE void do_group_exit(int); 1597 extern NORET_TYPE void do_group_exit(int);
1598 1598
1599 extern void daemonize(const char *, ...); 1599 extern void daemonize(const char *, ...);
1600 extern int allow_signal(int); 1600 extern int allow_signal(int);
1601 extern int disallow_signal(int); 1601 extern int disallow_signal(int);
1602 1602
1603 extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *); 1603 extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *);
1604 extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); 1604 extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
1605 struct task_struct *fork_idle(int); 1605 struct task_struct *fork_idle(int);
1606 1606
1607 extern void set_task_comm(struct task_struct *tsk, char *from); 1607 extern void set_task_comm(struct task_struct *tsk, char *from);
1608 extern void get_task_comm(char *to, struct task_struct *tsk); 1608 extern void get_task_comm(char *to, struct task_struct *tsk);
1609 1609
1610 #ifdef CONFIG_SMP 1610 #ifdef CONFIG_SMP
1611 extern void wait_task_inactive(struct task_struct * p); 1611 extern void wait_task_inactive(struct task_struct * p);
1612 #else 1612 #else
1613 #define wait_task_inactive(p) do { } while (0) 1613 #define wait_task_inactive(p) do { } while (0)
1614 #endif 1614 #endif
1615 1615
1616 #define remove_parent(p) list_del_init(&(p)->sibling) 1616 #define remove_parent(p) list_del_init(&(p)->sibling)
1617 #define add_parent(p) list_add_tail(&(p)->sibling,&(p)->parent->children) 1617 #define add_parent(p) list_add_tail(&(p)->sibling,&(p)->parent->children)
1618 1618
1619 #define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks) 1619 #define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)
1620 1620
1621 #define for_each_process(p) \ 1621 #define for_each_process(p) \
1622 for (p = &init_task ; (p = next_task(p)) != &init_task ; ) 1622 for (p = &init_task ; (p = next_task(p)) != &init_task ; )
1623 1623
1624 /* 1624 /*
1625 * Careful: do_each_thread/while_each_thread is a double loop so 1625 * Careful: do_each_thread/while_each_thread is a double loop so
1626 * 'break' will not work as expected - use goto instead. 1626 * 'break' will not work as expected - use goto instead.
1627 */ 1627 */
1628 #define do_each_thread(g, t) \ 1628 #define do_each_thread(g, t) \
1629 for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do 1629 for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do
1630 1630
1631 #define while_each_thread(g, t) \ 1631 #define while_each_thread(g, t) \
1632 while ((t = next_thread(t)) != g) 1632 while ((t = next_thread(t)) != g)
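Because do_each_thread()/while_each_thread() expand to a nested loop, a plain break only leaves the inner loop; below is a sketch of the goto idiom the comment above asks for (illustrative only; the function and its match callback are hypothetical):

static pid_t find_matching_thread(int (*match)(struct task_struct *))
{
	struct task_struct *g, *t;
	pid_t pid = 0;

	read_lock(&tasklist_lock);
	do_each_thread(g, t) {
		if (match(t)) {
			pid = t->pid;
			goto out;	/* 'break' would only exit the inner loop */
		}
	} while_each_thread(g, t);
out:
	read_unlock(&tasklist_lock);

	return pid;
}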
1633 1633
1634 /* de_thread depends on thread_group_leader not being a pid based check */ 1634 /* de_thread depends on thread_group_leader not being a pid based check */
1635 #define thread_group_leader(p) (p == p->group_leader) 1635 #define thread_group_leader(p) (p == p->group_leader)
1636 1636
1637 /* Due to the insanities of de_thread it is possible for a process 1637 /* Due to the insanities of de_thread it is possible for a process
1638 * to have the pid of the thread group leader without actually being 1638 * to have the pid of the thread group leader without actually being
1639 * the thread group leader. For iteration through the pids in proc 1639 * the thread group leader. For iteration through the pids in proc
1640 * all we care about is that we have a task with the appropriate 1640 * all we care about is that we have a task with the appropriate
1641 * pid; we don't actually care if we have the right task. 1641 * pid; we don't actually care if we have the right task.
1642 */ 1642 */
1643 static inline int has_group_leader_pid(struct task_struct *p) 1643 static inline int has_group_leader_pid(struct task_struct *p)
1644 { 1644 {
1645 return p->pid == p->tgid; 1645 return p->pid == p->tgid;
1646 } 1646 }
1647 1647
1648 static inline struct task_struct *next_thread(const struct task_struct *p) 1648 static inline struct task_struct *next_thread(const struct task_struct *p)
1649 { 1649 {
1650 return list_entry(rcu_dereference(p->thread_group.next), 1650 return list_entry(rcu_dereference(p->thread_group.next),
1651 struct task_struct, thread_group); 1651 struct task_struct, thread_group);
1652 } 1652 }
1653 1653
1654 static inline int thread_group_empty(struct task_struct *p) 1654 static inline int thread_group_empty(struct task_struct *p)
1655 { 1655 {
1656 return list_empty(&p->thread_group); 1656 return list_empty(&p->thread_group);
1657 } 1657 }
1658 1658
1659 #define delay_group_leader(p) \ 1659 #define delay_group_leader(p) \
1660 (thread_group_leader(p) && !thread_group_empty(p)) 1660 (thread_group_leader(p) && !thread_group_empty(p))
1661 1661
1662 /* 1662 /*
1663 * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring 1663 * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
1664 * subscriptions and synchronises with wait4(). Also used in procfs. Also 1664 * subscriptions and synchronises with wait4(). Also used in procfs. Also
1665 * pins the final release of task.io_context. Also protects ->cpuset. 1665 * pins the final release of task.io_context. Also protects ->cpuset.
1666 * 1666 *
1667 * Nests both inside and outside of read_lock(&tasklist_lock). 1667 * Nests both inside and outside of read_lock(&tasklist_lock).
1668 * It must not be nested with write_lock_irq(&tasklist_lock), 1668 * It must not be nested with write_lock_irq(&tasklist_lock),
1669 * neither inside nor outside. 1669 * neither inside nor outside.
1670 */ 1670 */
1671 static inline void task_lock(struct task_struct *p) 1671 static inline void task_lock(struct task_struct *p)
1672 { 1672 {
1673 spin_lock(&p->alloc_lock); 1673 spin_lock(&p->alloc_lock);
1674 } 1674 }
1675 1675
1676 static inline void task_unlock(struct task_struct *p) 1676 static inline void task_unlock(struct task_struct *p)
1677 { 1677 {
1678 spin_unlock(&p->alloc_lock); 1678 spin_unlock(&p->alloc_lock);
1679 } 1679 }
1680 1680
1681 extern struct sighand_struct *lock_task_sighand(struct task_struct *tsk, 1681 extern struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
1682 unsigned long *flags); 1682 unsigned long *flags);
1683 1683
1684 static inline void unlock_task_sighand(struct task_struct *tsk, 1684 static inline void unlock_task_sighand(struct task_struct *tsk,
1685 unsigned long *flags) 1685 unsigned long *flags)
1686 { 1686 {
1687 spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); 1687 spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
1688 } 1688 }
1689 1689
1690 #ifndef __HAVE_THREAD_FUNCTIONS 1690 #ifndef __HAVE_THREAD_FUNCTIONS
1691 1691
1692 #define task_thread_info(task) ((struct thread_info *)(task)->stack) 1692 #define task_thread_info(task) ((struct thread_info *)(task)->stack)
1693 #define task_stack_page(task) ((task)->stack) 1693 #define task_stack_page(task) ((task)->stack)
1694 1694
1695 static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) 1695 static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
1696 { 1696 {
1697 *task_thread_info(p) = *task_thread_info(org); 1697 *task_thread_info(p) = *task_thread_info(org);
1698 task_thread_info(p)->task = p; 1698 task_thread_info(p)->task = p;
1699 } 1699 }
1700 1700
1701 static inline unsigned long *end_of_stack(struct task_struct *p) 1701 static inline unsigned long *end_of_stack(struct task_struct *p)
1702 { 1702 {
1703 return (unsigned long *)(task_thread_info(p) + 1); 1703 return (unsigned long *)(task_thread_info(p) + 1);
1704 } 1704 }
1705 1705
1706 #endif 1706 #endif
1707 1707
1708 /* set thread flags in other task's structures 1708 /* set thread flags in other task's structures
1709 * - see asm/thread_info.h for TIF_xxxx flags available 1709 * - see asm/thread_info.h for TIF_xxxx flags available
1710 */ 1710 */
1711 static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) 1711 static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
1712 { 1712 {
1713 set_ti_thread_flag(task_thread_info(tsk), flag); 1713 set_ti_thread_flag(task_thread_info(tsk), flag);
1714 } 1714 }
1715 1715
1716 static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) 1716 static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
1717 { 1717 {
1718 clear_ti_thread_flag(task_thread_info(tsk), flag); 1718 clear_ti_thread_flag(task_thread_info(tsk), flag);
1719 } 1719 }
1720 1720
1721 static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) 1721 static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
1722 { 1722 {
1723 return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); 1723 return test_and_set_ti_thread_flag(task_thread_info(tsk), flag);
1724 } 1724 }
1725 1725
1726 static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) 1726 static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
1727 { 1727 {
1728 return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); 1728 return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag);
1729 } 1729 }
1730 1730
1731 static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) 1731 static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
1732 { 1732 {
1733 return test_ti_thread_flag(task_thread_info(tsk), flag); 1733 return test_ti_thread_flag(task_thread_info(tsk), flag);
1734 } 1734 }
1735 1735
1736 static inline void set_tsk_need_resched(struct task_struct *tsk) 1736 static inline void set_tsk_need_resched(struct task_struct *tsk)
1737 { 1737 {
1738 set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); 1738 set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
1739 } 1739 }
1740 1740
1741 static inline void clear_tsk_need_resched(struct task_struct *tsk) 1741 static inline void clear_tsk_need_resched(struct task_struct *tsk)
1742 { 1742 {
1743 clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); 1743 clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
1744 } 1744 }
1745 1745
1746 static inline int signal_pending(struct task_struct *p) 1746 static inline int signal_pending(struct task_struct *p)
1747 { 1747 {
1748 return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); 1748 return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
1749 } 1749 }
1750 1750
1751 static inline int need_resched(void) 1751 static inline int need_resched(void)
1752 { 1752 {
1753 return unlikely(test_thread_flag(TIF_NEED_RESCHED)); 1753 return unlikely(test_thread_flag(TIF_NEED_RESCHED));
1754 } 1754 }
1755 1755
1756 /* 1756 /*
1757 * cond_resched() and cond_resched_lock(): latency reduction via 1757 * cond_resched() and cond_resched_lock(): latency reduction via
1758 * explicit rescheduling in places that are safe. The return 1758 * explicit rescheduling in places that are safe. The return
1759 * value indicates whether a reschedule was done in fact. 1759 * value indicates whether a reschedule was done in fact.
1760 * cond_resched_lock() will drop the spinlock before scheduling, 1760 * cond_resched_lock() will drop the spinlock before scheduling,
1761 * cond_resched_softirq() will enable bhs before scheduling. 1761 * cond_resched_softirq() will enable bhs before scheduling.
1762 */ 1762 */
1763 extern int cond_resched(void); 1763 extern int cond_resched(void);
1764 extern int cond_resched_lock(spinlock_t * lock); 1764 extern int cond_resched_lock(spinlock_t * lock);
1765 extern int cond_resched_softirq(void); 1765 extern int cond_resched_softirq(void);
1766 1766
1767 /* 1767 /*
1768 * Does a critical section need to be broken due to another 1768 * Does a critical section need to be broken due to another
1769 * task waiting?: 1769 * task waiting?:
1770 */ 1770 */
1771 #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) 1771 #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
1772 # define need_lockbreak(lock) ((lock)->break_lock) 1772 # define need_lockbreak(lock) ((lock)->break_lock)
1773 #else 1773 #else
1774 # define need_lockbreak(lock) 0 1774 # define need_lockbreak(lock) 0
1775 #endif 1775 #endif
1776 1776
1777 /* 1777 /*
1778 * Does a critical section need to be broken due to another 1778 * Does a critical section need to be broken due to another
1779 * task waiting or preemption being signalled: 1779 * task waiting or preemption being signalled:
1780 */ 1780 */
1781 static inline int lock_need_resched(spinlock_t *lock) 1781 static inline int lock_need_resched(spinlock_t *lock)
1782 { 1782 {
1783 if (need_lockbreak(lock) || need_resched()) 1783 if (need_lockbreak(lock) || need_resched())
1784 return 1; 1784 return 1;
1785 return 0; 1785 return 0;
1786 } 1786 }
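A sketch of how these primitives can be combined to keep a long critical section preemption-friendly (illustrative only; process_some() and its resume convention are hypothetical):

/*
 * Do a bounded amount of work under 'lock', backing off early when another
 * task is spinning on the lock or a reschedule is pending.  Returns the
 * index to resume from; the caller can cond_resched() and call again.
 */
static int process_some(spinlock_t *lock, int *table, int start, int nr)
{
	int i;

	spin_lock(lock);
	for (i = start; i < nr; i++) {
		table[i]++;			/* stand-in for real work */
		if (lock_need_resched(lock)) {
			i++;			/* table[i] is already done */
			break;
		}
	}
	spin_unlock(lock);

	return i;
}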
1787 1787
1788 /* 1788 /*
1789 * Reevaluate whether the task has signals pending delivery. 1789 * Reevaluate whether the task has signals pending delivery.
1790 * Wake the task if so. 1790 * Wake the task if so.
1791 * This is required every time the blocked sigset_t changes. 1791 * This is required every time the blocked sigset_t changes.
1792 * callers must hold sighand->siglock. 1792 * callers must hold sighand->siglock.
1793 */ 1793 */
1794 extern void recalc_sigpending_and_wake(struct task_struct *t); 1794 extern void recalc_sigpending_and_wake(struct task_struct *t);
1795 extern void recalc_sigpending(void); 1795 extern void recalc_sigpending(void);
1796 1796
1797 extern void signal_wake_up(struct task_struct *t, int resume_stopped); 1797 extern void signal_wake_up(struct task_struct *t, int resume_stopped);
1798 1798
1799 /* 1799 /*
1800 * Wrappers for p->thread_info->cpu access. No-op on UP. 1800 * Wrappers for p->thread_info->cpu access. No-op on UP.
1801 */ 1801 */
1802 #ifdef CONFIG_SMP 1802 #ifdef CONFIG_SMP
1803 1803
1804 static inline unsigned int task_cpu(const struct task_struct *p) 1804 static inline unsigned int task_cpu(const struct task_struct *p)
1805 { 1805 {
1806 return task_thread_info(p)->cpu; 1806 return task_thread_info(p)->cpu;
1807 } 1807 }
1808 1808
1809 extern void set_task_cpu(struct task_struct *p, unsigned int cpu); 1809 extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
1810 1810
1811 #else 1811 #else
1812 1812
1813 static inline unsigned int task_cpu(const struct task_struct *p) 1813 static inline unsigned int task_cpu(const struct task_struct *p)
1814 { 1814 {
1815 return 0; 1815 return 0;
1816 } 1816 }
1817 1817
1818 static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) 1818 static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
1819 { 1819 {
1820 } 1820 }
1821 1821
1822 #endif /* CONFIG_SMP */ 1822 #endif /* CONFIG_SMP */
1823 1823
1824 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT 1824 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
1825 extern void arch_pick_mmap_layout(struct mm_struct *mm); 1825 extern void arch_pick_mmap_layout(struct mm_struct *mm);
1826 #else 1826 #else
1827 static inline void arch_pick_mmap_layout(struct mm_struct *mm) 1827 static inline void arch_pick_mmap_layout(struct mm_struct *mm)
1828 { 1828 {
1829 mm->mmap_base = TASK_UNMAPPED_BASE; 1829 mm->mmap_base = TASK_UNMAPPED_BASE;
1830 mm->get_unmapped_area = arch_get_unmapped_area; 1830 mm->get_unmapped_area = arch_get_unmapped_area;
1831 mm->unmap_area = arch_unmap_area; 1831 mm->unmap_area = arch_unmap_area;
1832 } 1832 }
1833 #endif 1833 #endif
1834 1834
1835 extern long sched_setaffinity(pid_t pid, cpumask_t new_mask); 1835 extern long sched_setaffinity(pid_t pid, cpumask_t new_mask);
1836 extern long sched_getaffinity(pid_t pid, cpumask_t *mask); 1836 extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
1837 1837
1838 extern int sched_mc_power_savings, sched_smt_power_savings; 1838 extern int sched_mc_power_savings, sched_smt_power_savings;
1839 1839
1840 extern void normalize_rt_tasks(void); 1840 extern void normalize_rt_tasks(void);
1841 1841
1842 #ifdef CONFIG_TASK_XACCT 1842 #ifdef CONFIG_TASK_XACCT
1843 static inline void add_rchar(struct task_struct *tsk, ssize_t amt) 1843 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
1844 { 1844 {
1845 tsk->rchar += amt; 1845 tsk->rchar += amt;
1846 } 1846 }
1847 1847
1848 static inline void add_wchar(struct task_struct *tsk, ssize_t amt) 1848 static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
1849 { 1849 {
1850 tsk->wchar += amt; 1850 tsk->wchar += amt;
1851 } 1851 }
1852 1852
1853 static inline void inc_syscr(struct task_struct *tsk) 1853 static inline void inc_syscr(struct task_struct *tsk)
1854 { 1854 {
1855 tsk->syscr++; 1855 tsk->syscr++;
1856 } 1856 }
1857 1857
1858 static inline void inc_syscw(struct task_struct *tsk) 1858 static inline void inc_syscw(struct task_struct *tsk)
1859 { 1859 {
1860 tsk->syscw++; 1860 tsk->syscw++;
1861 } 1861 }
1862 #else 1862 #else
1863 static inline void add_rchar(struct task_struct *tsk, ssize_t amt) 1863 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
1864 { 1864 {
1865 } 1865 }
1866 1866
1867 static inline void add_wchar(struct task_struct *tsk, ssize_t amt) 1867 static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
1868 { 1868 {
1869 } 1869 }
1870 1870
1871 static inline void inc_syscr(struct task_struct *tsk) 1871 static inline void inc_syscr(struct task_struct *tsk)
1872 { 1872 {
1873 } 1873 }
1874 1874
1875 static inline void inc_syscw(struct task_struct *tsk) 1875 static inline void inc_syscw(struct task_struct *tsk)
1876 { 1876 {
1877 } 1877 }
1878 #endif 1878 #endif
1879 1879
1880 #endif /* __KERNEL__ */ 1880 #endif /* __KERNEL__ */
1881 1881
1882 #endif 1882 #endif
1883 1883
1 /* 1 /*
2 * kernel/sched.c 2 * kernel/sched.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
6 * Copyright (C) 1991-2002 Linus Torvalds 6 * Copyright (C) 1991-2002 Linus Torvalds
7 * 7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and 8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe 9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff 10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli 11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: 12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with 13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices 14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions 15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 * 2007-04-15 Work begun on replacing all interactivity tuning with a 19 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas. 20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements 21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams 22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith 23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
25 */ 25 */
26 26
27 #include <linux/mm.h> 27 #include <linux/mm.h>
28 #include <linux/module.h> 28 #include <linux/module.h>
29 #include <linux/nmi.h> 29 #include <linux/nmi.h>
30 #include <linux/init.h> 30 #include <linux/init.h>
31 #include <linux/uaccess.h> 31 #include <linux/uaccess.h>
32 #include <linux/highmem.h> 32 #include <linux/highmem.h>
33 #include <linux/smp_lock.h> 33 #include <linux/smp_lock.h>
34 #include <asm/mmu_context.h> 34 #include <asm/mmu_context.h>
35 #include <linux/interrupt.h> 35 #include <linux/interrupt.h>
36 #include <linux/capability.h> 36 #include <linux/capability.h>
37 #include <linux/completion.h> 37 #include <linux/completion.h>
38 #include <linux/kernel_stat.h> 38 #include <linux/kernel_stat.h>
39 #include <linux/debug_locks.h> 39 #include <linux/debug_locks.h>
40 #include <linux/security.h> 40 #include <linux/security.h>
41 #include <linux/notifier.h> 41 #include <linux/notifier.h>
42 #include <linux/profile.h> 42 #include <linux/profile.h>
43 #include <linux/freezer.h> 43 #include <linux/freezer.h>
44 #include <linux/vmalloc.h> 44 #include <linux/vmalloc.h>
45 #include <linux/blkdev.h> 45 #include <linux/blkdev.h>
46 #include <linux/delay.h> 46 #include <linux/delay.h>
47 #include <linux/smp.h> 47 #include <linux/smp.h>
48 #include <linux/threads.h> 48 #include <linux/threads.h>
49 #include <linux/timer.h> 49 #include <linux/timer.h>
50 #include <linux/rcupdate.h> 50 #include <linux/rcupdate.h>
51 #include <linux/cpu.h> 51 #include <linux/cpu.h>
52 #include <linux/cpuset.h> 52 #include <linux/cpuset.h>
53 #include <linux/percpu.h> 53 #include <linux/percpu.h>
54 #include <linux/kthread.h> 54 #include <linux/kthread.h>
55 #include <linux/seq_file.h> 55 #include <linux/seq_file.h>
56 #include <linux/sysctl.h> 56 #include <linux/sysctl.h>
57 #include <linux/syscalls.h> 57 #include <linux/syscalls.h>
58 #include <linux/times.h> 58 #include <linux/times.h>
59 #include <linux/tsacct_kern.h> 59 #include <linux/tsacct_kern.h>
60 #include <linux/kprobes.h> 60 #include <linux/kprobes.h>
61 #include <linux/delayacct.h> 61 #include <linux/delayacct.h>
62 #include <linux/reciprocal_div.h> 62 #include <linux/reciprocal_div.h>
63 #include <linux/unistd.h> 63 #include <linux/unistd.h>
64 64
65 #include <asm/tlb.h> 65 #include <asm/tlb.h>
66 66
67 /* 67 /*
68 * Scheduler clock - returns current time in nanosec units. 68 * Scheduler clock - returns current time in nanosec units.
69 * This is the default implementation. 69 * This is the default implementation.
70 * Architectures and sub-architectures can override this. 70 * Architectures and sub-architectures can override this.
71 */ 71 */
72 unsigned long long __attribute__((weak)) sched_clock(void) 72 unsigned long long __attribute__((weak)) sched_clock(void)
73 { 73 {
74 return (unsigned long long)jiffies * (1000000000 / HZ); 74 return (unsigned long long)jiffies * (1000000000 / HZ);
75 } 75 }
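The resolution of this fallback is one jiffy; as a quick worked example (assuming common HZ values):

	1000000000 / HZ  =  1,000,000 ns per tick   (HZ == 1000)
	                 =  4,000,000 ns per tick   (HZ ==  250)

which is why architectures with a fine-grained cycle counter are expected to override it.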
76 76
77 /* 77 /*
78 * Convert user-nice values [ -20 ... 0 ... 19 ] 78 * Convert user-nice values [ -20 ... 0 ... 19 ]
79 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 79 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
80 * and back. 80 * and back.
81 */ 81 */
82 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) 82 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
83 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) 83 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
84 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) 84 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
85 85
86 /* 86 /*
87 * 'User priority' is the nice value converted to something we 87 * 'User priority' is the nice value converted to something we
88 * can work with better when scaling various scheduler parameters, 88 * can work with better when scaling various scheduler parameters,
89 * it's a [ 0 ... 39 ] range. 89 * it's a [ 0 ... 39 ] range.
90 */ 90 */
91 #define USER_PRIO(p) ((p)-MAX_RT_PRIO) 91 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
92 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) 92 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
93 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) 93 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
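For concreteness, assuming the conventional values MAX_RT_PRIO == 100 and MAX_PRIO == 140 (an assumption of this illustration, not something set by this patch), the macros above give:

	NICE_TO_PRIO(-20) == 100	/* best non-RT static priority */
	NICE_TO_PRIO(0)   == 120	/* the default */
	NICE_TO_PRIO(19)  == 139	/* worst static priority */
	USER_PRIO(120)    == 20		/* nice 0 in the [0..39] range */
	MAX_USER_PRIO     == 40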
94 94
95 /* 95 /*
96 * Some helpers for converting nanosecond timing to jiffy resolution 96 * Some helpers for converting nanosecond timing to jiffy resolution
97 */ 97 */
98 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) 98 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
99 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) 99 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
100 100
101 #define NICE_0_LOAD SCHED_LOAD_SCALE 101 #define NICE_0_LOAD SCHED_LOAD_SCALE
102 #define NICE_0_SHIFT SCHED_LOAD_SHIFT 102 #define NICE_0_SHIFT SCHED_LOAD_SHIFT
103 103
104 /* 104 /*
105 * These are the 'tuning knobs' of the scheduler: 105 * These are the 'tuning knobs' of the scheduler:
106 * 106 *
107 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), 107 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
108 * default timeslice is 100 msecs, maximum timeslice is 800 msecs. 108 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
109 * Timeslices get refilled after they expire. 109 * Timeslices get refilled after they expire.
110 */ 110 */
111 #define MIN_TIMESLICE max(5 * HZ / 1000, 1) 111 #define MIN_TIMESLICE max(5 * HZ / 1000, 1)
112 #define DEF_TIMESLICE (100 * HZ / 1000) 112 #define DEF_TIMESLICE (100 * HZ / 1000)
113 113
114 #ifdef CONFIG_SMP 114 #ifdef CONFIG_SMP
115 /* 115 /*
116 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) 116 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
117 * Since cpu_power is a 'constant', we can use a reciprocal divide. 117 * Since cpu_power is a 'constant', we can use a reciprocal divide.
118 */ 118 */
119 static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) 119 static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
120 { 120 {
121 return reciprocal_divide(load, sg->reciprocal_cpu_power); 121 return reciprocal_divide(load, sg->reciprocal_cpu_power);
122 } 122 }
123 123
124 /* 124 /*
125 * Each time a sched group cpu_power is changed, 125 * Each time a sched group cpu_power is changed,
126 * we must compute its reciprocal value 126 * we must compute its reciprocal value
127 */ 127 */
128 static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) 128 static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
129 { 129 {
130 sg->__cpu_power += val; 130 sg->__cpu_power += val;
131 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); 131 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
132 } 132 }
133 #endif 133 #endif
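The reciprocal trick turns the per-call division by sg->__cpu_power into one 64-bit multiply plus a shift. A stand-alone sketch of the same idea (illustrative only; the real helpers are reciprocal_value()/reciprocal_divide() from <linux/reciprocal_div.h>, whose rounding may differ by one for some inputs):

#include <stdint.h>

/* Precompute roughly 2^32 / divisor once, whenever the divisor changes. */
static inline uint32_t recip_value(uint32_t divisor)	/* assumes divisor > 1 */
{
	return (uint32_t)(0xffffffffULL / divisor + 1);
}

/* Then a / divisor becomes (a * recip) >> 32, with no division at all. */
static inline uint32_t recip_divide(uint32_t a, uint32_t recip)
{
	return (uint32_t)(((uint64_t)a * recip) >> 32);
}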
134 134
135 #define SCALE_PRIO(x, prio) \ 135 #define SCALE_PRIO(x, prio) \
136 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) 136 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
137 137
138 /* 138 /*
139 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] 139 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
140 * to time slice values: [800ms ... 100ms ... 5ms] 140 * to time slice values: [800ms ... 100ms ... 5ms]
141 */ 141 */
142 static unsigned int static_prio_timeslice(int static_prio) 142 static unsigned int static_prio_timeslice(int static_prio)
143 { 143 {
144 if (static_prio == NICE_TO_PRIO(19)) 144 if (static_prio == NICE_TO_PRIO(19))
145 return 1; 145 return 1;
146 146
147 if (static_prio < NICE_TO_PRIO(0)) 147 if (static_prio < NICE_TO_PRIO(0))
148 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); 148 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
149 else 149 else
150 return SCALE_PRIO(DEF_TIMESLICE, static_prio); 150 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
151 } 151 }
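Worked examples (illustrative arithmetic, assuming MAX_PRIO == 140 and MAX_USER_PRIO == 40, with DEF_TIMESLICE corresponding to 100 ms):

	nice -20: SCALE_PRIO(400 ms, 100) = 400 * 40 / 20 = 800 ms
	nice   0: SCALE_PRIO(100 ms, 120) = 100 * 20 / 20 = 100 ms
	nice  10: SCALE_PRIO(100 ms, 130) = 100 * 10 / 20 =  50 ms
	nice  19: 1 jiffy (the explicit special case above)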
152 152
153 static inline int rt_policy(int policy) 153 static inline int rt_policy(int policy)
154 { 154 {
155 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) 155 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
156 return 1; 156 return 1;
157 return 0; 157 return 0;
158 } 158 }
159 159
160 static inline int task_has_rt_policy(struct task_struct *p) 160 static inline int task_has_rt_policy(struct task_struct *p)
161 { 161 {
162 return rt_policy(p->policy); 162 return rt_policy(p->policy);
163 } 163 }
164 164
165 /* 165 /*
166 * This is the priority-queue data structure of the RT scheduling class: 166 * This is the priority-queue data structure of the RT scheduling class:
167 */ 167 */
168 struct rt_prio_array { 168 struct rt_prio_array {
169 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ 169 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
170 struct list_head queue[MAX_RT_PRIO]; 170 struct list_head queue[MAX_RT_PRIO];
171 }; 171 };
172 172
173 struct load_stat { 173 struct load_stat {
174 struct load_weight load; 174 struct load_weight load;
175 u64 load_update_start, load_update_last; 175 u64 load_update_start, load_update_last;
176 unsigned long delta_fair, delta_exec, delta_stat; 176 unsigned long delta_fair, delta_exec, delta_stat;
177 }; 177 };
178 178
179 /* CFS-related fields in a runqueue */ 179 /* CFS-related fields in a runqueue */
180 struct cfs_rq { 180 struct cfs_rq {
181 struct load_weight load; 181 struct load_weight load;
182 unsigned long nr_running; 182 unsigned long nr_running;
183 183
184 s64 fair_clock; 184 s64 fair_clock;
185 u64 exec_clock; 185 u64 exec_clock;
186 s64 wait_runtime; 186 s64 wait_runtime;
187 u64 sleeper_bonus; 187 u64 sleeper_bonus;
188 unsigned long wait_runtime_overruns, wait_runtime_underruns; 188 unsigned long wait_runtime_overruns, wait_runtime_underruns;
189 189
190 struct rb_root tasks_timeline; 190 struct rb_root tasks_timeline;
191 struct rb_node *rb_leftmost; 191 struct rb_node *rb_leftmost;
192 struct rb_node *rb_load_balance_curr; 192 struct rb_node *rb_load_balance_curr;
193 #ifdef CONFIG_FAIR_GROUP_SCHED 193 #ifdef CONFIG_FAIR_GROUP_SCHED
194 /* 'curr' points to currently running entity on this cfs_rq. 194 /* 'curr' points to currently running entity on this cfs_rq.
195 * It is set to NULL otherwise (i.e. when none are currently running). 195 * It is set to NULL otherwise (i.e. when none are currently running).
196 */ 196 */
197 struct sched_entity *curr; 197 struct sched_entity *curr;
198 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 198 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
199 199
200 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 200 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
201 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 201 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
202 * (like users, containers etc.) 202 * (like users, containers etc.)
203 * 203 *
204 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 204 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
205 * list is used during load balance. 205 * list is used during load balance.
206 */ 206 */
207 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ 207 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
208 #endif 208 #endif
209 }; 209 };
210 210
211 /* Real-Time classes' related field in a runqueue: */ 211 /* Real-Time classes' related field in a runqueue: */
212 struct rt_rq { 212 struct rt_rq {
213 struct rt_prio_array active; 213 struct rt_prio_array active;
214 int rt_load_balance_idx; 214 int rt_load_balance_idx;
215 struct list_head *rt_load_balance_head, *rt_load_balance_curr; 215 struct list_head *rt_load_balance_head, *rt_load_balance_curr;
216 }; 216 };
217 217
218 /* 218 /*
219 * This is the main, per-CPU runqueue data structure. 219 * This is the main, per-CPU runqueue data structure.
220 * 220 *
221 * Locking rule: in places that want to lock multiple runqueues 221 * Locking rule: in places that want to lock multiple runqueues
222 * (such as the load balancing or the thread migration code), lock 222 * (such as the load balancing or the thread migration code), lock
223 * acquire operations must be ordered by ascending &runqueue. 223 * acquire operations must be ordered by ascending &runqueue.
224 */ 224 */
225 struct rq { 225 struct rq {
226 spinlock_t lock; /* runqueue lock */ 226 spinlock_t lock; /* runqueue lock */
227 227
228 /* 228 /*
229 * nr_running and cpu_load should be in the same cacheline because 229 * nr_running and cpu_load should be in the same cacheline because
230 * remote CPUs use both these fields when doing load calculation. 230 * remote CPUs use both these fields when doing load calculation.
231 */ 231 */
232 unsigned long nr_running; 232 unsigned long nr_running;
233 #define CPU_LOAD_IDX_MAX 5 233 #define CPU_LOAD_IDX_MAX 5
234 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 234 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
235 unsigned char idle_at_tick; 235 unsigned char idle_at_tick;
236 #ifdef CONFIG_NO_HZ 236 #ifdef CONFIG_NO_HZ
237 unsigned char in_nohz_recently; 237 unsigned char in_nohz_recently;
238 #endif 238 #endif
239 struct load_stat ls; /* capture load from *all* tasks on this cpu */ 239 struct load_stat ls; /* capture load from *all* tasks on this cpu */
240 unsigned long nr_load_updates; 240 unsigned long nr_load_updates;
241 u64 nr_switches; 241 u64 nr_switches;
242 242
243 struct cfs_rq cfs; 243 struct cfs_rq cfs;
244 #ifdef CONFIG_FAIR_GROUP_SCHED 244 #ifdef CONFIG_FAIR_GROUP_SCHED
245 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */ 245 struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
246 #endif 246 #endif
247 struct rt_rq rt; 247 struct rt_rq rt;
248 248
249 /* 249 /*
250 * This is part of a global counter where only the total sum 250 * This is part of a global counter where only the total sum
251 * over all CPUs matters. A task can increase this counter on 251 * over all CPUs matters. A task can increase this counter on
252 * one CPU and if it got migrated afterwards it may decrease 252 * one CPU and if it got migrated afterwards it may decrease
253 * it on another CPU. Always updated under the runqueue lock: 253 * it on another CPU. Always updated under the runqueue lock:
254 */ 254 */
255 unsigned long nr_uninterruptible; 255 unsigned long nr_uninterruptible;
256 256
257 struct task_struct *curr, *idle; 257 struct task_struct *curr, *idle;
258 unsigned long next_balance; 258 unsigned long next_balance;
259 struct mm_struct *prev_mm; 259 struct mm_struct *prev_mm;
260 260
261 u64 clock, prev_clock_raw; 261 u64 clock, prev_clock_raw;
262 s64 clock_max_delta; 262 s64 clock_max_delta;
263 263
264 unsigned int clock_warps, clock_overflows; 264 unsigned int clock_warps, clock_overflows;
265 unsigned int clock_unstable_events; 265 unsigned int clock_unstable_events;
266 266
267 atomic_t nr_iowait; 267 atomic_t nr_iowait;
268 268
269 #ifdef CONFIG_SMP 269 #ifdef CONFIG_SMP
270 struct sched_domain *sd; 270 struct sched_domain *sd;
271 271
272 /* For active balancing */ 272 /* For active balancing */
273 int active_balance; 273 int active_balance;
274 int push_cpu; 274 int push_cpu;
275 int cpu; /* cpu of this runqueue */ 275 int cpu; /* cpu of this runqueue */
276 276
277 struct task_struct *migration_thread; 277 struct task_struct *migration_thread;
278 struct list_head migration_queue; 278 struct list_head migration_queue;
279 #endif 279 #endif
280 280
281 #ifdef CONFIG_SCHEDSTATS 281 #ifdef CONFIG_SCHEDSTATS
282 /* latency stats */ 282 /* latency stats */
283 struct sched_info rq_sched_info; 283 struct sched_info rq_sched_info;
284 284
285 /* sys_sched_yield() stats */ 285 /* sys_sched_yield() stats */
286 unsigned long yld_exp_empty; 286 unsigned long yld_exp_empty;
287 unsigned long yld_act_empty; 287 unsigned long yld_act_empty;
288 unsigned long yld_both_empty; 288 unsigned long yld_both_empty;
289 unsigned long yld_cnt; 289 unsigned long yld_cnt;
290 290
291 /* schedule() stats */ 291 /* schedule() stats */
292 unsigned long sched_switch; 292 unsigned long sched_switch;
293 unsigned long sched_cnt; 293 unsigned long sched_cnt;
294 unsigned long sched_goidle; 294 unsigned long sched_goidle;
295 295
296 /* try_to_wake_up() stats */ 296 /* try_to_wake_up() stats */
297 unsigned long ttwu_cnt; 297 unsigned long ttwu_cnt;
298 unsigned long ttwu_local; 298 unsigned long ttwu_local;
299 #endif 299 #endif
300 struct lock_class_key rq_lock_key; 300 struct lock_class_key rq_lock_key;
301 }; 301 };
302 302
303 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 303 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
304 static DEFINE_MUTEX(sched_hotcpu_mutex); 304 static DEFINE_MUTEX(sched_hotcpu_mutex);
305 305
306 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) 306 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
307 { 307 {
308 rq->curr->sched_class->check_preempt_curr(rq, p); 308 rq->curr->sched_class->check_preempt_curr(rq, p);
309 } 309 }
310 310
311 static inline int cpu_of(struct rq *rq) 311 static inline int cpu_of(struct rq *rq)
312 { 312 {
313 #ifdef CONFIG_SMP 313 #ifdef CONFIG_SMP
314 return rq->cpu; 314 return rq->cpu;
315 #else 315 #else
316 return 0; 316 return 0;
317 #endif 317 #endif
318 } 318 }
319 319
320 /* 320 /*
321 * Per-runqueue clock, as finegrained as the platform can give us: 321 * Per-runqueue clock, as finegrained as the platform can give us:
322 */ 322 */
323 static unsigned long long __rq_clock(struct rq *rq) 323 static unsigned long long __rq_clock(struct rq *rq)
324 { 324 {
325 u64 prev_raw = rq->prev_clock_raw; 325 u64 prev_raw = rq->prev_clock_raw;
326 u64 now = sched_clock(); 326 u64 now = sched_clock();
327 s64 delta = now - prev_raw; 327 s64 delta = now - prev_raw;
328 u64 clock = rq->clock; 328 u64 clock = rq->clock;
329 329
330 /* 330 /*
331 * Protect against sched_clock() occasionally going backwards: 331 * Protect against sched_clock() occasionally going backwards:
332 */ 332 */
333 if (unlikely(delta < 0)) { 333 if (unlikely(delta < 0)) {
334 clock++; 334 clock++;
335 rq->clock_warps++; 335 rq->clock_warps++;
336 } else { 336 } else {
337 /* 337 /*
338 * Catch too large forward jumps too: 338 * Catch too large forward jumps too:
339 */ 339 */
340 if (unlikely(delta > 2*TICK_NSEC)) { 340 if (unlikely(delta > 2*TICK_NSEC)) {
341 clock++; 341 clock++;
342 rq->clock_overflows++; 342 rq->clock_overflows++;
343 } else { 343 } else {
344 if (unlikely(delta > rq->clock_max_delta)) 344 if (unlikely(delta > rq->clock_max_delta))
345 rq->clock_max_delta = delta; 345 rq->clock_max_delta = delta;
346 clock += delta; 346 clock += delta;
347 } 347 }
348 } 348 }
349 349
350 rq->prev_clock_raw = now; 350 rq->prev_clock_raw = now;
351 rq->clock = clock; 351 rq->clock = clock;
352 352
353 return clock; 353 return clock;
354 } 354 }
355 355
356 static inline unsigned long long rq_clock(struct rq *rq) 356 static inline unsigned long long rq_clock(struct rq *rq)
357 { 357 {
358 int this_cpu = smp_processor_id(); 358 int this_cpu = smp_processor_id();
359 359
360 if (this_cpu == cpu_of(rq)) 360 if (this_cpu == cpu_of(rq))
361 return __rq_clock(rq); 361 return __rq_clock(rq);
362 362
363 return rq->clock; 363 return rq->clock;
364 } 364 }
365 365
366 /* 366 /*
367 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 367 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
368 * See detach_destroy_domains: synchronize_sched for details. 368 * See detach_destroy_domains: synchronize_sched for details.
369 * 369 *
370 * The domain tree of any CPU may only be accessed from within 370 * The domain tree of any CPU may only be accessed from within
371 * preempt-disabled sections. 371 * preempt-disabled sections.
372 */ 372 */
373 #define for_each_domain(cpu, __sd) \ 373 #define for_each_domain(cpu, __sd) \
374 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 374 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
375 375
376 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 376 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
377 #define this_rq() (&__get_cpu_var(runqueues)) 377 #define this_rq() (&__get_cpu_var(runqueues))
378 #define task_rq(p) cpu_rq(task_cpu(p)) 378 #define task_rq(p) cpu_rq(task_cpu(p))
379 #define cpu_curr(cpu) (cpu_rq(cpu)->curr) 379 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
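A sketch of walking a CPU's domain hierarchy with for_each_domain(); per the comment above, the walk must run inside a preempt-disabled section (illustrative only; the function name is hypothetical):

static int nr_sched_domains(int cpu)
{
	struct sched_domain *sd;
	int nr = 0;

	preempt_disable();
	for_each_domain(cpu, sd)
		nr++;
	preempt_enable();

	return nr;
}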
380 380
381 /* 381 /*
382 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 382 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
383 * clock constructed from sched_clock(): 383 * clock constructed from sched_clock():
384 */ 384 */
385 unsigned long long cpu_clock(int cpu) 385 unsigned long long cpu_clock(int cpu)
386 { 386 {
387 unsigned long long now; 387 unsigned long long now;
388 unsigned long flags; 388 unsigned long flags;
389 389
390 local_irq_save(flags); 390 local_irq_save(flags);
391 now = rq_clock(cpu_rq(cpu)); 391 now = rq_clock(cpu_rq(cpu));
392 local_irq_restore(flags); 392 local_irq_restore(flags);
393 393
394 return now; 394 return now;
395 } 395 }
396 396
397 #ifdef CONFIG_FAIR_GROUP_SCHED 397 #ifdef CONFIG_FAIR_GROUP_SCHED
398 /* Change a task's ->cfs_rq if it moves across CPUs */ 398 /* Change a task's ->cfs_rq if it moves across CPUs */
399 static inline void set_task_cfs_rq(struct task_struct *p) 399 static inline void set_task_cfs_rq(struct task_struct *p)
400 { 400 {
401 p->se.cfs_rq = &task_rq(p)->cfs; 401 p->se.cfs_rq = &task_rq(p)->cfs;
402 } 402 }
403 #else 403 #else
404 static inline void set_task_cfs_rq(struct task_struct *p) 404 static inline void set_task_cfs_rq(struct task_struct *p)
405 { 405 {
406 } 406 }
407 #endif 407 #endif
408 408
409 #ifndef prepare_arch_switch 409 #ifndef prepare_arch_switch
410 # define prepare_arch_switch(next) do { } while (0) 410 # define prepare_arch_switch(next) do { } while (0)
411 #endif 411 #endif
412 #ifndef finish_arch_switch 412 #ifndef finish_arch_switch
413 # define finish_arch_switch(prev) do { } while (0) 413 # define finish_arch_switch(prev) do { } while (0)
414 #endif 414 #endif
415 415
416 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 416 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
417 static inline int task_running(struct rq *rq, struct task_struct *p) 417 static inline int task_running(struct rq *rq, struct task_struct *p)
418 { 418 {
419 return rq->curr == p; 419 return rq->curr == p;
420 } 420 }
421 421
422 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 422 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
423 { 423 {
424 } 424 }
425 425
426 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 426 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
427 { 427 {
428 #ifdef CONFIG_DEBUG_SPINLOCK 428 #ifdef CONFIG_DEBUG_SPINLOCK
429 /* this is a valid case when another task releases the spinlock */ 429 /* this is a valid case when another task releases the spinlock */
430 rq->lock.owner = current; 430 rq->lock.owner = current;
431 #endif 431 #endif
432 /* 432 /*
433 * If we are tracking spinlock dependencies then we have to 433 * If we are tracking spinlock dependencies then we have to
434 * fix up the runqueue lock - which gets 'carried over' from 434 * fix up the runqueue lock - which gets 'carried over' from
435 * prev into current: 435 * prev into current:
436 */ 436 */
437 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 437 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
438 438
439 spin_unlock_irq(&rq->lock); 439 spin_unlock_irq(&rq->lock);
440 } 440 }
441 441
442 #else /* __ARCH_WANT_UNLOCKED_CTXSW */ 442 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
443 static inline int task_running(struct rq *rq, struct task_struct *p) 443 static inline int task_running(struct rq *rq, struct task_struct *p)
444 { 444 {
445 #ifdef CONFIG_SMP 445 #ifdef CONFIG_SMP
446 return p->oncpu; 446 return p->oncpu;
447 #else 447 #else
448 return rq->curr == p; 448 return rq->curr == p;
449 #endif 449 #endif
450 } 450 }
451 451
452 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 452 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
453 { 453 {
454 #ifdef CONFIG_SMP 454 #ifdef CONFIG_SMP
455 /* 455 /*
456 * We can optimise this out completely for !SMP, because the 456 * We can optimise this out completely for !SMP, because the
457 * SMP rebalancing from interrupt is the only thing that cares 457 * SMP rebalancing from interrupt is the only thing that cares
458 * here. 458 * here.
459 */ 459 */
460 next->oncpu = 1; 460 next->oncpu = 1;
461 #endif 461 #endif
462 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 462 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
463 spin_unlock_irq(&rq->lock); 463 spin_unlock_irq(&rq->lock);
464 #else 464 #else
465 spin_unlock(&rq->lock); 465 spin_unlock(&rq->lock);
466 #endif 466 #endif
467 } 467 }
468 468
469 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 469 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
470 { 470 {
471 #ifdef CONFIG_SMP 471 #ifdef CONFIG_SMP
472 /* 472 /*
473 * After ->oncpu is cleared, the task can be moved to a different CPU. 473 * After ->oncpu is cleared, the task can be moved to a different CPU.
474 * We must ensure this doesn't happen until the switch is completely 474 * We must ensure this doesn't happen until the switch is completely
475 * finished. 475 * finished.
476 */ 476 */
477 smp_wmb(); 477 smp_wmb();
478 prev->oncpu = 0; 478 prev->oncpu = 0;
479 #endif 479 #endif
480 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 480 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
481 local_irq_enable(); 481 local_irq_enable();
482 #endif 482 #endif
483 } 483 }
484 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 484 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
485 485
486 /* 486 /*
487 * __task_rq_lock - lock the runqueue a given task resides on. 487 * __task_rq_lock - lock the runqueue a given task resides on.
488 * Must be called with interrupts disabled. 488 * Must be called with interrupts disabled.
489 */ 489 */
490 static inline struct rq *__task_rq_lock(struct task_struct *p) 490 static inline struct rq *__task_rq_lock(struct task_struct *p)
491 __acquires(rq->lock) 491 __acquires(rq->lock)
492 { 492 {
493 struct rq *rq; 493 struct rq *rq;
494 494
495 repeat_lock_task: 495 repeat_lock_task:
496 rq = task_rq(p); 496 rq = task_rq(p);
497 spin_lock(&rq->lock); 497 spin_lock(&rq->lock);
498 if (unlikely(rq != task_rq(p))) { 498 if (unlikely(rq != task_rq(p))) {
499 spin_unlock(&rq->lock); 499 spin_unlock(&rq->lock);
500 goto repeat_lock_task; 500 goto repeat_lock_task;
501 } 501 }
502 return rq; 502 return rq;
503 } 503 }
504 504
505 /* 505 /*
506 * task_rq_lock - lock the runqueue a given task resides on and disable 506 * task_rq_lock - lock the runqueue a given task resides on and disable
507 * interrupts. Note the ordering: we can safely look up the task_rq without 507 * interrupts. Note the ordering: we can safely look up the task_rq without
508 * explicitly disabling preemption. 508 * explicitly disabling preemption.
509 */ 509 */
510 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 510 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
511 __acquires(rq->lock) 511 __acquires(rq->lock)
512 { 512 {
513 struct rq *rq; 513 struct rq *rq;
514 514
515 repeat_lock_task: 515 repeat_lock_task:
516 local_irq_save(*flags); 516 local_irq_save(*flags);
517 rq = task_rq(p); 517 rq = task_rq(p);
518 spin_lock(&rq->lock); 518 spin_lock(&rq->lock);
519 if (unlikely(rq != task_rq(p))) { 519 if (unlikely(rq != task_rq(p))) {
520 spin_unlock_irqrestore(&rq->lock, *flags); 520 spin_unlock_irqrestore(&rq->lock, *flags);
521 goto repeat_lock_task; 521 goto repeat_lock_task;
522 } 522 }
523 return rq; 523 return rq;
524 } 524 }
525 525
526 static inline void __task_rq_unlock(struct rq *rq) 526 static inline void __task_rq_unlock(struct rq *rq)
527 __releases(rq->lock) 527 __releases(rq->lock)
528 { 528 {
529 spin_unlock(&rq->lock); 529 spin_unlock(&rq->lock);
530 } 530 }
531 531
532 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 532 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
533 __releases(rq->lock) 533 __releases(rq->lock)
534 { 534 {
535 spin_unlock_irqrestore(&rq->lock, *flags); 535 spin_unlock_irqrestore(&rq->lock, *flags);
536 } 536 }
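The lock/recheck/retry dance above is needed because a task may migrate between the task_rq(p) lookup and spin_lock(); once the lock is held and the recheck passes, the task is pinned to that runqueue. A minimal usage sketch (illustrative only; the function name is hypothetical):

static int task_cpu_pinned(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;
	int cpu;

	rq = task_rq_lock(p, &flags);	/* p cannot switch runqueues while held */
	cpu = cpu_of(rq);
	task_rq_unlock(rq, &flags);

	return cpu;
}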
537 537
538 /* 538 /*
539 * this_rq_lock - lock this runqueue and disable interrupts. 539 * this_rq_lock - lock this runqueue and disable interrupts.
540 */ 540 */
541 static inline struct rq *this_rq_lock(void) 541 static inline struct rq *this_rq_lock(void)
542 __acquires(rq->lock) 542 __acquires(rq->lock)
543 { 543 {
544 struct rq *rq; 544 struct rq *rq;
545 545
546 local_irq_disable(); 546 local_irq_disable();
547 rq = this_rq(); 547 rq = this_rq();
548 spin_lock(&rq->lock); 548 spin_lock(&rq->lock);
549 549
550 return rq; 550 return rq;
551 } 551 }
552 552
553 /* 553 /*
554 * CPU frequency is/was unstable - start anew by setting prev_clock_raw: 554 * CPU frequency is/was unstable - start anew by setting prev_clock_raw:
555 */ 555 */
556 void sched_clock_unstable_event(void) 556 void sched_clock_unstable_event(void)
557 { 557 {
558 unsigned long flags; 558 unsigned long flags;
559 struct rq *rq; 559 struct rq *rq;
560 560
561 rq = task_rq_lock(current, &flags); 561 rq = task_rq_lock(current, &flags);
562 rq->prev_clock_raw = sched_clock(); 562 rq->prev_clock_raw = sched_clock();
563 rq->clock_unstable_events++; 563 rq->clock_unstable_events++;
564 task_rq_unlock(rq, &flags); 564 task_rq_unlock(rq, &flags);
565 } 565 }
566 566
567 /* 567 /*
568 * resched_task - mark a task 'to be rescheduled now'. 568 * resched_task - mark a task 'to be rescheduled now'.
569 * 569 *
570 * On UP this means the setting of the need_resched flag, on SMP it 570 * On UP this means the setting of the need_resched flag, on SMP it
571 * might also involve a cross-CPU call to trigger the scheduler on 571 * might also involve a cross-CPU call to trigger the scheduler on
572 * the target CPU. 572 * the target CPU.
573 */ 573 */
574 #ifdef CONFIG_SMP 574 #ifdef CONFIG_SMP
575 575
576 #ifndef tsk_is_polling 576 #ifndef tsk_is_polling
577 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 577 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
578 #endif 578 #endif
579 579
580 static void resched_task(struct task_struct *p) 580 static void resched_task(struct task_struct *p)
581 { 581 {
582 int cpu; 582 int cpu;
583 583
584 assert_spin_locked(&task_rq(p)->lock); 584 assert_spin_locked(&task_rq(p)->lock);
585 585
586 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) 586 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
587 return; 587 return;
588 588
589 set_tsk_thread_flag(p, TIF_NEED_RESCHED); 589 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
590 590
591 cpu = task_cpu(p); 591 cpu = task_cpu(p);
592 if (cpu == smp_processor_id()) 592 if (cpu == smp_processor_id())
593 return; 593 return;
594 594
595 /* NEED_RESCHED must be visible before we test polling */ 595 /* NEED_RESCHED must be visible before we test polling */
596 smp_mb(); 596 smp_mb();
597 if (!tsk_is_polling(p)) 597 if (!tsk_is_polling(p))
598 smp_send_reschedule(cpu); 598 smp_send_reschedule(cpu);
599 } 599 }
600 600
601 static void resched_cpu(int cpu) 601 static void resched_cpu(int cpu)
602 { 602 {
603 struct rq *rq = cpu_rq(cpu); 603 struct rq *rq = cpu_rq(cpu);
604 unsigned long flags; 604 unsigned long flags;
605 605
606 if (!spin_trylock_irqsave(&rq->lock, flags)) 606 if (!spin_trylock_irqsave(&rq->lock, flags))
607 return; 607 return;
608 resched_task(cpu_curr(cpu)); 608 resched_task(cpu_curr(cpu));
609 spin_unlock_irqrestore(&rq->lock, flags); 609 spin_unlock_irqrestore(&rq->lock, flags);
610 } 610 }
611 #else 611 #else
612 static inline void resched_task(struct task_struct *p) 612 static inline void resched_task(struct task_struct *p)
613 { 613 {
614 assert_spin_locked(&task_rq(p)->lock); 614 assert_spin_locked(&task_rq(p)->lock);
615 set_tsk_need_resched(p); 615 set_tsk_need_resched(p);
616 } 616 }
617 #endif 617 #endif
618 618
619 static u64 div64_likely32(u64 divident, unsigned long divisor) 619 static u64 div64_likely32(u64 divident, unsigned long divisor)
620 { 620 {
621 #if BITS_PER_LONG == 32 621 #if BITS_PER_LONG == 32
622 if (likely(divident <= 0xffffffffULL)) 622 if (likely(divident <= 0xffffffffULL))
623 return (u32)divident / divisor; 623 return (u32)divident / divisor;
624 do_div(divident, divisor); 624 do_div(divident, divisor);
625 625
626 return divident; 626 return divident;
627 #else 627 #else
628 return divident / divisor; 628 return divident / divisor;
629 #endif 629 #endif
630 } 630 }
631 631
632 #if BITS_PER_LONG == 32 632 #if BITS_PER_LONG == 32
633 # define WMULT_CONST (~0UL) 633 # define WMULT_CONST (~0UL)
634 #else 634 #else
635 # define WMULT_CONST (1UL << 32) 635 # define WMULT_CONST (1UL << 32)
636 #endif 636 #endif
637 637
638 #define WMULT_SHIFT 32 638 #define WMULT_SHIFT 32
639 639
640 static unsigned long 640 static unsigned long
641 calc_delta_mine(unsigned long delta_exec, unsigned long weight, 641 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
642 struct load_weight *lw) 642 struct load_weight *lw)
643 { 643 {
644 u64 tmp; 644 u64 tmp;
645 645
646 if (unlikely(!lw->inv_weight)) 646 if (unlikely(!lw->inv_weight))
647 lw->inv_weight = WMULT_CONST / lw->weight; 647 lw->inv_weight = WMULT_CONST / lw->weight;
648 648
649 tmp = (u64)delta_exec * weight; 649 tmp = (u64)delta_exec * weight;
650 /* 650 /*
651 * Check whether we'd overflow the 64-bit multiplication: 651 * Check whether we'd overflow the 64-bit multiplication:
652 */ 652 */
653 if (unlikely(tmp > WMULT_CONST)) { 653 if (unlikely(tmp > WMULT_CONST)) {
654 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight) 654 tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
655 >> (WMULT_SHIFT/2); 655 >> (WMULT_SHIFT/2);
656 } else { 656 } else {
657 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT; 657 tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
658 } 658 }
659 659
660 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 660 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
661 } 661 }
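In effect this computes delta_exec * weight / lw->weight via the cached 2^32 / weight inverse. A worked example with illustrative numbers: a nice-0 entity (weight 1024) on a runqueue whose total load is 2048 (two nice-0 tasks), charged delta_exec = 1000000 ns:

	inv_weight = 2^32 / 2048              = 2097152
	tmp        = 1000000 * 1024           = 1024000000   (below WMULT_CONST, so no overflow path)
	result     = (tmp * inv_weight) >> 32 = 1024000000 / 2048 = 500000 ns

i.e. the entity is credited half of the elapsed time, matching its half share of the runqueue weight.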
662 662
663 static inline unsigned long 663 static inline unsigned long
664 calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) 664 calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
665 { 665 {
666 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); 666 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
667 } 667 }
668 668
669 static void update_load_add(struct load_weight *lw, unsigned long inc) 669 static void update_load_add(struct load_weight *lw, unsigned long inc)
670 { 670 {
671 lw->weight += inc; 671 lw->weight += inc;
672 lw->inv_weight = 0; 672 lw->inv_weight = 0;
673 } 673 }
674 674
675 static void update_load_sub(struct load_weight *lw, unsigned long dec) 675 static void update_load_sub(struct load_weight *lw, unsigned long dec)
676 { 676 {
677 lw->weight -= dec; 677 lw->weight -= dec;
678 lw->inv_weight = 0; 678 lw->inv_weight = 0;
679 } 679 }
680 680
681 /* 681 /*
682 * To aid in avoiding the subversion of "niceness" due to uneven distribution 682 * To aid in avoiding the subversion of "niceness" due to uneven distribution
683 * of tasks with abnormal "nice" values across CPUs, the contribution that 683 * of tasks with abnormal "nice" values across CPUs, the contribution that
684 * each task makes to its run queue's load is weighted according to its 684 * each task makes to its run queue's load is weighted according to its
685 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a 685 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
686 * scaled version of the new time slice allocation that they receive on time 686 * scaled version of the new time slice allocation that they receive on time
687 * slice expiry etc. 687 * slice expiry etc.
688 */ 688 */
689 689
690 #define WEIGHT_IDLEPRIO 2 690 #define WEIGHT_IDLEPRIO 2
691 #define WMULT_IDLEPRIO (1 << 31) 691 #define WMULT_IDLEPRIO (1 << 31)
692 692
693 /* 693 /*
694 * Nice levels are multiplicative, with a gentle 10% change for every 694 * Nice levels are multiplicative, with a gentle 10% change for every
695 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to 695 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
696 * nice 1, it will get ~10% less CPU time than another CPU-bound task 696 * nice 1, it will get ~10% less CPU time than another CPU-bound task
697 * that remained on nice 0. 697 * that remained on nice 0.
698 * 698 *
699 * The "10% effect" is relative and cumulative: from _any_ nice level, 699 * The "10% effect" is relative and cumulative: from _any_ nice level,
700 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level 700 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
701 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. 701 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
702 * If a task goes up by ~10% and another task goes down by ~10% then 702 * If a task goes up by ~10% and another task goes down by ~10% then
703 * the relative distance between them is ~25%.) 703 * the relative distance between them is ~25%.)
704 */ 704 */
705 static const int prio_to_weight[40] = { 705 static const int prio_to_weight[40] = {
706 /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921, 706 /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
707 /* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280, 707 /* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280,
708 /* 0 */ NICE_0_LOAD /* 1024 */, 708 /* 0 */ NICE_0_LOAD /* 1024 */,
709 /* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137, 709 /* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137,
710 /* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15, 710 /* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
711 }; 711 };
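
The ~1.25 step between adjacent entries is easy to verify from the nice-0 anchor
(a quick worked check, rounding to the nearest table value):

    1024 * 1.25  = 1280   (nice -1)
    1024 / 1.25 ~=  819   (nice +1)
     819 / 1.25 ~=  655   (nice +2)
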
712 712
713 /* 713 /*
714 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. 714 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
715 * 715 *
716 * In cases where the weight does not change often, we can use the 716 * In cases where the weight does not change often, we can use the
717 * precalculated inverse to speed up arithmetics by turning divisions 717 * precalculated inverse to speed up arithmetics by turning divisions
718 * into multiplications: 718 * into multiplications:
719 */ 719 */
720 static const u32 prio_to_wmult[40] = { 720 static const u32 prio_to_wmult[40] = {
721 /* -20 */ 48356, 60446, 75558, 94446, 118058, 721 /* -20 */ 48356, 60446, 75558, 94446, 118058,
722 /* -15 */ 147573, 184467, 230589, 288233, 360285, 722 /* -15 */ 147573, 184467, 230589, 288233, 360285,
723 /* -10 */ 450347, 562979, 703746, 879575, 1099582, 723 /* -10 */ 450347, 562979, 703746, 879575, 1099582,
724 /* -5 */ 1374389, 1717986, 2147483, 2684354, 3355443, 724 /* -5 */ 1374389, 1717986, 2147483, 2684354, 3355443,
725 /* 0 */ 4194304, 5244160, 6557201, 8196502, 10250518, 725 /* 0 */ 4194304, 5244160, 6557201, 8196502, 10250518,
726 /* 5 */ 12782640, 16025997, 19976592, 24970740, 31350126, 726 /* 5 */ 12782640, 16025997, 19976592, 24970740, 31350126,
727 /* 10 */ 39045157, 49367440, 61356675, 76695844, 95443717, 727 /* 10 */ 39045157, 49367440, 61356675, 76695844, 95443717,
728 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 728 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
729 }; 729 };
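
Each entry is simply 2^32 divided by the matching prio_to_weight[] entry, e.g.
2^32 / 1024 = 4194304 for nice 0. A small userspace spot-check over a few entries
(illustration only, not part of this patch):

    /* Illustration only: spot-check prio_to_wmult[i] == 2^32 / prio_to_weight[i]. */
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        const uint32_t weight[] = { 88818, 1024, 15 };           /* nice -20, 0, +19 */
        const uint32_t wmult[]  = { 48356, 4194304, 286331153 };
        int i;

        for (i = 0; i < 3; i++)
            assert(wmult[i] == (uint32_t)((1ULL << 32) / weight[i]));
        return 0;
    }
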
730 730
731 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); 731 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
732 732
733 /* 733 /*
734 * runqueue iterator, to support SMP load-balancing between different 734 * runqueue iterator, to support SMP load-balancing between different
735 * scheduling classes, without having to expose their internal data 735 * scheduling classes, without having to expose their internal data
736 * structures to the load-balancing proper: 736 * structures to the load-balancing proper:
737 */ 737 */
738 struct rq_iterator { 738 struct rq_iterator {
739 void *arg; 739 void *arg;
740 struct task_struct *(*start)(void *); 740 struct task_struct *(*start)(void *);
741 struct task_struct *(*next)(void *); 741 struct task_struct *(*next)(void *);
742 }; 742 };
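
balance_tasks() only ever calls start() once and then next() until it returns NULL,
threading arg through both callbacks, so a scheduling class can walk any private
structure it likes. A standalone sketch of that calling pattern in plain C follows;
the names and types are invented for illustration (the real iterators live in
sched_fair.c and sched_rt.c):

    /* Illustration only: the start()/next() iterator pattern, userspace C. */
    #include <stdio.h>
    #include <stddef.h>

    struct item { int weight; };

    struct iter {
        void *arg;
        struct item *(*start)(void *);
        struct item *(*next)(void *);
    };

    struct array_queue { struct item *items; size_t len, pos; };

    static struct item *array_next(void *arg)
    {
        struct array_queue *q = arg;
        return q->pos < q->len ? &q->items[q->pos++] : NULL;
    }

    static struct item *array_start(void *arg)
    {
        struct array_queue *q = arg;
        q->pos = 0;
        return array_next(arg);
    }

    int main(void)
    {
        struct item items[] = { { 1024 }, { 820 }, { 15 } };
        struct array_queue q = { items, 3, 0 };
        struct iter it = { &q, array_start, array_next };
        struct item *p;

        /* A balance_tasks()-style consumer walks until next() returns NULL: */
        for (p = it.start(it.arg); p; p = it.next(it.arg))
            printf("weight %d\n", p->weight);
        return 0;
    }
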
743 743
744 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 744 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
745 unsigned long max_nr_move, unsigned long max_load_move, 745 unsigned long max_nr_move, unsigned long max_load_move,
746 struct sched_domain *sd, enum cpu_idle_type idle, 746 struct sched_domain *sd, enum cpu_idle_type idle,
747 int *all_pinned, unsigned long *load_moved, 747 int *all_pinned, unsigned long *load_moved,
748 int this_best_prio, int best_prio, int best_prio_seen, 748 int this_best_prio, int best_prio, int best_prio_seen,
749 struct rq_iterator *iterator); 749 struct rq_iterator *iterator);
750 750
751 #include "sched_stats.h" 751 #include "sched_stats.h"
752 #include "sched_rt.c" 752 #include "sched_rt.c"
753 #include "sched_fair.c" 753 #include "sched_fair.c"
754 #include "sched_idletask.c" 754 #include "sched_idletask.c"
755 #ifdef CONFIG_SCHED_DEBUG 755 #ifdef CONFIG_SCHED_DEBUG
756 # include "sched_debug.c" 756 # include "sched_debug.c"
757 #endif 757 #endif
758 758
759 #define sched_class_highest (&rt_sched_class) 759 #define sched_class_highest (&rt_sched_class)
760 760
761 static void __update_curr_load(struct rq *rq, struct load_stat *ls) 761 static void __update_curr_load(struct rq *rq, struct load_stat *ls)
762 { 762 {
763 if (rq->curr != rq->idle && ls->load.weight) { 763 if (rq->curr != rq->idle && ls->load.weight) {
764 ls->delta_exec += ls->delta_stat; 764 ls->delta_exec += ls->delta_stat;
765 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); 765 ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
766 ls->delta_stat = 0; 766 ls->delta_stat = 0;
767 } 767 }
768 } 768 }
769 769
770 /* 770 /*
771 * Update delta_exec, delta_fair fields for rq. 771 * Update delta_exec, delta_fair fields for rq.
772 * 772 *
773 * delta_fair clock advances at a rate inversely proportional to 773 * delta_fair clock advances at a rate inversely proportional to
774 * total load (rq->ls.load.weight) on the runqueue, while 774 * total load (rq->ls.load.weight) on the runqueue, while
775 * delta_exec advances at the same rate as wall-clock (provided 775 * delta_exec advances at the same rate as wall-clock (provided
776 * cpu is not idle). 776 * cpu is not idle).
777 * 777 *
778 * delta_exec / delta_fair is a measure of the (smoothened) load on this 778 * delta_exec / delta_fair is a measure of the (smoothened) load on this
779 * runqueue over any given interval. This (smoothened) load is used 779 * runqueue over any given interval. This (smoothened) load is used
780 * during load balance. 780 * during load balance.
781 * 781 *
782 * This function is called /before/ updating rq->ls.load 782 * This function is called /before/ updating rq->ls.load
783 * and when switching tasks. 783 * and when switching tasks.
784 */ 784 */
785 static void update_curr_load(struct rq *rq, u64 now) 785 static void update_curr_load(struct rq *rq, u64 now)
786 { 786 {
787 struct load_stat *ls = &rq->ls; 787 struct load_stat *ls = &rq->ls;
788 u64 start; 788 u64 start;
789 789
790 start = ls->load_update_start; 790 start = ls->load_update_start;
791 ls->load_update_start = now; 791 ls->load_update_start = now;
792 ls->delta_stat += now - start; 792 ls->delta_stat += now - start;
793 /* 793 /*
794 * Stagger updates to ls->delta_fair. Very frequent updates 794 * Stagger updates to ls->delta_fair. Very frequent updates
795 * can be expensive. 795 * can be expensive.
796 */ 796 */
797 if (ls->delta_stat >= sysctl_sched_stat_granularity) 797 if (ls->delta_stat >= sysctl_sched_stat_granularity)
798 __update_curr_load(rq, ls); 798 __update_curr_load(rq, ls);
799 } 799 }
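
Put differently, delta_fair is delta_exec scaled by NICE_0_LOAD / load.weight, so the
ratio delta_exec / delta_fair recovers the average runqueue weight in units of
NICE_0_LOAD. A worked example with hypothetical numbers:

    delta_exec  = 10,000,000 ns                    (one busy 10 ms interval)
    load.weight = 2048 = 2 * NICE_0_LOAD           (two nice-0 tasks queued)
    delta_fair ~= 10,000,000 * 1024 / 2048 = 5,000,000 ns
    delta_exec / delta_fair ~= 2                   (i.e. the load of two nice-0 tasks)
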
800 800
801 static inline void 801 static inline void
802 inc_load(struct rq *rq, const struct task_struct *p, u64 now) 802 inc_load(struct rq *rq, const struct task_struct *p, u64 now)
803 { 803 {
804 update_curr_load(rq, now); 804 update_curr_load(rq, now);
805 update_load_add(&rq->ls.load, p->se.load.weight); 805 update_load_add(&rq->ls.load, p->se.load.weight);
806 } 806 }
807 807
808 static inline void 808 static inline void
809 dec_load(struct rq *rq, const struct task_struct *p, u64 now) 809 dec_load(struct rq *rq, const struct task_struct *p, u64 now)
810 { 810 {
811 update_curr_load(rq, now); 811 update_curr_load(rq, now);
812 update_load_sub(&rq->ls.load, p->se.load.weight); 812 update_load_sub(&rq->ls.load, p->se.load.weight);
813 } 813 }
814 814
815 static void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now) 815 static void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
816 { 816 {
817 rq->nr_running++; 817 rq->nr_running++;
818 inc_load(rq, p, now); 818 inc_load(rq, p, now);
819 } 819 }
820 820
821 static void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now) 821 static void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
822 { 822 {
823 rq->nr_running--; 823 rq->nr_running--;
824 dec_load(rq, p, now); 824 dec_load(rq, p, now);
825 } 825 }
826 826
827 static void set_load_weight(struct task_struct *p) 827 static void set_load_weight(struct task_struct *p)
828 { 828 {
829 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime; 829 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
830 p->se.wait_runtime = 0; 830 p->se.wait_runtime = 0;
831 831
832 if (task_has_rt_policy(p)) { 832 if (task_has_rt_policy(p)) {
833 p->se.load.weight = prio_to_weight[0] * 2; 833 p->se.load.weight = prio_to_weight[0] * 2;
834 p->se.load.inv_weight = prio_to_wmult[0] >> 1; 834 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
835 return; 835 return;
836 } 836 }
837 837
838 /* 838 /*
839 * SCHED_IDLE tasks get minimal weight: 839 * SCHED_IDLE tasks get minimal weight:
840 */ 840 */
841 if (p->policy == SCHED_IDLE) { 841 if (p->policy == SCHED_IDLE) {
842 p->se.load.weight = WEIGHT_IDLEPRIO; 842 p->se.load.weight = WEIGHT_IDLEPRIO;
843 p->se.load.inv_weight = WMULT_IDLEPRIO; 843 p->se.load.inv_weight = WMULT_IDLEPRIO;
844 return; 844 return;
845 } 845 }
846 846
847 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; 847 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
848 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 848 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
849 } 849 }
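
The array index is static_prio - MAX_RT_PRIO, i.e. 20 + nice with the usual
MAX_RT_PRIO of 100, so the lookups cover exactly the 40 nice levels. Worked values,
read straight from the tables above (the RT case doubles the heaviest entry):

    nice -20: index  0 -> weight 88818, inv_weight 48356
    nice   0: index 20 -> weight  1024, inv_weight 4194304
    nice +19: index 39 -> weight    15, inv_weight 286331153
    RT tasks:            weight 2 * 88818 = 177636, inv_weight 48356 / 2 = 24178
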
850 850
851 static void 851 static void
852 enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now) 852 enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
853 { 853 {
854 sched_info_queued(p); 854 sched_info_queued(p);
855 p->sched_class->enqueue_task(rq, p, wakeup, now); 855 p->sched_class->enqueue_task(rq, p, wakeup, now);
856 p->se.on_rq = 1; 856 p->se.on_rq = 1;
857 } 857 }
858 858
859 static void 859 static void
860 dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now) 860 dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
861 { 861 {
862 p->sched_class->dequeue_task(rq, p, sleep, now); 862 p->sched_class->dequeue_task(rq, p, sleep, now);
863 p->se.on_rq = 0; 863 p->se.on_rq = 0;
864 } 864 }
865 865
866 /* 866 /*
867 * __normal_prio - return the priority that is based on the static prio 867 * __normal_prio - return the priority that is based on the static prio
868 */ 868 */
869 static inline int __normal_prio(struct task_struct *p) 869 static inline int __normal_prio(struct task_struct *p)
870 { 870 {
871 return p->static_prio; 871 return p->static_prio;
872 } 872 }
873 873
874 /* 874 /*
875 * Calculate the expected normal priority: i.e. priority 875 * Calculate the expected normal priority: i.e. priority
876 * without taking RT-inheritance into account. Might be 876 * without taking RT-inheritance into account. Might be
877 * boosted by interactivity modifiers. Changes upon fork, 877 * boosted by interactivity modifiers. Changes upon fork,
878 * setprio syscalls, and whenever the interactivity 878 * setprio syscalls, and whenever the interactivity
879 * estimator recalculates. 879 * estimator recalculates.
880 */ 880 */
881 static inline int normal_prio(struct task_struct *p) 881 static inline int normal_prio(struct task_struct *p)
882 { 882 {
883 int prio; 883 int prio;
884 884
885 if (task_has_rt_policy(p)) 885 if (task_has_rt_policy(p))
886 prio = MAX_RT_PRIO-1 - p->rt_priority; 886 prio = MAX_RT_PRIO-1 - p->rt_priority;
887 else 887 else
888 prio = __normal_prio(p); 888 prio = __normal_prio(p);
889 return prio; 889 return prio;
890 } 890 }
891 891
892 /* 892 /*
893 * Calculate the current priority, i.e. the priority 893 * Calculate the current priority, i.e. the priority
894 * taken into account by the scheduler. This value might 894 * taken into account by the scheduler. This value might
895 * be boosted by RT tasks, or might be boosted by 895 * be boosted by RT tasks, or might be boosted by
896 * interactivity modifiers. Will be RT if the task got 896 * interactivity modifiers. Will be RT if the task got
897 * RT-boosted. If not then it returns p->normal_prio. 897 * RT-boosted. If not then it returns p->normal_prio.
898 */ 898 */
899 static int effective_prio(struct task_struct *p) 899 static int effective_prio(struct task_struct *p)
900 { 900 {
901 p->normal_prio = normal_prio(p); 901 p->normal_prio = normal_prio(p);
902 /* 902 /*
903 * If we are RT tasks or we were boosted to RT priority, 903 * If we are RT tasks or we were boosted to RT priority,
904 * keep the priority unchanged. Otherwise, update priority 904 * keep the priority unchanged. Otherwise, update priority
905 * to the normal priority: 905 * to the normal priority:
906 */ 906 */
907 if (!rt_prio(p->prio)) 907 if (!rt_prio(p->prio))
908 return p->normal_prio; 908 return p->normal_prio;
909 return p->prio; 909 return p->prio;
910 } 910 }
911 911
912 /* 912 /*
913 * activate_task - move a task to the runqueue. 913 * activate_task - move a task to the runqueue.
914 */ 914 */
915 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) 915 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
916 { 916 {
917 u64 now = rq_clock(rq); 917 u64 now = rq_clock(rq);
918 918
919 if (p->state == TASK_UNINTERRUPTIBLE) 919 if (p->state == TASK_UNINTERRUPTIBLE)
920 rq->nr_uninterruptible--; 920 rq->nr_uninterruptible--;
921 921
922 enqueue_task(rq, p, wakeup, now); 922 enqueue_task(rq, p, wakeup, now);
923 inc_nr_running(p, rq, now); 923 inc_nr_running(p, rq, now);
924 } 924 }
925 925
926 /* 926 /*
927 * activate_idle_task - move the idle task to the _front_ of the runqueue. 927 * activate_idle_task - move the idle task to the _front_ of the runqueue.
928 */ 928 */
929 static inline void activate_idle_task(struct task_struct *p, struct rq *rq) 929 static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
930 { 930 {
931 u64 now = rq_clock(rq); 931 u64 now = rq_clock(rq);
932 932
933 if (p->state == TASK_UNINTERRUPTIBLE) 933 if (p->state == TASK_UNINTERRUPTIBLE)
934 rq->nr_uninterruptible--; 934 rq->nr_uninterruptible--;
935 935
936 enqueue_task(rq, p, 0, now); 936 enqueue_task(rq, p, 0, now);
937 inc_nr_running(p, rq, now); 937 inc_nr_running(p, rq, now);
938 } 938 }
939 939
940 /* 940 /*
941 * deactivate_task - remove a task from the runqueue. 941 * deactivate_task - remove a task from the runqueue.
942 */ 942 */
943 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) 943 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
944 { 944 {
945 u64 now = rq_clock(rq); 945 u64 now = rq_clock(rq);
946 946
947 if (p->state == TASK_UNINTERRUPTIBLE) 947 if (p->state == TASK_UNINTERRUPTIBLE)
948 rq->nr_uninterruptible++; 948 rq->nr_uninterruptible++;
949 949
950 dequeue_task(rq, p, sleep, now); 950 dequeue_task(rq, p, sleep, now);
951 dec_nr_running(p, rq, now); 951 dec_nr_running(p, rq, now);
952 } 952 }
953 953
954 /** 954 /**
955 * task_curr - is this task currently executing on a CPU? 955 * task_curr - is this task currently executing on a CPU?
956 * @p: the task in question. 956 * @p: the task in question.
957 */ 957 */
958 inline int task_curr(const struct task_struct *p) 958 inline int task_curr(const struct task_struct *p)
959 { 959 {
960 return cpu_curr(task_cpu(p)) == p; 960 return cpu_curr(task_cpu(p)) == p;
961 } 961 }
962 962
963 /* Used instead of source_load when we know the type == 0 */ 963 /* Used instead of source_load when we know the type == 0 */
964 unsigned long weighted_cpuload(const int cpu) 964 unsigned long weighted_cpuload(const int cpu)
965 { 965 {
966 return cpu_rq(cpu)->ls.load.weight; 966 return cpu_rq(cpu)->ls.load.weight;
967 } 967 }
968 968
969 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 969 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
970 { 970 {
971 #ifdef CONFIG_SMP 971 #ifdef CONFIG_SMP
972 task_thread_info(p)->cpu = cpu; 972 task_thread_info(p)->cpu = cpu;
973 set_task_cfs_rq(p); 973 set_task_cfs_rq(p);
974 #endif 974 #endif
975 } 975 }
976 976
977 #ifdef CONFIG_SMP 977 #ifdef CONFIG_SMP
978 978
979 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 979 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
980 { 980 {
981 int old_cpu = task_cpu(p); 981 int old_cpu = task_cpu(p);
982 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); 982 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
983 u64 clock_offset, fair_clock_offset; 983 u64 clock_offset, fair_clock_offset;
984 984
985 clock_offset = old_rq->clock - new_rq->clock; 985 clock_offset = old_rq->clock - new_rq->clock;
986 fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock; 986 fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock;
987 987
988 if (p->se.wait_start_fair) 988 if (p->se.wait_start_fair)
989 p->se.wait_start_fair -= fair_clock_offset; 989 p->se.wait_start_fair -= fair_clock_offset;
990 if (p->se.sleep_start_fair) 990 if (p->se.sleep_start_fair)
991 p->se.sleep_start_fair -= fair_clock_offset; 991 p->se.sleep_start_fair -= fair_clock_offset;
992 992
993 #ifdef CONFIG_SCHEDSTATS 993 #ifdef CONFIG_SCHEDSTATS
994 if (p->se.wait_start) 994 if (p->se.wait_start)
995 p->se.wait_start -= clock_offset; 995 p->se.wait_start -= clock_offset;
996 if (p->se.sleep_start) 996 if (p->se.sleep_start)
997 p->se.sleep_start -= clock_offset; 997 p->se.sleep_start -= clock_offset;
998 if (p->se.block_start) 998 if (p->se.block_start)
999 p->se.block_start -= clock_offset; 999 p->se.block_start -= clock_offset;
1000 #endif 1000 #endif
1001 1001
1002 __set_task_cpu(p, new_cpu); 1002 __set_task_cpu(p, new_cpu);
1003 } 1003 }
1004 1004
1005 struct migration_req { 1005 struct migration_req {
1006 struct list_head list; 1006 struct list_head list;
1007 1007
1008 struct task_struct *task; 1008 struct task_struct *task;
1009 int dest_cpu; 1009 int dest_cpu;
1010 1010
1011 struct completion done; 1011 struct completion done;
1012 }; 1012 };
1013 1013
1014 /* 1014 /*
1015 * The task's runqueue lock must be held. 1015 * The task's runqueue lock must be held.
1016 * Returns true if you have to wait for migration thread. 1016 * Returns true if you have to wait for migration thread.
1017 */ 1017 */
1018 static int 1018 static int
1019 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) 1019 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1020 { 1020 {
1021 struct rq *rq = task_rq(p); 1021 struct rq *rq = task_rq(p);
1022 1022
1023 /* 1023 /*
1024 * If the task is not on a runqueue (and not running), then 1024 * If the task is not on a runqueue (and not running), then
1025 * it is sufficient to simply update the task's cpu field. 1025 * it is sufficient to simply update the task's cpu field.
1026 */ 1026 */
1027 if (!p->se.on_rq && !task_running(rq, p)) { 1027 if (!p->se.on_rq && !task_running(rq, p)) {
1028 set_task_cpu(p, dest_cpu); 1028 set_task_cpu(p, dest_cpu);
1029 return 0; 1029 return 0;
1030 } 1030 }
1031 1031
1032 init_completion(&req->done); 1032 init_completion(&req->done);
1033 req->task = p; 1033 req->task = p;
1034 req->dest_cpu = dest_cpu; 1034 req->dest_cpu = dest_cpu;
1035 list_add(&req->list, &rq->migration_queue); 1035 list_add(&req->list, &rq->migration_queue);
1036 1036
1037 return 1; 1037 return 1;
1038 } 1038 }
1039 1039
1040 /* 1040 /*
1041 * wait_task_inactive - wait for a thread to unschedule. 1041 * wait_task_inactive - wait for a thread to unschedule.
1042 * 1042 *
1043 * The caller must ensure that the task *will* unschedule sometime soon, 1043 * The caller must ensure that the task *will* unschedule sometime soon,
1044 * else this function might spin for a *long* time. This function can't 1044 * else this function might spin for a *long* time. This function can't
1045 * be called with interrupts off, or it may introduce deadlock with 1045 * be called with interrupts off, or it may introduce deadlock with
1046 * smp_call_function() if an IPI is sent by the same process we are 1046 * smp_call_function() if an IPI is sent by the same process we are
1047 * waiting to become inactive. 1047 * waiting to become inactive.
1048 */ 1048 */
1049 void wait_task_inactive(struct task_struct *p) 1049 void wait_task_inactive(struct task_struct *p)
1050 { 1050 {
1051 unsigned long flags; 1051 unsigned long flags;
1052 int running, on_rq; 1052 int running, on_rq;
1053 struct rq *rq; 1053 struct rq *rq;
1054 1054
1055 repeat: 1055 repeat:
1056 /* 1056 /*
1057 * We do the initial early heuristics without holding 1057 * We do the initial early heuristics without holding
1058 * any task-queue locks at all. We'll only try to get 1058 * any task-queue locks at all. We'll only try to get
1059 * the runqueue lock when things look like they will 1059 * the runqueue lock when things look like they will
1060 * work out! 1060 * work out!
1061 */ 1061 */
1062 rq = task_rq(p); 1062 rq = task_rq(p);
1063 1063
1064 /* 1064 /*
1065 * If the task is actively running on another CPU 1065 * If the task is actively running on another CPU
1066 * still, just relax and busy-wait without holding 1066 * still, just relax and busy-wait without holding
1067 * any locks. 1067 * any locks.
1068 * 1068 *
1069 * NOTE! Since we don't hold any locks, it's not 1069 * NOTE! Since we don't hold any locks, it's not
1070 * even certain that "rq" stays the right runqueue! 1070 * even certain that "rq" stays the right runqueue!
1071 * But we don't care, since "task_running()" will 1071 * But we don't care, since "task_running()" will
1072 * return false if the runqueue has changed and p 1072 * return false if the runqueue has changed and p
1073 * is actually now running somewhere else! 1073 * is actually now running somewhere else!
1074 */ 1074 */
1075 while (task_running(rq, p)) 1075 while (task_running(rq, p))
1076 cpu_relax(); 1076 cpu_relax();
1077 1077
1078 /* 1078 /*
1079 * Ok, time to look more closely! We need the rq 1079 * Ok, time to look more closely! We need the rq
1080 * lock now, to be *sure*. If we're wrong, we'll 1080 * lock now, to be *sure*. If we're wrong, we'll
1081 * just go back and repeat. 1081 * just go back and repeat.
1082 */ 1082 */
1083 rq = task_rq_lock(p, &flags); 1083 rq = task_rq_lock(p, &flags);
1084 running = task_running(rq, p); 1084 running = task_running(rq, p);
1085 on_rq = p->se.on_rq; 1085 on_rq = p->se.on_rq;
1086 task_rq_unlock(rq, &flags); 1086 task_rq_unlock(rq, &flags);
1087 1087
1088 /* 1088 /*
1089 * Was it really running after all now that we 1089 * Was it really running after all now that we
1090 * checked with the proper locks actually held? 1090 * checked with the proper locks actually held?
1091 * 1091 *
1092 * Oops. Go back and try again.. 1092 * Oops. Go back and try again..
1093 */ 1093 */
1094 if (unlikely(running)) { 1094 if (unlikely(running)) {
1095 cpu_relax(); 1095 cpu_relax();
1096 goto repeat; 1096 goto repeat;
1097 } 1097 }
1098 1098
1099 /* 1099 /*
1100 * It's not enough that it's not actively running, 1100 * It's not enough that it's not actively running,
1101 * it must be off the runqueue _entirely_, and not 1101 * it must be off the runqueue _entirely_, and not
1102 * preempted! 1102 * preempted!
1103 * 1103 *
1104 * So if it was still runnable (but just not actively 1104 * So if it was still runnable (but just not actively
1105 * running right now), it's preempted, and we should 1105 * running right now), it's preempted, and we should
1106 * yield - it could be a while. 1106 * yield - it could be a while.
1107 */ 1107 */
1108 if (unlikely(on_rq)) { 1108 if (unlikely(on_rq)) {
1109 yield(); 1109 yield();
1110 goto repeat; 1110 goto repeat;
1111 } 1111 }
1112 1112
1113 /* 1113 /*
1114 * Ahh, all good. It wasn't running, and it wasn't 1114 * Ahh, all good. It wasn't running, and it wasn't
1115 * runnable, which means that it will never become 1115 * runnable, which means that it will never become
1116 * running in the future either. We're all done! 1116 * running in the future either. We're all done!
1117 */ 1117 */
1118 } 1118 }
1119 1119
1120 /*** 1120 /***
1121 * kick_process - kick a running thread to enter/exit the kernel 1121 * kick_process - kick a running thread to enter/exit the kernel
1122 * @p: the to-be-kicked thread 1122 * @p: the to-be-kicked thread
1123 * 1123 *
1124 * Cause a process which is running on another CPU to enter 1124 * Cause a process which is running on another CPU to enter
1125 * kernel-mode, without any delay. (to get signals handled.) 1125 * kernel-mode, without any delay. (to get signals handled.)
1126 * 1126 *
1127 * NOTE: this function doesn't have to take the runqueue lock, 1127 * NOTE: this function doesn't have to take the runqueue lock,
1128 * because all it wants to ensure is that the remote task enters 1128 * because all it wants to ensure is that the remote task enters
1129 * the kernel. If the IPI races and the task has been migrated 1129 * the kernel. If the IPI races and the task has been migrated
1130 * to another CPU then no harm is done and the purpose has been 1130 * to another CPU then no harm is done and the purpose has been
1131 * achieved as well. 1131 * achieved as well.
1132 */ 1132 */
1133 void kick_process(struct task_struct *p) 1133 void kick_process(struct task_struct *p)
1134 { 1134 {
1135 int cpu; 1135 int cpu;
1136 1136
1137 preempt_disable(); 1137 preempt_disable();
1138 cpu = task_cpu(p); 1138 cpu = task_cpu(p);
1139 if ((cpu != smp_processor_id()) && task_curr(p)) 1139 if ((cpu != smp_processor_id()) && task_curr(p))
1140 smp_send_reschedule(cpu); 1140 smp_send_reschedule(cpu);
1141 preempt_enable(); 1141 preempt_enable();
1142 } 1142 }
1143 1143
1144 /* 1144 /*
1145 * Return a low guess at the load of a migration-source cpu weighted 1145 * Return a low guess at the load of a migration-source cpu weighted
1146 * according to the scheduling class and "nice" value. 1146 * according to the scheduling class and "nice" value.
1147 * 1147 *
1148 * We want to under-estimate the load of migration sources, to 1148 * We want to under-estimate the load of migration sources, to
1149 * balance conservatively. 1149 * balance conservatively.
1150 */ 1150 */
1151 static inline unsigned long source_load(int cpu, int type) 1151 static inline unsigned long source_load(int cpu, int type)
1152 { 1152 {
1153 struct rq *rq = cpu_rq(cpu); 1153 struct rq *rq = cpu_rq(cpu);
1154 unsigned long total = weighted_cpuload(cpu); 1154 unsigned long total = weighted_cpuload(cpu);
1155 1155
1156 if (type == 0) 1156 if (type == 0)
1157 return total; 1157 return total;
1158 1158
1159 return min(rq->cpu_load[type-1], total); 1159 return min(rq->cpu_load[type-1], total);
1160 } 1160 }
1161 1161
1162 /* 1162 /*
1163 * Return a high guess at the load of a migration-target cpu weighted 1163 * Return a high guess at the load of a migration-target cpu weighted
1164 * according to the scheduling class and "nice" value. 1164 * according to the scheduling class and "nice" value.
1165 */ 1165 */
1166 static inline unsigned long target_load(int cpu, int type) 1166 static inline unsigned long target_load(int cpu, int type)
1167 { 1167 {
1168 struct rq *rq = cpu_rq(cpu); 1168 struct rq *rq = cpu_rq(cpu);
1169 unsigned long total = weighted_cpuload(cpu); 1169 unsigned long total = weighted_cpuload(cpu);
1170 1170
1171 if (type == 0) 1171 if (type == 0)
1172 return total; 1172 return total;
1173 1173
1174 return max(rq->cpu_load[type-1], total); 1174 return max(rq->cpu_load[type-1], total);
1175 } 1175 }
1176 1176
1177 /* 1177 /*
1178 * Return the average load per task on the cpu's run queue 1178 * Return the average load per task on the cpu's run queue
1179 */ 1179 */
1180 static inline unsigned long cpu_avg_load_per_task(int cpu) 1180 static inline unsigned long cpu_avg_load_per_task(int cpu)
1181 { 1181 {
1182 struct rq *rq = cpu_rq(cpu); 1182 struct rq *rq = cpu_rq(cpu);
1183 unsigned long total = weighted_cpuload(cpu); 1183 unsigned long total = weighted_cpuload(cpu);
1184 unsigned long n = rq->nr_running; 1184 unsigned long n = rq->nr_running;
1185 1185
1186 return n ? total / n : SCHED_LOAD_SCALE; 1186 return n ? total / n : SCHED_LOAD_SCALE;
1187 } 1187 }
1188 1188
1189 /* 1189 /*
1190 * find_idlest_group finds and returns the least busy CPU group within the 1190 * find_idlest_group finds and returns the least busy CPU group within the
1191 * domain. 1191 * domain.
1192 */ 1192 */
1193 static struct sched_group * 1193 static struct sched_group *
1194 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) 1194 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1195 { 1195 {
1196 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; 1196 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1197 unsigned long min_load = ULONG_MAX, this_load = 0; 1197 unsigned long min_load = ULONG_MAX, this_load = 0;
1198 int load_idx = sd->forkexec_idx; 1198 int load_idx = sd->forkexec_idx;
1199 int imbalance = 100 + (sd->imbalance_pct-100)/2; 1199 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1200 1200
1201 do { 1201 do {
1202 unsigned long load, avg_load; 1202 unsigned long load, avg_load;
1203 int local_group; 1203 int local_group;
1204 int i; 1204 int i;
1205 1205
1206 /* Skip over this group if it has no CPUs allowed */ 1206 /* Skip over this group if it has no CPUs allowed */
1207 if (!cpus_intersects(group->cpumask, p->cpus_allowed)) 1207 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1208 goto nextgroup; 1208 goto nextgroup;
1209 1209
1210 local_group = cpu_isset(this_cpu, group->cpumask); 1210 local_group = cpu_isset(this_cpu, group->cpumask);
1211 1211
1212 /* Tally up the load of all CPUs in the group */ 1212 /* Tally up the load of all CPUs in the group */
1213 avg_load = 0; 1213 avg_load = 0;
1214 1214
1215 for_each_cpu_mask(i, group->cpumask) { 1215 for_each_cpu_mask(i, group->cpumask) {
1216 /* Bias balancing toward cpus of our domain */ 1216 /* Bias balancing toward cpus of our domain */
1217 if (local_group) 1217 if (local_group)
1218 load = source_load(i, load_idx); 1218 load = source_load(i, load_idx);
1219 else 1219 else
1220 load = target_load(i, load_idx); 1220 load = target_load(i, load_idx);
1221 1221
1222 avg_load += load; 1222 avg_load += load;
1223 } 1223 }
1224 1224
1225 /* Adjust by relative CPU power of the group */ 1225 /* Adjust by relative CPU power of the group */
1226 avg_load = sg_div_cpu_power(group, 1226 avg_load = sg_div_cpu_power(group,
1227 avg_load * SCHED_LOAD_SCALE); 1227 avg_load * SCHED_LOAD_SCALE);
1228 1228
1229 if (local_group) { 1229 if (local_group) {
1230 this_load = avg_load; 1230 this_load = avg_load;
1231 this = group; 1231 this = group;
1232 } else if (avg_load < min_load) { 1232 } else if (avg_load < min_load) {
1233 min_load = avg_load; 1233 min_load = avg_load;
1234 idlest = group; 1234 idlest = group;
1235 } 1235 }
1236 nextgroup: 1236 nextgroup:
1237 group = group->next; 1237 group = group->next;
1238 } while (group != sd->groups); 1238 } while (group != sd->groups);
1239 1239
1240 if (!idlest || 100*this_load < imbalance*min_load) 1240 if (!idlest || 100*this_load < imbalance*min_load)
1241 return NULL; 1241 return NULL;
1242 return idlest; 1242 return idlest;
1243 } 1243 }
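
The closing test keeps the task in the local group unless that group is clearly busier:
with a typical imbalance_pct of 125 the threshold becomes imbalance = 100 + 25/2 = 112,
so NULL ("stay local") is returned whenever 100 * this_load < 112 * min_load. A quick
worked example with made-up loads:

    imbalance_pct = 125  ->  imbalance = 112
    this_load = 2200, min_load = 2000:  220000 <  224000  -> return NULL (stay local)
    this_load = 2300, min_load = 2000:  230000 >= 224000  -> return the idlest group
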
1244 1244
1245 /* 1245 /*
1246 * find_idlest_cpu - find the idlest cpu among the cpus in group. 1246 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1247 */ 1247 */
1248 static int 1248 static int
1249 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 1249 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1250 { 1250 {
1251 cpumask_t tmp; 1251 cpumask_t tmp;
1252 unsigned long load, min_load = ULONG_MAX; 1252 unsigned long load, min_load = ULONG_MAX;
1253 int idlest = -1; 1253 int idlest = -1;
1254 int i; 1254 int i;
1255 1255
1256 /* Traverse only the allowed CPUs */ 1256 /* Traverse only the allowed CPUs */
1257 cpus_and(tmp, group->cpumask, p->cpus_allowed); 1257 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1258 1258
1259 for_each_cpu_mask(i, tmp) { 1259 for_each_cpu_mask(i, tmp) {
1260 load = weighted_cpuload(i); 1260 load = weighted_cpuload(i);
1261 1261
1262 if (load < min_load || (load == min_load && i == this_cpu)) { 1262 if (load < min_load || (load == min_load && i == this_cpu)) {
1263 min_load = load; 1263 min_load = load;
1264 idlest = i; 1264 idlest = i;
1265 } 1265 }
1266 } 1266 }
1267 1267
1268 return idlest; 1268 return idlest;
1269 } 1269 }
1270 1270
1271 /* 1271 /*
1272 * sched_balance_self: balance the current task (running on cpu) in domains 1272 * sched_balance_self: balance the current task (running on cpu) in domains
1273 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and 1273 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1274 * SD_BALANCE_EXEC. 1274 * SD_BALANCE_EXEC.
1275 * 1275 *
1276 * Balance, ie. select the least loaded group. 1276 * Balance, ie. select the least loaded group.
1277 * 1277 *
1278 * Returns the target CPU number, or the same CPU if no balancing is needed. 1278 * Returns the target CPU number, or the same CPU if no balancing is needed.
1279 * 1279 *
1280 * preempt must be disabled. 1280 * preempt must be disabled.
1281 */ 1281 */
1282 static int sched_balance_self(int cpu, int flag) 1282 static int sched_balance_self(int cpu, int flag)
1283 { 1283 {
1284 struct task_struct *t = current; 1284 struct task_struct *t = current;
1285 struct sched_domain *tmp, *sd = NULL; 1285 struct sched_domain *tmp, *sd = NULL;
1286 1286
1287 for_each_domain(cpu, tmp) { 1287 for_each_domain(cpu, tmp) {
1288 /* 1288 /*
1289 * If power savings logic is enabled for a domain, stop there. 1289 * If power savings logic is enabled for a domain, stop there.
1290 */ 1290 */
1291 if (tmp->flags & SD_POWERSAVINGS_BALANCE) 1291 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1292 break; 1292 break;
1293 if (tmp->flags & flag) 1293 if (tmp->flags & flag)
1294 sd = tmp; 1294 sd = tmp;
1295 } 1295 }
1296 1296
1297 while (sd) { 1297 while (sd) {
1298 cpumask_t span; 1298 cpumask_t span;
1299 struct sched_group *group; 1299 struct sched_group *group;
1300 int new_cpu, weight; 1300 int new_cpu, weight;
1301 1301
1302 if (!(sd->flags & flag)) { 1302 if (!(sd->flags & flag)) {
1303 sd = sd->child; 1303 sd = sd->child;
1304 continue; 1304 continue;
1305 } 1305 }
1306 1306
1307 span = sd->span; 1307 span = sd->span;
1308 group = find_idlest_group(sd, t, cpu); 1308 group = find_idlest_group(sd, t, cpu);
1309 if (!group) { 1309 if (!group) {
1310 sd = sd->child; 1310 sd = sd->child;
1311 continue; 1311 continue;
1312 } 1312 }
1313 1313
1314 new_cpu = find_idlest_cpu(group, t, cpu); 1314 new_cpu = find_idlest_cpu(group, t, cpu);
1315 if (new_cpu == -1 || new_cpu == cpu) { 1315 if (new_cpu == -1 || new_cpu == cpu) {
1316 /* Now try balancing at a lower domain level of cpu */ 1316 /* Now try balancing at a lower domain level of cpu */
1317 sd = sd->child; 1317 sd = sd->child;
1318 continue; 1318 continue;
1319 } 1319 }
1320 1320
1321 /* Now try balancing at a lower domain level of new_cpu */ 1321 /* Now try balancing at a lower domain level of new_cpu */
1322 cpu = new_cpu; 1322 cpu = new_cpu;
1323 sd = NULL; 1323 sd = NULL;
1324 weight = cpus_weight(span); 1324 weight = cpus_weight(span);
1325 for_each_domain(cpu, tmp) { 1325 for_each_domain(cpu, tmp) {
1326 if (weight <= cpus_weight(tmp->span)) 1326 if (weight <= cpus_weight(tmp->span))
1327 break; 1327 break;
1328 if (tmp->flags & flag) 1328 if (tmp->flags & flag)
1329 sd = tmp; 1329 sd = tmp;
1330 } 1330 }
1331 /* while loop will break here if sd == NULL */ 1331 /* while loop will break here if sd == NULL */
1332 } 1332 }
1333 1333
1334 return cpu; 1334 return cpu;
1335 } 1335 }
1336 1336
1337 #endif /* CONFIG_SMP */ 1337 #endif /* CONFIG_SMP */
1338 1338
1339 /* 1339 /*
1340 * wake_idle() will wake a task on an idle cpu if task->cpu is 1340 * wake_idle() will wake a task on an idle cpu if task->cpu is
1341 * not idle and an idle cpu is available. The span of cpus to 1341 * not idle and an idle cpu is available. The span of cpus to
1342 * search starts with cpus closest then further out as needed, 1342 * search starts with cpus closest then further out as needed,
1343 * so we always favor a closer, idle cpu. 1343 * so we always favor a closer, idle cpu.
1344 * 1344 *
1345 * Returns the CPU we should wake onto. 1345 * Returns the CPU we should wake onto.
1346 */ 1346 */
1347 #if defined(ARCH_HAS_SCHED_WAKE_IDLE) 1347 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1348 static int wake_idle(int cpu, struct task_struct *p) 1348 static int wake_idle(int cpu, struct task_struct *p)
1349 { 1349 {
1350 cpumask_t tmp; 1350 cpumask_t tmp;
1351 struct sched_domain *sd; 1351 struct sched_domain *sd;
1352 int i; 1352 int i;
1353 1353
1354 /* 1354 /*
1355 * If it is idle, then it is the best cpu to run this task. 1355 * If it is idle, then it is the best cpu to run this task.
1356 * 1356 *
1357 * This cpu is also the best, if it has more than one task already. 1357 * This cpu is also the best, if it has more than one task already.
1358 * Siblings must also be busy (in most cases) as they didn't already 1358 * Siblings must also be busy (in most cases) as they didn't already
1359 * pick up the extra load from this cpu and hence we need not check 1359 * pick up the extra load from this cpu and hence we need not check
1360 * sibling runqueue info. This will avoid the checks and cache miss 1360 * sibling runqueue info. This will avoid the checks and cache miss
1361 * penalties associated with that. 1361 * penalties associated with that.
1362 */ 1362 */
1363 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) 1363 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1364 return cpu; 1364 return cpu;
1365 1365
1366 for_each_domain(cpu, sd) { 1366 for_each_domain(cpu, sd) {
1367 if (sd->flags & SD_WAKE_IDLE) { 1367 if (sd->flags & SD_WAKE_IDLE) {
1368 cpus_and(tmp, sd->span, p->cpus_allowed); 1368 cpus_and(tmp, sd->span, p->cpus_allowed);
1369 for_each_cpu_mask(i, tmp) { 1369 for_each_cpu_mask(i, tmp) {
1370 if (idle_cpu(i)) 1370 if (idle_cpu(i))
1371 return i; 1371 return i;
1372 } 1372 }
1373 } else { 1373 } else {
1374 break; 1374 break;
1375 } 1375 }
1376 } 1376 }
1377 return cpu; 1377 return cpu;
1378 } 1378 }
1379 #else 1379 #else
1380 static inline int wake_idle(int cpu, struct task_struct *p) 1380 static inline int wake_idle(int cpu, struct task_struct *p)
1381 { 1381 {
1382 return cpu; 1382 return cpu;
1383 } 1383 }
1384 #endif 1384 #endif
1385 1385
1386 /*** 1386 /***
1387 * try_to_wake_up - wake up a thread 1387 * try_to_wake_up - wake up a thread
1388 * @p: the to-be-woken-up thread 1388 * @p: the to-be-woken-up thread
1389 * @state: the mask of task states that can be woken 1389 * @state: the mask of task states that can be woken
1390 * @sync: do a synchronous wakeup? 1390 * @sync: do a synchronous wakeup?
1391 * 1391 *
1392 * Put it on the run-queue if it's not already there. The "current" 1392 * Put it on the run-queue if it's not already there. The "current"
1393 * thread is always on the run-queue (except when the actual 1393 * thread is always on the run-queue (except when the actual
1394 * re-schedule is in progress), and as such you're allowed to do 1394 * re-schedule is in progress), and as such you're allowed to do
1395 * the simpler "current->state = TASK_RUNNING" to mark yourself 1395 * the simpler "current->state = TASK_RUNNING" to mark yourself
1396 * runnable without the overhead of this. 1396 * runnable without the overhead of this.
1397 * 1397 *
1398 * returns failure only if the task is already active. 1398 * returns failure only if the task is already active.
1399 */ 1399 */
1400 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) 1400 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1401 { 1401 {
1402 int cpu, this_cpu, success = 0; 1402 int cpu, this_cpu, success = 0;
1403 unsigned long flags; 1403 unsigned long flags;
1404 long old_state; 1404 long old_state;
1405 struct rq *rq; 1405 struct rq *rq;
1406 #ifdef CONFIG_SMP 1406 #ifdef CONFIG_SMP
1407 struct sched_domain *sd, *this_sd = NULL; 1407 struct sched_domain *sd, *this_sd = NULL;
1408 unsigned long load, this_load; 1408 unsigned long load, this_load;
1409 int new_cpu; 1409 int new_cpu;
1410 #endif 1410 #endif
1411 1411
1412 rq = task_rq_lock(p, &flags); 1412 rq = task_rq_lock(p, &flags);
1413 old_state = p->state; 1413 old_state = p->state;
1414 if (!(old_state & state)) 1414 if (!(old_state & state))
1415 goto out; 1415 goto out;
1416 1416
1417 if (p->se.on_rq) 1417 if (p->se.on_rq)
1418 goto out_running; 1418 goto out_running;
1419 1419
1420 cpu = task_cpu(p); 1420 cpu = task_cpu(p);
1421 this_cpu = smp_processor_id(); 1421 this_cpu = smp_processor_id();
1422 1422
1423 #ifdef CONFIG_SMP 1423 #ifdef CONFIG_SMP
1424 if (unlikely(task_running(rq, p))) 1424 if (unlikely(task_running(rq, p)))
1425 goto out_activate; 1425 goto out_activate;
1426 1426
1427 new_cpu = cpu; 1427 new_cpu = cpu;
1428 1428
1429 schedstat_inc(rq, ttwu_cnt); 1429 schedstat_inc(rq, ttwu_cnt);
1430 if (cpu == this_cpu) { 1430 if (cpu == this_cpu) {
1431 schedstat_inc(rq, ttwu_local); 1431 schedstat_inc(rq, ttwu_local);
1432 goto out_set_cpu; 1432 goto out_set_cpu;
1433 } 1433 }
1434 1434
1435 for_each_domain(this_cpu, sd) { 1435 for_each_domain(this_cpu, sd) {
1436 if (cpu_isset(cpu, sd->span)) { 1436 if (cpu_isset(cpu, sd->span)) {
1437 schedstat_inc(sd, ttwu_wake_remote); 1437 schedstat_inc(sd, ttwu_wake_remote);
1438 this_sd = sd; 1438 this_sd = sd;
1439 break; 1439 break;
1440 } 1440 }
1441 } 1441 }
1442 1442
1443 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) 1443 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1444 goto out_set_cpu; 1444 goto out_set_cpu;
1445 1445
1446 /* 1446 /*
1447 * Check for affine wakeup and passive balancing possibilities. 1447 * Check for affine wakeup and passive balancing possibilities.
1448 */ 1448 */
1449 if (this_sd) { 1449 if (this_sd) {
1450 int idx = this_sd->wake_idx; 1450 int idx = this_sd->wake_idx;
1451 unsigned int imbalance; 1451 unsigned int imbalance;
1452 1452
1453 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; 1453 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1454 1454
1455 load = source_load(cpu, idx); 1455 load = source_load(cpu, idx);
1456 this_load = target_load(this_cpu, idx); 1456 this_load = target_load(this_cpu, idx);
1457 1457
1458 new_cpu = this_cpu; /* Wake to this CPU if we can */ 1458 new_cpu = this_cpu; /* Wake to this CPU if we can */
1459 1459
1460 if (this_sd->flags & SD_WAKE_AFFINE) { 1460 if (this_sd->flags & SD_WAKE_AFFINE) {
1461 unsigned long tl = this_load; 1461 unsigned long tl = this_load;
1462 unsigned long tl_per_task; 1462 unsigned long tl_per_task;
1463 1463
1464 tl_per_task = cpu_avg_load_per_task(this_cpu); 1464 tl_per_task = cpu_avg_load_per_task(this_cpu);
1465 1465
1466 /* 1466 /*
1467 * If sync wakeup then subtract the (maximum possible) 1467 * If sync wakeup then subtract the (maximum possible)
1468 * effect of the currently running task from the load 1468 * effect of the currently running task from the load
1469 * of the current CPU: 1469 * of the current CPU:
1470 */ 1470 */
1471 if (sync) 1471 if (sync)
1472 tl -= current->se.load.weight; 1472 tl -= current->se.load.weight;
1473 1473
1474 if ((tl <= load && 1474 if ((tl <= load &&
1475 tl + target_load(cpu, idx) <= tl_per_task) || 1475 tl + target_load(cpu, idx) <= tl_per_task) ||
1476 100*(tl + p->se.load.weight) <= imbalance*load) { 1476 100*(tl + p->se.load.weight) <= imbalance*load) {
1477 /* 1477 /*
1478 * This domain has SD_WAKE_AFFINE and 1478 * This domain has SD_WAKE_AFFINE and
1479 * p is cache cold in this domain, and 1479 * p is cache cold in this domain, and
1480 * there is no bad imbalance. 1480 * there is no bad imbalance.
1481 */ 1481 */
1482 schedstat_inc(this_sd, ttwu_move_affine); 1482 schedstat_inc(this_sd, ttwu_move_affine);
1483 goto out_set_cpu; 1483 goto out_set_cpu;
1484 } 1484 }
1485 } 1485 }
1486 1486
1487 /* 1487 /*
1488 * Start passive balancing when half the imbalance_pct 1488 * Start passive balancing when half the imbalance_pct
1489 * limit is reached. 1489 * limit is reached.
1490 */ 1490 */
1491 if (this_sd->flags & SD_WAKE_BALANCE) { 1491 if (this_sd->flags & SD_WAKE_BALANCE) {
1492 if (imbalance*this_load <= 100*load) { 1492 if (imbalance*this_load <= 100*load) {
1493 schedstat_inc(this_sd, ttwu_move_balance); 1493 schedstat_inc(this_sd, ttwu_move_balance);
1494 goto out_set_cpu; 1494 goto out_set_cpu;
1495 } 1495 }
1496 } 1496 }
1497 } 1497 }
1498 1498
1499 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ 1499 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1500 out_set_cpu: 1500 out_set_cpu:
1501 new_cpu = wake_idle(new_cpu, p); 1501 new_cpu = wake_idle(new_cpu, p);
1502 if (new_cpu != cpu) { 1502 if (new_cpu != cpu) {
1503 set_task_cpu(p, new_cpu); 1503 set_task_cpu(p, new_cpu);
1504 task_rq_unlock(rq, &flags); 1504 task_rq_unlock(rq, &flags);
1505 /* might preempt at this point */ 1505 /* might preempt at this point */
1506 rq = task_rq_lock(p, &flags); 1506 rq = task_rq_lock(p, &flags);
1507 old_state = p->state; 1507 old_state = p->state;
1508 if (!(old_state & state)) 1508 if (!(old_state & state))
1509 goto out; 1509 goto out;
1510 if (p->se.on_rq) 1510 if (p->se.on_rq)
1511 goto out_running; 1511 goto out_running;
1512 1512
1513 this_cpu = smp_processor_id(); 1513 this_cpu = smp_processor_id();
1514 cpu = task_cpu(p); 1514 cpu = task_cpu(p);
1515 } 1515 }
1516 1516
1517 out_activate: 1517 out_activate:
1518 #endif /* CONFIG_SMP */ 1518 #endif /* CONFIG_SMP */
1519 activate_task(rq, p, 1); 1519 activate_task(rq, p, 1);
1520 /* 1520 /*
1521 * Sync wakeups (i.e. those types of wakeups where the waker 1521 * Sync wakeups (i.e. those types of wakeups where the waker
1522 * has indicated that it will leave the CPU in short order) 1522 * has indicated that it will leave the CPU in short order)
1523 * don't trigger a preemption, if the woken up task will run on 1523 * don't trigger a preemption, if the woken up task will run on
1524 * this cpu. (in this case the 'I will reschedule' promise of 1524 * this cpu. (in this case the 'I will reschedule' promise of
1525 * the waker guarantees that the freshly woken up task is going 1525 * the waker guarantees that the freshly woken up task is going
1526 * to be considered on this CPU.) 1526 * to be considered on this CPU.)
1527 */ 1527 */
1528 if (!sync || cpu != this_cpu) 1528 if (!sync || cpu != this_cpu)
1529 check_preempt_curr(rq, p); 1529 check_preempt_curr(rq, p);
1530 success = 1; 1530 success = 1;
1531 1531
1532 out_running: 1532 out_running:
1533 p->state = TASK_RUNNING; 1533 p->state = TASK_RUNNING;
1534 out: 1534 out:
1535 task_rq_unlock(rq, &flags); 1535 task_rq_unlock(rq, &flags);
1536 1536
1537 return success; 1537 return success;
1538 } 1538 }
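
For the SD_WAKE_AFFINE branch above, the second of the two sufficient conditions is
100 * (tl + p->se.load.weight) <= imbalance * load: pull the wakeup to this CPU only if
doing so stays within the imbalance_pct limit. A worked example with hypothetical loads
(imbalance = 112, a nice-0 task of weight 1024):

    tl (this CPU)         = 1024
    load (task's old CPU) = 2048
    100 * (1024 + 1024) = 204800  <=  112 * 2048 = 229376  -> wake affine to this CPU
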
1539 1539
1540 int fastcall wake_up_process(struct task_struct *p) 1540 int fastcall wake_up_process(struct task_struct *p)
1541 { 1541 {
1542 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | 1542 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1543 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); 1543 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1544 } 1544 }
1545 EXPORT_SYMBOL(wake_up_process); 1545 EXPORT_SYMBOL(wake_up_process);
1546 1546
1547 int fastcall wake_up_state(struct task_struct *p, unsigned int state) 1547 int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1548 { 1548 {
1549 return try_to_wake_up(p, state, 0); 1549 return try_to_wake_up(p, state, 0);
1550 } 1550 }
1551 1551
1552 /* 1552 /*
1553 * Perform scheduler related setup for a newly forked process p. 1553 * Perform scheduler related setup for a newly forked process p.
1554 * p is forked by current. 1554 * p is forked by current.
1555 * 1555 *
1556 * __sched_fork() is basic setup used by init_idle() too: 1556 * __sched_fork() is basic setup used by init_idle() too:
1557 */ 1557 */
1558 static void __sched_fork(struct task_struct *p) 1558 static void __sched_fork(struct task_struct *p)
1559 { 1559 {
1560 p->se.wait_start_fair = 0; 1560 p->se.wait_start_fair = 0;
1561 p->se.exec_start = 0; 1561 p->se.exec_start = 0;
1562 p->se.sum_exec_runtime = 0; 1562 p->se.sum_exec_runtime = 0;
1563 p->se.delta_exec = 0; 1563 p->se.delta_exec = 0;
1564 p->se.delta_fair_run = 0; 1564 p->se.delta_fair_run = 0;
1565 p->se.delta_fair_sleep = 0; 1565 p->se.delta_fair_sleep = 0;
1566 p->se.wait_runtime = 0; 1566 p->se.wait_runtime = 0;
1567 p->se.sleep_start_fair = 0; 1567 p->se.sleep_start_fair = 0;
1568 1568
1569 #ifdef CONFIG_SCHEDSTATS 1569 #ifdef CONFIG_SCHEDSTATS
1570 p->se.wait_start = 0; 1570 p->se.wait_start = 0;
1571 p->se.sum_wait_runtime = 0; 1571 p->se.sum_wait_runtime = 0;
1572 p->se.sum_sleep_runtime = 0; 1572 p->se.sum_sleep_runtime = 0;
1573 p->se.sleep_start = 0; 1573 p->se.sleep_start = 0;
1574 p->se.block_start = 0; 1574 p->se.block_start = 0;
1575 p->se.sleep_max = 0; 1575 p->se.sleep_max = 0;
1576 p->se.block_max = 0; 1576 p->se.block_max = 0;
1577 p->se.exec_max = 0; 1577 p->se.exec_max = 0;
1578 p->se.wait_max = 0; 1578 p->se.wait_max = 0;
1579 p->se.wait_runtime_overruns = 0; 1579 p->se.wait_runtime_overruns = 0;
1580 p->se.wait_runtime_underruns = 0; 1580 p->se.wait_runtime_underruns = 0;
1581 #endif 1581 #endif
1582 1582
1583 INIT_LIST_HEAD(&p->run_list); 1583 INIT_LIST_HEAD(&p->run_list);
1584 p->se.on_rq = 0; 1584 p->se.on_rq = 0;
1585 1585
1586 #ifdef CONFIG_PREEMPT_NOTIFIERS 1586 #ifdef CONFIG_PREEMPT_NOTIFIERS
1587 INIT_HLIST_HEAD(&p->preempt_notifiers); 1587 INIT_HLIST_HEAD(&p->preempt_notifiers);
1588 #endif 1588 #endif
1589 1589
1590 /* 1590 /*
1591 * We mark the process as running here, but have not actually 1591 * We mark the process as running here, but have not actually
1592 * inserted it onto the runqueue yet. This guarantees that 1592 * inserted it onto the runqueue yet. This guarantees that
1593 * nobody will actually run it, and a signal or other external 1593 * nobody will actually run it, and a signal or other external
1594 * event cannot wake it up and insert it on the runqueue either. 1594 * event cannot wake it up and insert it on the runqueue either.
1595 */ 1595 */
1596 p->state = TASK_RUNNING; 1596 p->state = TASK_RUNNING;
1597 } 1597 }
1598 1598
1599 /* 1599 /*
1600 * fork()/clone()-time setup: 1600 * fork()/clone()-time setup:
1601 */ 1601 */
1602 void sched_fork(struct task_struct *p, int clone_flags) 1602 void sched_fork(struct task_struct *p, int clone_flags)
1603 { 1603 {
1604 int cpu = get_cpu(); 1604 int cpu = get_cpu();
1605 1605
1606 __sched_fork(p); 1606 __sched_fork(p);
1607 1607
1608 #ifdef CONFIG_SMP 1608 #ifdef CONFIG_SMP
1609 cpu = sched_balance_self(cpu, SD_BALANCE_FORK); 1609 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1610 #endif 1610 #endif
1611 __set_task_cpu(p, cpu); 1611 __set_task_cpu(p, cpu);
1612 1612
1613 /* 1613 /*
1614 * Make sure we do not leak PI boosting priority to the child: 1614 * Make sure we do not leak PI boosting priority to the child:
1615 */ 1615 */
1616 p->prio = current->normal_prio; 1616 p->prio = current->normal_prio;
1617 1617
1618 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1618 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1619 if (likely(sched_info_on())) 1619 if (likely(sched_info_on()))
1620 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1620 memset(&p->sched_info, 0, sizeof(p->sched_info));
1621 #endif 1621 #endif
1622 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 1622 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1623 p->oncpu = 0; 1623 p->oncpu = 0;
1624 #endif 1624 #endif
1625 #ifdef CONFIG_PREEMPT 1625 #ifdef CONFIG_PREEMPT
1626 /* Want to start with kernel preemption disabled. */ 1626 /* Want to start with kernel preemption disabled. */
1627 task_thread_info(p)->preempt_count = 1; 1627 task_thread_info(p)->preempt_count = 1;
1628 #endif 1628 #endif
1629 put_cpu(); 1629 put_cpu();
1630 } 1630 }
1631 1631
1632 /* 1632 /*
1633 * After fork, the child runs first (default). If set to 0 then 1633 * After fork, the child runs first (default). If set to 0 then
1634 * the parent will (try to) run first. 1634 * the parent will (try to) run first.
1635 */ 1635 */
1636 unsigned int __read_mostly sysctl_sched_child_runs_first = 1; 1636 unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1637 1637
1638 /* 1638 /*
1639 * wake_up_new_task - wake up a newly created task for the first time. 1639 * wake_up_new_task - wake up a newly created task for the first time.
1640 * 1640 *
1641 * This function will do some initial scheduler statistics housekeeping 1641 * This function will do some initial scheduler statistics housekeeping
1642 * that must be done for every newly created context, then puts the task 1642 * that must be done for every newly created context, then puts the task
1643 * on the runqueue and wakes it. 1643 * on the runqueue and wakes it.
1644 */ 1644 */
1645 void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 1645 void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1646 { 1646 {
1647 unsigned long flags; 1647 unsigned long flags;
1648 struct rq *rq; 1648 struct rq *rq;
1649 int this_cpu; 1649 int this_cpu;
1650 u64 now; 1650 u64 now;
1651 1651
1652 rq = task_rq_lock(p, &flags); 1652 rq = task_rq_lock(p, &flags);
1653 BUG_ON(p->state != TASK_RUNNING); 1653 BUG_ON(p->state != TASK_RUNNING);
1654 this_cpu = smp_processor_id(); /* parent's CPU */ 1654 this_cpu = smp_processor_id(); /* parent's CPU */
1655 now = rq_clock(rq); 1655 now = rq_clock(rq);
1656 1656
1657 p->prio = effective_prio(p); 1657 p->prio = effective_prio(p);
1658 1658
1659 if (!p->sched_class->task_new || !sysctl_sched_child_runs_first || 1659 if (!p->sched_class->task_new || !sysctl_sched_child_runs_first ||
1660 (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu || 1660 (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu ||
1661 !current->se.on_rq) { 1661 !current->se.on_rq) {
1662 1662
1663 activate_task(rq, p, 0); 1663 activate_task(rq, p, 0);
1664 } else { 1664 } else {
1665 /* 1665 /*
1666 * Let the scheduling class do new task startup 1666 * Let the scheduling class do new task startup
1667 * management (if any): 1667 * management (if any):
1668 */ 1668 */
1669 p->sched_class->task_new(rq, p, now); 1669 p->sched_class->task_new(rq, p, now);
1670 inc_nr_running(p, rq, now); 1670 inc_nr_running(p, rq, now);
1671 } 1671 }
1672 check_preempt_curr(rq, p); 1672 check_preempt_curr(rq, p);
1673 task_rq_unlock(rq, &flags); 1673 task_rq_unlock(rq, &flags);
1674 } 1674 }
1675 1675
1676 #ifdef CONFIG_PREEMPT_NOTIFIERS 1676 #ifdef CONFIG_PREEMPT_NOTIFIERS
1677 1677
1678 /** 1678 /**
1679 * preempt_notifier_register - tell me when current is being preempted & rescheduled 1679 * preempt_notifier_register - tell me when current is being preempted & rescheduled
1680 * @notifier: notifier struct to register 1680 * @notifier: notifier struct to register
1681 */ 1681 */
1682 void preempt_notifier_register(struct preempt_notifier *notifier) 1682 void preempt_notifier_register(struct preempt_notifier *notifier)
1683 { 1683 {
1684 hlist_add_head(&notifier->link, &current->preempt_notifiers); 1684 hlist_add_head(&notifier->link, &current->preempt_notifiers);
1685 } 1685 }
1686 EXPORT_SYMBOL_GPL(preempt_notifier_register); 1686 EXPORT_SYMBOL_GPL(preempt_notifier_register);
1687 1687
1688 /** 1688 /**
1689 * preempt_notifier_unregister - no longer interested in preemption notifications 1689 * preempt_notifier_unregister - no longer interested in preemption notifications
1690 * @notifier: notifier struct to unregister 1690 * @notifier: notifier struct to unregister
1691 * 1691 *
1692 * This is safe to call from within a preemption notifier. 1692 * This is safe to call from within a preemption notifier.
1693 */ 1693 */
1694 void preempt_notifier_unregister(struct preempt_notifier *notifier) 1694 void preempt_notifier_unregister(struct preempt_notifier *notifier)
1695 { 1695 {
1696 hlist_del(&notifier->link); 1696 hlist_del(&notifier->link);
1697 } 1697 }
1698 EXPORT_SYMBOL_GPL(preempt_notifier_unregister); 1698 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
1699 1699
1700 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 1700 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1701 { 1701 {
1702 struct preempt_notifier *notifier; 1702 struct preempt_notifier *notifier;
1703 struct hlist_node *node; 1703 struct hlist_node *node;
1704 1704
1705 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 1705 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1706 notifier->ops->sched_in(notifier, raw_smp_processor_id()); 1706 notifier->ops->sched_in(notifier, raw_smp_processor_id());
1707 } 1707 }
1708 1708
1709 static void 1709 static void
1710 fire_sched_out_preempt_notifiers(struct task_struct *curr, 1710 fire_sched_out_preempt_notifiers(struct task_struct *curr,
1711 struct task_struct *next) 1711 struct task_struct *next)
1712 { 1712 {
1713 struct preempt_notifier *notifier; 1713 struct preempt_notifier *notifier;
1714 struct hlist_node *node; 1714 struct hlist_node *node;
1715 1715
1716 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) 1716 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
1717 notifier->ops->sched_out(notifier, next); 1717 notifier->ops->sched_out(notifier, next);
1718 } 1718 }
1719 1719
1720 #else 1720 #else
1721 1721
1722 static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 1722 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
1723 { 1723 {
1724 } 1724 }
1725 1725
1726 static void 1726 static void
1727 fire_sched_out_preempt_notifiers(struct task_struct *curr, 1727 fire_sched_out_preempt_notifiers(struct task_struct *curr,
1728 struct task_struct *next) 1728 struct task_struct *next)
1729 { 1729 {
1730 } 1730 }
1731 1731
1732 #endif 1732 #endif
1733 1733
1734 /** 1734 /**
1735 * prepare_task_switch - prepare to switch tasks 1735 * prepare_task_switch - prepare to switch tasks
1736 * @rq: the runqueue preparing to switch 1736 * @rq: the runqueue preparing to switch
1737 * @prev: the current task that is being switched out 1737 * @prev: the current task that is being switched out
1738 * @next: the task we are going to switch to. 1738 * @next: the task we are going to switch to.
1739 * 1739 *
1740 * This is called with the rq lock held and interrupts off. It must 1740 * This is called with the rq lock held and interrupts off. It must
1741 * be paired with a subsequent finish_task_switch after the context 1741 * be paired with a subsequent finish_task_switch after the context
1742 * switch. 1742 * switch.
1743 * 1743 *
1744 * prepare_task_switch sets up locking and calls architecture specific 1744 * prepare_task_switch sets up locking and calls architecture specific
1745 * hooks. 1745 * hooks.
1746 */ 1746 */
1747 static inline void 1747 static inline void
1748 prepare_task_switch(struct rq *rq, struct task_struct *prev, 1748 prepare_task_switch(struct rq *rq, struct task_struct *prev,
1749 struct task_struct *next) 1749 struct task_struct *next)
1750 { 1750 {
1751 fire_sched_out_preempt_notifiers(prev, next); 1751 fire_sched_out_preempt_notifiers(prev, next);
1752 prepare_lock_switch(rq, next); 1752 prepare_lock_switch(rq, next);
1753 prepare_arch_switch(next); 1753 prepare_arch_switch(next);
1754 } 1754 }
1755 1755
1756 /** 1756 /**
1757 * finish_task_switch - clean up after a task-switch 1757 * finish_task_switch - clean up after a task-switch
1758 * @rq: runqueue associated with task-switch 1758 * @rq: runqueue associated with task-switch
1759 * @prev: the thread we just switched away from. 1759 * @prev: the thread we just switched away from.
1760 * 1760 *
1761 * finish_task_switch must be called after the context switch, paired 1761 * finish_task_switch must be called after the context switch, paired
1762 * with a prepare_task_switch call before the context switch. 1762 * with a prepare_task_switch call before the context switch.
1763 * finish_task_switch will reconcile locking set up by prepare_task_switch, 1763 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1764 * and do any other architecture-specific cleanup actions. 1764 * and do any other architecture-specific cleanup actions.
1765 * 1765 *
1766 * Note that we may have delayed dropping an mm in context_switch(). If 1766 * Note that we may have delayed dropping an mm in context_switch(). If
1767 * so, we finish that here outside of the runqueue lock. (Doing it 1767 * so, we finish that here outside of the runqueue lock. (Doing it
1768 * with the lock held can cause deadlocks; see schedule() for 1768 * with the lock held can cause deadlocks; see schedule() for
1769 * details.) 1769 * details.)
1770 */ 1770 */
1771 static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) 1771 static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1772 __releases(rq->lock) 1772 __releases(rq->lock)
1773 { 1773 {
1774 struct mm_struct *mm = rq->prev_mm; 1774 struct mm_struct *mm = rq->prev_mm;
1775 long prev_state; 1775 long prev_state;
1776 1776
1777 rq->prev_mm = NULL; 1777 rq->prev_mm = NULL;
1778 1778
1779 /* 1779 /*
1780 * A task struct has one reference for its use as "current". 1780 * A task struct has one reference for its use as "current".
1781 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 1781 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1782 * schedule one last time. The schedule call will never return, and 1782 * schedule one last time. The schedule call will never return, and
1783 * the scheduled task must drop that reference. 1783 * the scheduled task must drop that reference.
1784 * The test for TASK_DEAD must occur while the runqueue locks are 1784 * The test for TASK_DEAD must occur while the runqueue locks are
1785 * still held, otherwise prev could be scheduled on another cpu, die 1785 * still held, otherwise prev could be scheduled on another cpu, die
1786 * there before we look at prev->state, and then the reference would 1786 * there before we look at prev->state, and then the reference would
1787 * be dropped twice. 1787 * be dropped twice.
1788 * Manfred Spraul <manfred@colorfullife.com> 1788 * Manfred Spraul <manfred@colorfullife.com>
1789 */ 1789 */
1790 prev_state = prev->state; 1790 prev_state = prev->state;
1791 finish_arch_switch(prev); 1791 finish_arch_switch(prev);
1792 finish_lock_switch(rq, prev); 1792 finish_lock_switch(rq, prev);
1793 fire_sched_in_preempt_notifiers(current); 1793 fire_sched_in_preempt_notifiers(current);
1794 if (mm) 1794 if (mm)
1795 mmdrop(mm); 1795 mmdrop(mm);
1796 if (unlikely(prev_state == TASK_DEAD)) { 1796 if (unlikely(prev_state == TASK_DEAD)) {
1797 /* 1797 /*
1798 * Remove function-return probe instances associated with this 1798 * Remove function-return probe instances associated with this
1799 * task and put them back on the free list. 1799 * task and put them back on the free list.
1800 */ 1800 */
1801 kprobe_flush_task(prev); 1801 kprobe_flush_task(prev);
1802 put_task_struct(prev); 1802 put_task_struct(prev);
1803 } 1803 }
1804 } 1804 }
1805 1805
1806 /** 1806 /**
1807 * schedule_tail - first thing a freshly forked thread must call. 1807 * schedule_tail - first thing a freshly forked thread must call.
1808 * @prev: the thread we just switched away from. 1808 * @prev: the thread we just switched away from.
1809 */ 1809 */
1810 asmlinkage void schedule_tail(struct task_struct *prev) 1810 asmlinkage void schedule_tail(struct task_struct *prev)
1811 __releases(rq->lock) 1811 __releases(rq->lock)
1812 { 1812 {
1813 struct rq *rq = this_rq(); 1813 struct rq *rq = this_rq();
1814 1814
1815 finish_task_switch(rq, prev); 1815 finish_task_switch(rq, prev);
1816 #ifdef __ARCH_WANT_UNLOCKED_CTXSW 1816 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
1817 /* In this case, finish_task_switch does not reenable preemption */ 1817 /* In this case, finish_task_switch does not reenable preemption */
1818 preempt_enable(); 1818 preempt_enable();
1819 #endif 1819 #endif
1820 if (current->set_child_tid) 1820 if (current->set_child_tid)
1821 put_user(current->pid, current->set_child_tid); 1821 put_user(current->pid, current->set_child_tid);
1822 } 1822 }
1823 1823
1824 /* 1824 /*
1825 * context_switch - switch to the new MM and the new 1825 * context_switch - switch to the new MM and the new
1826 * thread's register state. 1826 * thread's register state.
1827 */ 1827 */
1828 static inline void 1828 static inline void
1829 context_switch(struct rq *rq, struct task_struct *prev, 1829 context_switch(struct rq *rq, struct task_struct *prev,
1830 struct task_struct *next) 1830 struct task_struct *next)
1831 { 1831 {
1832 struct mm_struct *mm, *oldmm; 1832 struct mm_struct *mm, *oldmm;
1833 1833
1834 prepare_task_switch(rq, prev, next); 1834 prepare_task_switch(rq, prev, next);
1835 mm = next->mm; 1835 mm = next->mm;
1836 oldmm = prev->active_mm; 1836 oldmm = prev->active_mm;
1837 /* 1837 /*
1838 * For paravirt, this is coupled with an exit in switch_to to 1838 * For paravirt, this is coupled with an exit in switch_to to
1839 * combine the page table reload and the switch backend into 1839 * combine the page table reload and the switch backend into
1840 * one hypercall. 1840 * one hypercall.
1841 */ 1841 */
1842 arch_enter_lazy_cpu_mode(); 1842 arch_enter_lazy_cpu_mode();
1843 1843
1844 if (unlikely(!mm)) { 1844 if (unlikely(!mm)) {
1845 next->active_mm = oldmm; 1845 next->active_mm = oldmm;
1846 atomic_inc(&oldmm->mm_count); 1846 atomic_inc(&oldmm->mm_count);
1847 enter_lazy_tlb(oldmm, next); 1847 enter_lazy_tlb(oldmm, next);
1848 } else 1848 } else
1849 switch_mm(oldmm, mm, next); 1849 switch_mm(oldmm, mm, next);
1850 1850
1851 if (unlikely(!prev->mm)) { 1851 if (unlikely(!prev->mm)) {
1852 prev->active_mm = NULL; 1852 prev->active_mm = NULL;
1853 rq->prev_mm = oldmm; 1853 rq->prev_mm = oldmm;
1854 } 1854 }
1855 /* 1855 /*
1856 * The runqueue lock will be released by the next 1856 * The runqueue lock will be released by the next
1857 * task (which is an invalid locking op but in the case 1857 * task (which is an invalid locking op but in the case
1858 * of the scheduler it's an obvious special-case), so we 1858 * of the scheduler it's an obvious special-case), so we
1859 * do an early lockdep release here: 1859 * do an early lockdep release here:
1860 */ 1860 */
1861 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 1861 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
1862 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 1862 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1863 #endif 1863 #endif
1864 1864
1865 /* Here we just switch the register state and the stack. */ 1865 /* Here we just switch the register state and the stack. */
1866 switch_to(prev, next, prev); 1866 switch_to(prev, next, prev);
1867 1867
1868 barrier(); 1868 barrier();
1869 /* 1869 /*
1870 * this_rq must be evaluated again because prev may have moved 1870 * this_rq must be evaluated again because prev may have moved
1871 * CPUs since it called schedule(), thus the 'rq' on its stack 1871 * CPUs since it called schedule(), thus the 'rq' on its stack
1872 * frame will be invalid. 1872 * frame will be invalid.
1873 */ 1873 */
1874 finish_task_switch(this_rq(), prev); 1874 finish_task_switch(this_rq(), prev);
1875 } 1875 }
1876 1876
1877 /* 1877 /*
1878 * nr_running, nr_uninterruptible and nr_context_switches: 1878 * nr_running, nr_uninterruptible and nr_context_switches:
1879 * 1879 *
1880 * externally visible scheduler statistics: current number of runnable 1880 * externally visible scheduler statistics: current number of runnable
1881 * threads, current number of uninterruptible-sleeping threads, total 1881 * threads, current number of uninterruptible-sleeping threads, total
1882 * number of context switches performed since bootup. 1882 * number of context switches performed since bootup.
1883 */ 1883 */
1884 unsigned long nr_running(void) 1884 unsigned long nr_running(void)
1885 { 1885 {
1886 unsigned long i, sum = 0; 1886 unsigned long i, sum = 0;
1887 1887
1888 for_each_online_cpu(i) 1888 for_each_online_cpu(i)
1889 sum += cpu_rq(i)->nr_running; 1889 sum += cpu_rq(i)->nr_running;
1890 1890
1891 return sum; 1891 return sum;
1892 } 1892 }
1893 1893
1894 unsigned long nr_uninterruptible(void) 1894 unsigned long nr_uninterruptible(void)
1895 { 1895 {
1896 unsigned long i, sum = 0; 1896 unsigned long i, sum = 0;
1897 1897
1898 for_each_possible_cpu(i) 1898 for_each_possible_cpu(i)
1899 sum += cpu_rq(i)->nr_uninterruptible; 1899 sum += cpu_rq(i)->nr_uninterruptible;
1900 1900
1901 /* 1901 /*
1902 * Since we read the counters lockless, it might be slightly 1902 * Since we read the counters lockless, it might be slightly
1903 * inaccurate. Do not allow it to go below zero though: 1903 * inaccurate. Do not allow it to go below zero though:
1904 */ 1904 */
1905 if (unlikely((long)sum < 0)) 1905 if (unlikely((long)sum < 0))
1906 sum = 0; 1906 sum = 0;
1907 1907
1908 return sum; 1908 return sum;
1909 } 1909 }
1910 1910
1911 unsigned long long nr_context_switches(void) 1911 unsigned long long nr_context_switches(void)
1912 { 1912 {
1913 int i; 1913 int i;
1914 unsigned long long sum = 0; 1914 unsigned long long sum = 0;
1915 1915
1916 for_each_possible_cpu(i) 1916 for_each_possible_cpu(i)
1917 sum += cpu_rq(i)->nr_switches; 1917 sum += cpu_rq(i)->nr_switches;
1918 1918
1919 return sum; 1919 return sum;
1920 } 1920 }
1921 1921
1922 unsigned long nr_iowait(void) 1922 unsigned long nr_iowait(void)
1923 { 1923 {
1924 unsigned long i, sum = 0; 1924 unsigned long i, sum = 0;
1925 1925
1926 for_each_possible_cpu(i) 1926 for_each_possible_cpu(i)
1927 sum += atomic_read(&cpu_rq(i)->nr_iowait); 1927 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1928 1928
1929 return sum; 1929 return sum;
1930 } 1930 }
1931 1931
1932 unsigned long nr_active(void) 1932 unsigned long nr_active(void)
1933 { 1933 {
1934 unsigned long i, running = 0, uninterruptible = 0; 1934 unsigned long i, running = 0, uninterruptible = 0;
1935 1935
1936 for_each_online_cpu(i) { 1936 for_each_online_cpu(i) {
1937 running += cpu_rq(i)->nr_running; 1937 running += cpu_rq(i)->nr_running;
1938 uninterruptible += cpu_rq(i)->nr_uninterruptible; 1938 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1939 } 1939 }
1940 1940
1941 if (unlikely((long)uninterruptible < 0)) 1941 if (unlikely((long)uninterruptible < 0))
1942 uninterruptible = 0; 1942 uninterruptible = 0;
1943 1943
1944 return running + uninterruptible; 1944 return running + uninterruptible;
1945 } 1945 }
1946 1946
1947 /* 1947 /*
1948 * Update rq->cpu_load[] statistics. This function is usually called every 1948 * Update rq->cpu_load[] statistics. This function is usually called every
1949 * scheduler tick (TICK_NSEC). 1949 * scheduler tick (TICK_NSEC).
1950 */ 1950 */
1951 static void update_cpu_load(struct rq *this_rq) 1951 static void update_cpu_load(struct rq *this_rq)
1952 { 1952 {
1953 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64; 1953 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1954 unsigned long total_load = this_rq->ls.load.weight; 1954 unsigned long total_load = this_rq->ls.load.weight;
1955 unsigned long this_load = total_load; 1955 unsigned long this_load = total_load;
1956 struct load_stat *ls = &this_rq->ls; 1956 struct load_stat *ls = &this_rq->ls;
1957 u64 now = __rq_clock(this_rq); 1957 u64 now = __rq_clock(this_rq);
1958 int i, scale; 1958 int i, scale;
1959 1959
1960 this_rq->nr_load_updates++; 1960 this_rq->nr_load_updates++;
1961 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD))) 1961 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1962 goto do_avg; 1962 goto do_avg;
1963 1963
1964 /* Update delta_fair/delta_exec fields first */ 1964 /* Update delta_fair/delta_exec fields first */
1965 update_curr_load(this_rq, now); 1965 update_curr_load(this_rq, now);
1966 1966
1967 fair_delta64 = ls->delta_fair + 1; 1967 fair_delta64 = ls->delta_fair + 1;
1968 ls->delta_fair = 0; 1968 ls->delta_fair = 0;
1969 1969
1970 exec_delta64 = ls->delta_exec + 1; 1970 exec_delta64 = ls->delta_exec + 1;
1971 ls->delta_exec = 0; 1971 ls->delta_exec = 0;
1972 1972
1973 sample_interval64 = now - ls->load_update_last; 1973 sample_interval64 = now - ls->load_update_last;
1974 ls->load_update_last = now; 1974 ls->load_update_last = now;
1975 1975
1976 if ((s64)sample_interval64 < (s64)TICK_NSEC) 1976 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1977 sample_interval64 = TICK_NSEC; 1977 sample_interval64 = TICK_NSEC;
1978 1978
1979 if (exec_delta64 > sample_interval64) 1979 if (exec_delta64 > sample_interval64)
1980 exec_delta64 = sample_interval64; 1980 exec_delta64 = sample_interval64;
1981 1981
1982 idle_delta64 = sample_interval64 - exec_delta64; 1982 idle_delta64 = sample_interval64 - exec_delta64;
1983 1983
1984 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64); 1984 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1985 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64); 1985 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1986 1986
1987 this_load = (unsigned long)tmp64; 1987 this_load = (unsigned long)tmp64;
1988 1988
1989 do_avg: 1989 do_avg:
1990 1990
1991 /* Update our load: */ 1991 /* Update our load: */
1992 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 1992 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1993 unsigned long old_load, new_load; 1993 unsigned long old_load, new_load;
1994 1994
1995 /* scale is effectively 1 << i now, and >> i divides by scale */ 1995 /* scale is effectively 1 << i now, and >> i divides by scale */
1996 1996
1997 old_load = this_rq->cpu_load[i]; 1997 old_load = this_rq->cpu_load[i];
1998 new_load = this_load; 1998 new_load = this_load;
1999 1999
2000 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 2000 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2001 } 2001 }
2002 } 2002 }
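
The loop above is an exponential moving average: cpu_load[i] weights the new sample by 1/2^i, so higher indices track load over a longer horizon. A minimal userspace sketch of the same arithmetic (illustrative only, not kernel code):

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

/* Decay an array of load averages the same way the loop in
 * update_cpu_load() does: cpu_load[i] = (old * (2^i - 1) + new) / 2^i,
 * so higher indices react more slowly to the new sample. */
static void decay_cpu_load(unsigned long cpu_load[], unsigned long new_load)
{
	unsigned long scale;
	int i;

	for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
		unsigned long old_load = cpu_load[i];

		/* scale is effectively 1 << i, and >> i divides by scale */
		cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
	}
}

int main(void)
{
	unsigned long load[CPU_LOAD_IDX_MAX] = { 0 };
	int tick, i;

	for (tick = 0; tick < 4; tick++)
		decay_cpu_load(load, 1024);	/* constant load sample */

	for (i = 0; i < CPU_LOAD_IDX_MAX; i++)
		printf("cpu_load[%d] = %lu\n", i, load[i]);

	return 0;
}
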
2003 2003
2004 #ifdef CONFIG_SMP 2004 #ifdef CONFIG_SMP
2005 2005
2006 /* 2006 /*
2007 * double_rq_lock - safely lock two runqueues 2007 * double_rq_lock - safely lock two runqueues
2008 * 2008 *
2009 * Note this does not disable interrupts like task_rq_lock; 2009 * Note this does not disable interrupts like task_rq_lock;
2010 * you need to do so manually before calling. 2010 * you need to do so manually before calling.
2011 */ 2011 */
2012 static void double_rq_lock(struct rq *rq1, struct rq *rq2) 2012 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2013 __acquires(rq1->lock) 2013 __acquires(rq1->lock)
2014 __acquires(rq2->lock) 2014 __acquires(rq2->lock)
2015 { 2015 {
2016 BUG_ON(!irqs_disabled()); 2016 BUG_ON(!irqs_disabled());
2017 if (rq1 == rq2) { 2017 if (rq1 == rq2) {
2018 spin_lock(&rq1->lock); 2018 spin_lock(&rq1->lock);
2019 __acquire(rq2->lock); /* Fake it out ;) */ 2019 __acquire(rq2->lock); /* Fake it out ;) */
2020 } else { 2020 } else {
2021 if (rq1 < rq2) { 2021 if (rq1 < rq2) {
2022 spin_lock(&rq1->lock); 2022 spin_lock(&rq1->lock);
2023 spin_lock(&rq2->lock); 2023 spin_lock(&rq2->lock);
2024 } else { 2024 } else {
2025 spin_lock(&rq2->lock); 2025 spin_lock(&rq2->lock);
2026 spin_lock(&rq1->lock); 2026 spin_lock(&rq1->lock);
2027 } 2027 }
2028 } 2028 }
2029 } 2029 }
2030 2030
2031 /* 2031 /*
2032 * double_rq_unlock - safely unlock two runqueues 2032 * double_rq_unlock - safely unlock two runqueues
2033 * 2033 *
2034 * Note this does not restore interrupts like task_rq_unlock; 2034 * Note this does not restore interrupts like task_rq_unlock;
2035 * you need to do so manually after calling. 2035 * you need to do so manually after calling.
2036 */ 2036 */
2037 static void double_rq_unlock(struct rq *rq1, struct rq *rq2) 2037 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2038 __releases(rq1->lock) 2038 __releases(rq1->lock)
2039 __releases(rq2->lock) 2039 __releases(rq2->lock)
2040 { 2040 {
2041 spin_unlock(&rq1->lock); 2041 spin_unlock(&rq1->lock);
2042 if (rq1 != rq2) 2042 if (rq1 != rq2)
2043 spin_unlock(&rq2->lock); 2043 spin_unlock(&rq2->lock);
2044 else 2044 else
2045 __release(rq2->lock); 2045 __release(rq2->lock);
2046 } 2046 }
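
The address ordering above is what prevents ABBA deadlock when two CPUs each try to take the other's runqueue lock. A standalone sketch of the same trick with pthread mutexes (illustration only; the kernel uses raw spinlocks and compares the rq pointers directly):

#include <pthread.h>
#include <stdio.h>

/* Lock two mutexes in a stable (address) order, as double_rq_lock() does
 * for rq->lock, so two threads locking the same pair can never deadlock. */
static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);
		return;
	}
	if (a < b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

static void double_unlock(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);
	if (a != b)
		pthread_mutex_unlock(b);
}

int main(void)
{
	pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

	double_lock(&m1, &m2);	/* same order no matter how the pair is passed */
	printf("both locks held\n");
	double_unlock(&m1, &m2);

	return 0;
}
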
2047 2047
2048 /* 2048 /*
2049 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 2049 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2050 */ 2050 */
2051 static void double_lock_balance(struct rq *this_rq, struct rq *busiest) 2051 static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
2052 __releases(this_rq->lock) 2052 __releases(this_rq->lock)
2053 __acquires(busiest->lock) 2053 __acquires(busiest->lock)
2054 __acquires(this_rq->lock) 2054 __acquires(this_rq->lock)
2055 { 2055 {
2056 if (unlikely(!irqs_disabled())) { 2056 if (unlikely(!irqs_disabled())) {
2057 /* printk() doesn't work well under rq->lock */ 2057 /* printk() doesn't work well under rq->lock */
2058 spin_unlock(&this_rq->lock); 2058 spin_unlock(&this_rq->lock);
2059 BUG_ON(1); 2059 BUG_ON(1);
2060 } 2060 }
2061 if (unlikely(!spin_trylock(&busiest->lock))) { 2061 if (unlikely(!spin_trylock(&busiest->lock))) {
2062 if (busiest < this_rq) { 2062 if (busiest < this_rq) {
2063 spin_unlock(&this_rq->lock); 2063 spin_unlock(&this_rq->lock);
2064 spin_lock(&busiest->lock); 2064 spin_lock(&busiest->lock);
2065 spin_lock(&this_rq->lock); 2065 spin_lock(&this_rq->lock);
2066 } else 2066 } else
2067 spin_lock(&busiest->lock); 2067 spin_lock(&busiest->lock);
2068 } 2068 }
2069 } 2069 }
2070 2070
2071 /* 2071 /*
2072 * If dest_cpu is allowed for this process, migrate the task to it. 2072 * If dest_cpu is allowed for this process, migrate the task to it.
2073 * This is accomplished by forcing the cpu_allowed mask to only 2073 * This is accomplished by forcing the cpu_allowed mask to only
2074 * allow dest_cpu, which will force the task onto dest_cpu. Then 2074 * allow dest_cpu, which will force the task onto dest_cpu. Then
2075 * the cpu_allowed mask is restored. 2075 * the cpu_allowed mask is restored.
2076 */ 2076 */
2077 static void sched_migrate_task(struct task_struct *p, int dest_cpu) 2077 static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2078 { 2078 {
2079 struct migration_req req; 2079 struct migration_req req;
2080 unsigned long flags; 2080 unsigned long flags;
2081 struct rq *rq; 2081 struct rq *rq;
2082 2082
2083 rq = task_rq_lock(p, &flags); 2083 rq = task_rq_lock(p, &flags);
2084 if (!cpu_isset(dest_cpu, p->cpus_allowed) 2084 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2085 || unlikely(cpu_is_offline(dest_cpu))) 2085 || unlikely(cpu_is_offline(dest_cpu)))
2086 goto out; 2086 goto out;
2087 2087
2088 /* force the process onto the specified CPU */ 2088 /* force the process onto the specified CPU */
2089 if (migrate_task(p, dest_cpu, &req)) { 2089 if (migrate_task(p, dest_cpu, &req)) {
2090 /* Need to wait for migration thread (might exit: take ref). */ 2090 /* Need to wait for migration thread (might exit: take ref). */
2091 struct task_struct *mt = rq->migration_thread; 2091 struct task_struct *mt = rq->migration_thread;
2092 2092
2093 get_task_struct(mt); 2093 get_task_struct(mt);
2094 task_rq_unlock(rq, &flags); 2094 task_rq_unlock(rq, &flags);
2095 wake_up_process(mt); 2095 wake_up_process(mt);
2096 put_task_struct(mt); 2096 put_task_struct(mt);
2097 wait_for_completion(&req.done); 2097 wait_for_completion(&req.done);
2098 2098
2099 return; 2099 return;
2100 } 2100 }
2101 out: 2101 out:
2102 task_rq_unlock(rq, &flags); 2102 task_rq_unlock(rq, &flags);
2103 } 2103 }
2104 2104
2105 /* 2105 /*
2106 * sched_exec - execve() is a valuable balancing opportunity, because at 2106 * sched_exec - execve() is a valuable balancing opportunity, because at
2107 * this point the task has the smallest effective memory and cache footprint. 2107 * this point the task has the smallest effective memory and cache footprint.
2108 */ 2108 */
2109 void sched_exec(void) 2109 void sched_exec(void)
2110 { 2110 {
2111 int new_cpu, this_cpu = get_cpu(); 2111 int new_cpu, this_cpu = get_cpu();
2112 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); 2112 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2113 put_cpu(); 2113 put_cpu();
2114 if (new_cpu != this_cpu) 2114 if (new_cpu != this_cpu)
2115 sched_migrate_task(current, new_cpu); 2115 sched_migrate_task(current, new_cpu);
2116 } 2116 }
2117 2117
2118 /* 2118 /*
2119 * pull_task - move a task from a remote runqueue to the local runqueue. 2119 * pull_task - move a task from a remote runqueue to the local runqueue.
2120 * Both runqueues must be locked. 2120 * Both runqueues must be locked.
2121 */ 2121 */
2122 static void pull_task(struct rq *src_rq, struct task_struct *p, 2122 static void pull_task(struct rq *src_rq, struct task_struct *p,
2123 struct rq *this_rq, int this_cpu) 2123 struct rq *this_rq, int this_cpu)
2124 { 2124 {
2125 deactivate_task(src_rq, p, 0); 2125 deactivate_task(src_rq, p, 0);
2126 set_task_cpu(p, this_cpu); 2126 set_task_cpu(p, this_cpu);
2127 activate_task(this_rq, p, 0); 2127 activate_task(this_rq, p, 0);
2128 /* 2128 /*
2129 * Note that idle threads have a prio of MAX_PRIO, so this test 2129 * Note that idle threads have a prio of MAX_PRIO, so this test
2130 * will always be true for them. 2130 * will always be true for them.
2131 */ 2131 */
2132 check_preempt_curr(this_rq, p); 2132 check_preempt_curr(this_rq, p);
2133 } 2133 }
2134 2134
2135 /* 2135 /*
2136 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 2136 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2137 */ 2137 */
2138 static 2138 static
2139 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 2139 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2140 struct sched_domain *sd, enum cpu_idle_type idle, 2140 struct sched_domain *sd, enum cpu_idle_type idle,
2141 int *all_pinned) 2141 int *all_pinned)
2142 { 2142 {
2143 /* 2143 /*
2144 * We do not migrate tasks that are: 2144 * We do not migrate tasks that are:
2145 * 1) running (obviously), or 2145 * 1) running (obviously), or
2146 * 2) cannot be migrated to this CPU due to cpus_allowed, or 2146 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2147 * 3) are cache-hot on their current CPU. 2147 * 3) are cache-hot on their current CPU.
2148 */ 2148 */
2149 if (!cpu_isset(this_cpu, p->cpus_allowed)) 2149 if (!cpu_isset(this_cpu, p->cpus_allowed))
2150 return 0; 2150 return 0;
2151 *all_pinned = 0; 2151 *all_pinned = 0;
2152 2152
2153 if (task_running(rq, p)) 2153 if (task_running(rq, p))
2154 return 0; 2154 return 0;
2155 2155
2156 /* 2156 /*
2157 * Aggressive migration if too many balance attempts have failed: 2157 * Aggressive migration if too many balance attempts have failed:
2158 */ 2158 */
2159 if (sd->nr_balance_failed > sd->cache_nice_tries) 2159 if (sd->nr_balance_failed > sd->cache_nice_tries)
2160 return 1; 2160 return 1;
2161 2161
2162 return 1; 2162 return 1;
2163 } 2163 }
2164 2164
2165 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 2165 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2166 unsigned long max_nr_move, unsigned long max_load_move, 2166 unsigned long max_nr_move, unsigned long max_load_move,
2167 struct sched_domain *sd, enum cpu_idle_type idle, 2167 struct sched_domain *sd, enum cpu_idle_type idle,
2168 int *all_pinned, unsigned long *load_moved, 2168 int *all_pinned, unsigned long *load_moved,
2169 int this_best_prio, int best_prio, int best_prio_seen, 2169 int this_best_prio, int best_prio, int best_prio_seen,
2170 struct rq_iterator *iterator) 2170 struct rq_iterator *iterator)
2171 { 2171 {
2172 int pulled = 0, pinned = 0, skip_for_load; 2172 int pulled = 0, pinned = 0, skip_for_load;
2173 struct task_struct *p; 2173 struct task_struct *p;
2174 long rem_load_move = max_load_move; 2174 long rem_load_move = max_load_move;
2175 2175
2176 if (max_nr_move == 0 || max_load_move == 0) 2176 if (max_nr_move == 0 || max_load_move == 0)
2177 goto out; 2177 goto out;
2178 2178
2179 pinned = 1; 2179 pinned = 1;
2180 2180
2181 /* 2181 /*
2182 * Start the load-balancing iterator: 2182 * Start the load-balancing iterator:
2183 */ 2183 */
2184 p = iterator->start(iterator->arg); 2184 p = iterator->start(iterator->arg);
2185 next: 2185 next:
2186 if (!p) 2186 if (!p)
2187 goto out; 2187 goto out;
2188 /* 2188 /*
2189 * To help distribute high priority tasks across CPUs, we don't 2189 * To help distribute high priority tasks across CPUs, we don't
2190 * skip a task if it will be the highest priority task (i.e. smallest 2190 * skip a task if it will be the highest priority task (i.e. smallest
2191 * prio value) on its new queue regardless of its load weight. 2191 * prio value) on its new queue regardless of its load weight.
2192 */ 2192 */
2193 skip_for_load = (p->se.load.weight >> 1) > rem_load_move + 2193 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2194 SCHED_LOAD_SCALE_FUZZ; 2194 SCHED_LOAD_SCALE_FUZZ;
2195 if (skip_for_load && p->prio < this_best_prio) 2195 if (skip_for_load && p->prio < this_best_prio)
2196 skip_for_load = !best_prio_seen && p->prio == best_prio; 2196 skip_for_load = !best_prio_seen && p->prio == best_prio;
2197 if (skip_for_load || 2197 if (skip_for_load ||
2198 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { 2198 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2199 2199
2200 best_prio_seen |= p->prio == best_prio; 2200 best_prio_seen |= p->prio == best_prio;
2201 p = iterator->next(iterator->arg); 2201 p = iterator->next(iterator->arg);
2202 goto next; 2202 goto next;
2203 } 2203 }
2204 2204
2205 pull_task(busiest, p, this_rq, this_cpu); 2205 pull_task(busiest, p, this_rq, this_cpu);
2206 pulled++; 2206 pulled++;
2207 rem_load_move -= p->se.load.weight; 2207 rem_load_move -= p->se.load.weight;
2208 2208
2209 /* 2209 /*
2210 * We only want to steal up to the prescribed number of tasks 2210 * We only want to steal up to the prescribed number of tasks
2211 * and the prescribed amount of weighted load. 2211 * and the prescribed amount of weighted load.
2212 */ 2212 */
2213 if (pulled < max_nr_move && rem_load_move > 0) { 2213 if (pulled < max_nr_move && rem_load_move > 0) {
2214 if (p->prio < this_best_prio) 2214 if (p->prio < this_best_prio)
2215 this_best_prio = p->prio; 2215 this_best_prio = p->prio;
2216 p = iterator->next(iterator->arg); 2216 p = iterator->next(iterator->arg);
2217 goto next; 2217 goto next;
2218 } 2218 }
2219 out: 2219 out:
2220 /* 2220 /*
2221 * Right now, this is the only place pull_task() is called, 2221 * Right now, this is the only place pull_task() is called,
2222 * so we can safely collect pull_task() stats here rather than 2222 * so we can safely collect pull_task() stats here rather than
2223 * inside pull_task(). 2223 * inside pull_task().
2224 */ 2224 */
2225 schedstat_add(sd, lb_gained[idle], pulled); 2225 schedstat_add(sd, lb_gained[idle], pulled);
2226 2226
2227 if (all_pinned) 2227 if (all_pinned)
2228 *all_pinned = pinned; 2228 *all_pinned = pinned;
2229 *load_moved = max_load_move - rem_load_move; 2229 *load_moved = max_load_move - rem_load_move;
2230 return pulled; 2230 return pulled;
2231 } 2231 }
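
balance_tasks() never walks a class's queue directly; it only drives the start/next callbacks of the rq_iterator it is handed. A hypothetical caller inside a scheduling class's load_balance() hook might wire it up roughly as below (my_queue, my_first_task and my_next_task are made-up names, not kernel symbols):

	/*
	 * Hypothetical wiring only: a scheduling class's ->load_balance()
	 * implementation hands its own task iterator to balance_tasks().
	 */
	struct rq_iterator my_iter;
	unsigned long load_moved;
	int nr_moved;

	my_iter.arg   = my_queue;
	my_iter.start = my_first_task;
	my_iter.next  = my_next_task;

	nr_moved = balance_tasks(this_rq, this_cpu, busiest,
				 max_nr_move, max_load_move, sd, idle,
				 all_pinned, &load_moved,
				 this_best_prio, best_prio, best_prio_seen,
				 &my_iter);
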
2232 2232
2233 /* 2233 /*
2234 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted 2234 * move_tasks tries to move up to max_load_move weighted load from busiest to
2235 * load from busiest to this_rq, as part of a balancing operation within 2235 * this_rq, as part of a balancing operation within domain "sd".
2236 * "domain". Returns the number of tasks moved. 2236 * Returns 1 if successful and 0 otherwise.
2237 * 2237 *
2238 * Called with both runqueues locked. 2238 * Called with both runqueues locked.
2239 */ 2239 */
2240 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 2240 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2241 unsigned long max_nr_move, unsigned long max_load_move, 2241 unsigned long max_load_move,
2242 struct sched_domain *sd, enum cpu_idle_type idle, 2242 struct sched_domain *sd, enum cpu_idle_type idle,
2243 int *all_pinned) 2243 int *all_pinned)
2244 { 2244 {
2245 struct sched_class *class = sched_class_highest; 2245 struct sched_class *class = sched_class_highest;
2246 unsigned long load_moved, total_nr_moved = 0, nr_moved; 2246 unsigned long total_load_moved = 0;
2247 long rem_load_move = max_load_move;
2248 2247
2249 do { 2248 do {
2250 nr_moved = class->load_balance(this_rq, this_cpu, busiest, 2249 total_load_moved +=
2251 max_nr_move, (unsigned long)rem_load_move, 2250 class->load_balance(this_rq, this_cpu, busiest,
2252 sd, idle, all_pinned, &load_moved); 2251 ULONG_MAX, max_load_move - total_load_moved,
2253 total_nr_moved += nr_moved; 2252 sd, idle, all_pinned);
2254 max_nr_move -= nr_moved;
2255 rem_load_move -= load_moved;
2256 class = class->next; 2253 class = class->next;
2257 } while (class && max_nr_move && rem_load_move > 0); 2254 } while (class && max_load_move > total_load_moved);
2258 2255
2259 return total_nr_moved; 2256 return total_load_moved > 0;
2260 } 2257 }
2261 2258
2262 /* 2259 /*
2260 * move_one_task tries to move exactly one task from busiest to this_rq, as
2261 * part of active balancing operations within "domain".
2262 * Returns 1 if successful and 0 otherwise.
2263 *
2264 * Called with both runqueues locked.
2265 */
2266 static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2267 struct sched_domain *sd, enum cpu_idle_type idle)
2268 {
2269 struct sched_class *class;
2270
2271 for (class = sched_class_highest; class; class = class->next)
2272 if (class->load_balance(this_rq, this_cpu, busiest,
2273 1, ULONG_MAX, sd, idle, NULL))
2274 return 1;
2275
2276 return 0;
2277 }
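
move_one_task() preserves the semantics active balancing needs: at most one task, reported only as success or failure. The active_load_balance() hunk is not part of this excerpt, so the following call-site sketch is an assumption about how it is used (target_rq, target_cpu and busiest_rq stand in for whatever the real caller has in scope):

	/*
	 * Sketch of the expected call site in active_load_balance();
	 * target_rq, target_cpu and busiest_rq are stand-ins for the
	 * caller's real variables.
	 */
	if (move_one_task(target_rq, target_cpu, busiest_rq, sd, CPU_IDLE))
		schedstat_inc(sd, alb_pushed);
	else
		schedstat_inc(sd, alb_failed);
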
2278
2279 /*
2263 * find_busiest_group finds and returns the busiest CPU group within the 2280 * find_busiest_group finds and returns the busiest CPU group within the
2264 * domain. It calculates and returns the amount of weighted load which 2281 * domain. It calculates and returns the amount of weighted load which
2265 * should be moved to restore balance via the imbalance parameter. 2282 * should be moved to restore balance via the imbalance parameter.
2266 */ 2283 */
2267 static struct sched_group * 2284 static struct sched_group *
2268 find_busiest_group(struct sched_domain *sd, int this_cpu, 2285 find_busiest_group(struct sched_domain *sd, int this_cpu,
2269 unsigned long *imbalance, enum cpu_idle_type idle, 2286 unsigned long *imbalance, enum cpu_idle_type idle,
2270 int *sd_idle, cpumask_t *cpus, int *balance) 2287 int *sd_idle, cpumask_t *cpus, int *balance)
2271 { 2288 {
2272 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2289 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2273 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2290 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
2274 unsigned long max_pull; 2291 unsigned long max_pull;
2275 unsigned long busiest_load_per_task, busiest_nr_running; 2292 unsigned long busiest_load_per_task, busiest_nr_running;
2276 unsigned long this_load_per_task, this_nr_running; 2293 unsigned long this_load_per_task, this_nr_running;
2277 int load_idx; 2294 int load_idx;
2278 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2295 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2279 int power_savings_balance = 1; 2296 int power_savings_balance = 1;
2280 unsigned long leader_nr_running = 0, min_load_per_task = 0; 2297 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2281 unsigned long min_nr_running = ULONG_MAX; 2298 unsigned long min_nr_running = ULONG_MAX;
2282 struct sched_group *group_min = NULL, *group_leader = NULL; 2299 struct sched_group *group_min = NULL, *group_leader = NULL;
2283 #endif 2300 #endif
2284 2301
2285 max_load = this_load = total_load = total_pwr = 0; 2302 max_load = this_load = total_load = total_pwr = 0;
2286 busiest_load_per_task = busiest_nr_running = 0; 2303 busiest_load_per_task = busiest_nr_running = 0;
2287 this_load_per_task = this_nr_running = 0; 2304 this_load_per_task = this_nr_running = 0;
2288 if (idle == CPU_NOT_IDLE) 2305 if (idle == CPU_NOT_IDLE)
2289 load_idx = sd->busy_idx; 2306 load_idx = sd->busy_idx;
2290 else if (idle == CPU_NEWLY_IDLE) 2307 else if (idle == CPU_NEWLY_IDLE)
2291 load_idx = sd->newidle_idx; 2308 load_idx = sd->newidle_idx;
2292 else 2309 else
2293 load_idx = sd->idle_idx; 2310 load_idx = sd->idle_idx;
2294 2311
2295 do { 2312 do {
2296 unsigned long load, group_capacity; 2313 unsigned long load, group_capacity;
2297 int local_group; 2314 int local_group;
2298 int i; 2315 int i;
2299 unsigned int balance_cpu = -1, first_idle_cpu = 0; 2316 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2300 unsigned long sum_nr_running, sum_weighted_load; 2317 unsigned long sum_nr_running, sum_weighted_load;
2301 2318
2302 local_group = cpu_isset(this_cpu, group->cpumask); 2319 local_group = cpu_isset(this_cpu, group->cpumask);
2303 2320
2304 if (local_group) 2321 if (local_group)
2305 balance_cpu = first_cpu(group->cpumask); 2322 balance_cpu = first_cpu(group->cpumask);
2306 2323
2307 /* Tally up the load of all CPUs in the group */ 2324 /* Tally up the load of all CPUs in the group */
2308 sum_weighted_load = sum_nr_running = avg_load = 0; 2325 sum_weighted_load = sum_nr_running = avg_load = 0;
2309 2326
2310 for_each_cpu_mask(i, group->cpumask) { 2327 for_each_cpu_mask(i, group->cpumask) {
2311 struct rq *rq; 2328 struct rq *rq;
2312 2329
2313 if (!cpu_isset(i, *cpus)) 2330 if (!cpu_isset(i, *cpus))
2314 continue; 2331 continue;
2315 2332
2316 rq = cpu_rq(i); 2333 rq = cpu_rq(i);
2317 2334
2318 if (*sd_idle && rq->nr_running) 2335 if (*sd_idle && rq->nr_running)
2319 *sd_idle = 0; 2336 *sd_idle = 0;
2320 2337
2321 /* Bias balancing toward cpus of our domain */ 2338 /* Bias balancing toward cpus of our domain */
2322 if (local_group) { 2339 if (local_group) {
2323 if (idle_cpu(i) && !first_idle_cpu) { 2340 if (idle_cpu(i) && !first_idle_cpu) {
2324 first_idle_cpu = 1; 2341 first_idle_cpu = 1;
2325 balance_cpu = i; 2342 balance_cpu = i;
2326 } 2343 }
2327 2344
2328 load = target_load(i, load_idx); 2345 load = target_load(i, load_idx);
2329 } else 2346 } else
2330 load = source_load(i, load_idx); 2347 load = source_load(i, load_idx);
2331 2348
2332 avg_load += load; 2349 avg_load += load;
2333 sum_nr_running += rq->nr_running; 2350 sum_nr_running += rq->nr_running;
2334 sum_weighted_load += weighted_cpuload(i); 2351 sum_weighted_load += weighted_cpuload(i);
2335 } 2352 }
2336 2353
2337 /* 2354 /*
2338 * First idle cpu or the first cpu (busiest) in this sched group 2355 * First idle cpu or the first cpu (busiest) in this sched group
2339 * is eligible for doing load balancing at this and above 2356 * is eligible for doing load balancing at this and above
2340 * domains. In the newly idle case, we will allow all the cpus 2357 * domains. In the newly idle case, we will allow all the cpus
2341 * to do the newly idle load balance. 2358 * to do the newly idle load balance.
2342 */ 2359 */
2343 if (idle != CPU_NEWLY_IDLE && local_group && 2360 if (idle != CPU_NEWLY_IDLE && local_group &&
2344 balance_cpu != this_cpu && balance) { 2361 balance_cpu != this_cpu && balance) {
2345 *balance = 0; 2362 *balance = 0;
2346 goto ret; 2363 goto ret;
2347 } 2364 }
2348 2365
2349 total_load += avg_load; 2366 total_load += avg_load;
2350 total_pwr += group->__cpu_power; 2367 total_pwr += group->__cpu_power;
2351 2368
2352 /* Adjust by relative CPU power of the group */ 2369 /* Adjust by relative CPU power of the group */
2353 avg_load = sg_div_cpu_power(group, 2370 avg_load = sg_div_cpu_power(group,
2354 avg_load * SCHED_LOAD_SCALE); 2371 avg_load * SCHED_LOAD_SCALE);
2355 2372
2356 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 2373 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
2357 2374
2358 if (local_group) { 2375 if (local_group) {
2359 this_load = avg_load; 2376 this_load = avg_load;
2360 this = group; 2377 this = group;
2361 this_nr_running = sum_nr_running; 2378 this_nr_running = sum_nr_running;
2362 this_load_per_task = sum_weighted_load; 2379 this_load_per_task = sum_weighted_load;
2363 } else if (avg_load > max_load && 2380 } else if (avg_load > max_load &&
2364 sum_nr_running > group_capacity) { 2381 sum_nr_running > group_capacity) {
2365 max_load = avg_load; 2382 max_load = avg_load;
2366 busiest = group; 2383 busiest = group;
2367 busiest_nr_running = sum_nr_running; 2384 busiest_nr_running = sum_nr_running;
2368 busiest_load_per_task = sum_weighted_load; 2385 busiest_load_per_task = sum_weighted_load;
2369 } 2386 }
2370 2387
2371 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2388 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2372 /* 2389 /*
2373 * Busy processors will not participate in power savings 2390 * Busy processors will not participate in power savings
2374 * balance. 2391 * balance.
2375 */ 2392 */
2376 if (idle == CPU_NOT_IDLE || 2393 if (idle == CPU_NOT_IDLE ||
2377 !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2394 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2378 goto group_next; 2395 goto group_next;
2379 2396
2380 /* 2397 /*
2381 * If the local group is idle or completely loaded 2398 * If the local group is idle or completely loaded
2382 * no need to do power savings balance at this domain 2399 * no need to do power savings balance at this domain
2383 */ 2400 */
2384 if (local_group && (this_nr_running >= group_capacity || 2401 if (local_group && (this_nr_running >= group_capacity ||
2385 !this_nr_running)) 2402 !this_nr_running))
2386 power_savings_balance = 0; 2403 power_savings_balance = 0;
2387 2404
2388 /* 2405 /*
2389 * If a group is already running at full capacity or idle, 2406 * If a group is already running at full capacity or idle,
2390 * don't include that group in power savings calculations 2407 * don't include that group in power savings calculations
2391 */ 2408 */
2392 if (!power_savings_balance || sum_nr_running >= group_capacity 2409 if (!power_savings_balance || sum_nr_running >= group_capacity
2393 || !sum_nr_running) 2410 || !sum_nr_running)
2394 goto group_next; 2411 goto group_next;
2395 2412
2396 /* 2413 /*
2397 * Calculate the group which has the least non-idle load. 2414 * Calculate the group which has the least non-idle load.
2398 * This is the group from where we need to pick up the load 2415 * This is the group from where we need to pick up the load
2399 * for saving power 2416 * for saving power
2400 */ 2417 */
2401 if ((sum_nr_running < min_nr_running) || 2418 if ((sum_nr_running < min_nr_running) ||
2402 (sum_nr_running == min_nr_running && 2419 (sum_nr_running == min_nr_running &&
2403 first_cpu(group->cpumask) < 2420 first_cpu(group->cpumask) <
2404 first_cpu(group_min->cpumask))) { 2421 first_cpu(group_min->cpumask))) {
2405 group_min = group; 2422 group_min = group;
2406 min_nr_running = sum_nr_running; 2423 min_nr_running = sum_nr_running;
2407 min_load_per_task = sum_weighted_load / 2424 min_load_per_task = sum_weighted_load /
2408 sum_nr_running; 2425 sum_nr_running;
2409 } 2426 }
2410 2427
2411 /* 2428 /*
2412 * Calculate the group which is near its 2429 * Calculate the group which is near its
2413 * capacity but still has some space to pick up some load 2430 * capacity but still has some space to pick up some load
2414 * from another group and save more power 2431 * from another group and save more power
2415 */ 2432 */
2416 if (sum_nr_running <= group_capacity - 1) { 2433 if (sum_nr_running <= group_capacity - 1) {
2417 if (sum_nr_running > leader_nr_running || 2434 if (sum_nr_running > leader_nr_running ||
2418 (sum_nr_running == leader_nr_running && 2435 (sum_nr_running == leader_nr_running &&
2419 first_cpu(group->cpumask) > 2436 first_cpu(group->cpumask) >
2420 first_cpu(group_leader->cpumask))) { 2437 first_cpu(group_leader->cpumask))) {
2421 group_leader = group; 2438 group_leader = group;
2422 leader_nr_running = sum_nr_running; 2439 leader_nr_running = sum_nr_running;
2423 } 2440 }
2424 } 2441 }
2425 group_next: 2442 group_next:
2426 #endif 2443 #endif
2427 group = group->next; 2444 group = group->next;
2428 } while (group != sd->groups); 2445 } while (group != sd->groups);
2429 2446
2430 if (!busiest || this_load >= max_load || busiest_nr_running == 0) 2447 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2431 goto out_balanced; 2448 goto out_balanced;
2432 2449
2433 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; 2450 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2434 2451
2435 if (this_load >= avg_load || 2452 if (this_load >= avg_load ||
2436 100*max_load <= sd->imbalance_pct*this_load) 2453 100*max_load <= sd->imbalance_pct*this_load)
2437 goto out_balanced; 2454 goto out_balanced;
2438 2455
2439 busiest_load_per_task /= busiest_nr_running; 2456 busiest_load_per_task /= busiest_nr_running;
2440 /* 2457 /*
2441 * We're trying to get all the cpus to the average_load, so we don't 2458 * We're trying to get all the cpus to the average_load, so we don't
2442 * want to push ourselves above the average load, nor do we wish to 2459 * want to push ourselves above the average load, nor do we wish to
2443 * reduce the max loaded cpu below the average load, as either of these 2460 * reduce the max loaded cpu below the average load, as either of these
2444 * actions would just result in more rebalancing later, and ping-pong 2461 * actions would just result in more rebalancing later, and ping-pong
2445 * tasks around. Thus we look for the minimum possible imbalance. 2462 * tasks around. Thus we look for the minimum possible imbalance.
2446 * Negative imbalances (*we* are more loaded than anyone else) will 2463 * Negative imbalances (*we* are more loaded than anyone else) will
2447 * be counted as no imbalance for these purposes -- we can't fix that 2464 * be counted as no imbalance for these purposes -- we can't fix that
2448 * by pulling tasks to us. Be careful of negative numbers as they'll 2465 * by pulling tasks to us. Be careful of negative numbers as they'll
2449 * appear as very large values with unsigned longs. 2466 * appear as very large values with unsigned longs.
2450 */ 2467 */
2451 if (max_load <= busiest_load_per_task) 2468 if (max_load <= busiest_load_per_task)
2452 goto out_balanced; 2469 goto out_balanced;
2453 2470
2454 /* 2471 /*
2455 * In the presence of smp nice balancing, certain scenarios can have 2472 * In the presence of smp nice balancing, certain scenarios can have
2456 * max load less than avg load(as we skip the groups at or below 2473 * max load less than avg load(as we skip the groups at or below
2457 * its cpu_power, while calculating max_load..) 2474 * its cpu_power, while calculating max_load..)
2458 */ 2475 */
2459 if (max_load < avg_load) { 2476 if (max_load < avg_load) {
2460 *imbalance = 0; 2477 *imbalance = 0;
2461 goto small_imbalance; 2478 goto small_imbalance;
2462 } 2479 }
2463 2480
2464 /* Don't want to pull so many tasks that a group would go idle */ 2481 /* Don't want to pull so many tasks that a group would go idle */
2465 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); 2482 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2466 2483
2467 /* How much load to actually move to equalise the imbalance */ 2484 /* How much load to actually move to equalise the imbalance */
2468 *imbalance = min(max_pull * busiest->__cpu_power, 2485 *imbalance = min(max_pull * busiest->__cpu_power,
2469 (avg_load - this_load) * this->__cpu_power) 2486 (avg_load - this_load) * this->__cpu_power)
2470 / SCHED_LOAD_SCALE; 2487 / SCHED_LOAD_SCALE;
2471 2488
2472 /* 2489 /*
2473 * if *imbalance is less than the average load per runnable task 2490 * if *imbalance is less than the average load per runnable task
2474 * there is no guarantee that any tasks will be moved, so we'll have 2491 * there is no guarantee that any tasks will be moved, so we'll have
2475 * a think about bumping its value to force at least one task to be 2492 * a think about bumping its value to force at least one task to be
2476 * moved 2493 * moved
2477 */ 2494 */
2478 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) { 2495 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
2479 unsigned long tmp, pwr_now, pwr_move; 2496 unsigned long tmp, pwr_now, pwr_move;
2480 unsigned int imbn; 2497 unsigned int imbn;
2481 2498
2482 small_imbalance: 2499 small_imbalance:
2483 pwr_move = pwr_now = 0; 2500 pwr_move = pwr_now = 0;
2484 imbn = 2; 2501 imbn = 2;
2485 if (this_nr_running) { 2502 if (this_nr_running) {
2486 this_load_per_task /= this_nr_running; 2503 this_load_per_task /= this_nr_running;
2487 if (busiest_load_per_task > this_load_per_task) 2504 if (busiest_load_per_task > this_load_per_task)
2488 imbn = 1; 2505 imbn = 1;
2489 } else 2506 } else
2490 this_load_per_task = SCHED_LOAD_SCALE; 2507 this_load_per_task = SCHED_LOAD_SCALE;
2491 2508
2492 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= 2509 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2493 busiest_load_per_task * imbn) { 2510 busiest_load_per_task * imbn) {
2494 *imbalance = busiest_load_per_task; 2511 *imbalance = busiest_load_per_task;
2495 return busiest; 2512 return busiest;
2496 } 2513 }
2497 2514
2498 /* 2515 /*
2499 * OK, we don't have enough imbalance to justify moving tasks, 2516 * OK, we don't have enough imbalance to justify moving tasks,
2500 * however we may be able to increase total CPU power used by 2517 * however we may be able to increase total CPU power used by
2501 * moving them. 2518 * moving them.
2502 */ 2519 */
2503 2520
2504 pwr_now += busiest->__cpu_power * 2521 pwr_now += busiest->__cpu_power *
2505 min(busiest_load_per_task, max_load); 2522 min(busiest_load_per_task, max_load);
2506 pwr_now += this->__cpu_power * 2523 pwr_now += this->__cpu_power *
2507 min(this_load_per_task, this_load); 2524 min(this_load_per_task, this_load);
2508 pwr_now /= SCHED_LOAD_SCALE; 2525 pwr_now /= SCHED_LOAD_SCALE;
2509 2526
2510 /* Amount of load we'd subtract */ 2527 /* Amount of load we'd subtract */
2511 tmp = sg_div_cpu_power(busiest, 2528 tmp = sg_div_cpu_power(busiest,
2512 busiest_load_per_task * SCHED_LOAD_SCALE); 2529 busiest_load_per_task * SCHED_LOAD_SCALE);
2513 if (max_load > tmp) 2530 if (max_load > tmp)
2514 pwr_move += busiest->__cpu_power * 2531 pwr_move += busiest->__cpu_power *
2515 min(busiest_load_per_task, max_load - tmp); 2532 min(busiest_load_per_task, max_load - tmp);
2516 2533
2517 /* Amount of load we'd add */ 2534 /* Amount of load we'd add */
2518 if (max_load * busiest->__cpu_power < 2535 if (max_load * busiest->__cpu_power <
2519 busiest_load_per_task * SCHED_LOAD_SCALE) 2536 busiest_load_per_task * SCHED_LOAD_SCALE)
2520 tmp = sg_div_cpu_power(this, 2537 tmp = sg_div_cpu_power(this,
2521 max_load * busiest->__cpu_power); 2538 max_load * busiest->__cpu_power);
2522 else 2539 else
2523 tmp = sg_div_cpu_power(this, 2540 tmp = sg_div_cpu_power(this,
2524 busiest_load_per_task * SCHED_LOAD_SCALE); 2541 busiest_load_per_task * SCHED_LOAD_SCALE);
2525 pwr_move += this->__cpu_power * 2542 pwr_move += this->__cpu_power *
2526 min(this_load_per_task, this_load + tmp); 2543 min(this_load_per_task, this_load + tmp);
2527 pwr_move /= SCHED_LOAD_SCALE; 2544 pwr_move /= SCHED_LOAD_SCALE;
2528 2545
2529 /* Move if we gain throughput */ 2546 /* Move if we gain throughput */
2530 if (pwr_move <= pwr_now) 2547 if (pwr_move <= pwr_now)
2531 goto out_balanced; 2548 goto out_balanced;
2532 2549
2533 *imbalance = busiest_load_per_task; 2550 *imbalance = busiest_load_per_task;
2534 } 2551 }
2535 2552
2536 return busiest; 2553 return busiest;
2537 2554
2538 out_balanced: 2555 out_balanced:
2539 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2556 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2540 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2557 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2541 goto ret; 2558 goto ret;
2542 2559
2543 if (this == group_leader && group_leader != group_min) { 2560 if (this == group_leader && group_leader != group_min) {
2544 *imbalance = min_load_per_task; 2561 *imbalance = min_load_per_task;
2545 return group_min; 2562 return group_min;
2546 } 2563 }
2547 #endif 2564 #endif
2548 ret: 2565 ret:
2549 *imbalance = 0; 2566 *imbalance = 0;
2550 return NULL; 2567 return NULL;
2551 } 2568 }
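
To get a feel for the imbalance calculation above, here is a tiny standalone example with made-up numbers, assuming SCHED_LOAD_SCALE is 1024 and both groups have __cpu_power equal to 1024:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL	/* assumed value of the kernel constant */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Made-up numbers: one overloaded group, one lightly loaded group. */
	unsigned long max_load = 2048, this_load = 512, avg_load = 1280;
	unsigned long busiest_load_per_task = 1024;
	unsigned long busiest_power = SCHED_LOAD_SCALE;
	unsigned long this_power = SCHED_LOAD_SCALE;

	/* Don't pull so much that the busiest group would go idle */
	unsigned long max_pull = min_ul(max_load - avg_load,
					max_load - busiest_load_per_task);

	/* Same expression as in find_busiest_group() above */
	unsigned long imbalance = min_ul(max_pull * busiest_power,
					 (avg_load - this_load) * this_power)
				  / SCHED_LOAD_SCALE;

	/* Prints: max_pull = 768, imbalance = 768 */
	printf("max_pull = %lu, imbalance = %lu\n", max_pull, imbalance);

	return 0;
}
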
2552 2569
2553 /* 2570 /*
2554 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2571 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2555 */ 2572 */
2556 static struct rq * 2573 static struct rq *
2557 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 2574 find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2558 unsigned long imbalance, cpumask_t *cpus) 2575 unsigned long imbalance, cpumask_t *cpus)
2559 { 2576 {
2560 struct rq *busiest = NULL, *rq; 2577 struct rq *busiest = NULL, *rq;
2561 unsigned long max_load = 0; 2578 unsigned long max_load = 0;
2562 int i; 2579 int i;
2563 2580
2564 for_each_cpu_mask(i, group->cpumask) { 2581 for_each_cpu_mask(i, group->cpumask) {
2565 unsigned long wl; 2582 unsigned long wl;
2566 2583
2567 if (!cpu_isset(i, *cpus)) 2584 if (!cpu_isset(i, *cpus))
2568 continue; 2585 continue;
2569 2586
2570 rq = cpu_rq(i); 2587 rq = cpu_rq(i);
2571 wl = weighted_cpuload(i); 2588 wl = weighted_cpuload(i);
2572 2589
2573 if (rq->nr_running == 1 && wl > imbalance) 2590 if (rq->nr_running == 1 && wl > imbalance)
2574 continue; 2591 continue;
2575 2592
2576 if (wl > max_load) { 2593 if (wl > max_load) {
2577 max_load = wl; 2594 max_load = wl;
2578 busiest = rq; 2595 busiest = rq;
2579 } 2596 }
2580 } 2597 }
2581 2598
2582 return busiest; 2599 return busiest;
2583 } 2600 }
2584 2601
2585 /* 2602 /*
2586 * Max backoff if we encounter pinned tasks. The value is pretty arbitrary; 2603 * Max backoff if we encounter pinned tasks. The value is pretty arbitrary;
2587 * it just needs to be large enough. 2604 * it just needs to be large enough.
2588 */ 2605 */
2589 #define MAX_PINNED_INTERVAL 512 2606 #define MAX_PINNED_INTERVAL 512
2590 2607
2591 static inline unsigned long minus_1_or_zero(unsigned long n)
2592 {
2593 return n > 0 ? n - 1 : 0;
2594 }
2595
2596 /* 2608 /*
2597 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2609 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2598 * tasks if there is an imbalance. 2610 * tasks if there is an imbalance.
2599 */ 2611 */
2600 static int load_balance(int this_cpu, struct rq *this_rq, 2612 static int load_balance(int this_cpu, struct rq *this_rq,
2601 struct sched_domain *sd, enum cpu_idle_type idle, 2613 struct sched_domain *sd, enum cpu_idle_type idle,
2602 int *balance) 2614 int *balance)
2603 { 2615 {
2604 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 2616 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2605 struct sched_group *group; 2617 struct sched_group *group;
2606 unsigned long imbalance; 2618 unsigned long imbalance;
2607 struct rq *busiest; 2619 struct rq *busiest;
2608 cpumask_t cpus = CPU_MASK_ALL; 2620 cpumask_t cpus = CPU_MASK_ALL;
2609 unsigned long flags; 2621 unsigned long flags;
2610 2622
2611 /* 2623 /*
2612 * When power savings policy is enabled for the parent domain, an idle 2624 * When power savings policy is enabled for the parent domain, an idle
2613 * sibling can pick up load irrespective of busy siblings. In this case, 2625 * sibling can pick up load irrespective of busy siblings. In this case,
2614 * let the state of the idle sibling percolate up as CPU_IDLE, instead of 2626 * let the state of the idle sibling percolate up as CPU_IDLE, instead of
2615 * portraying it as CPU_NOT_IDLE. 2627 * portraying it as CPU_NOT_IDLE.
2616 */ 2628 */
2617 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && 2629 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2618 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2630 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2619 sd_idle = 1; 2631 sd_idle = 1;
2620 2632
2621 schedstat_inc(sd, lb_cnt[idle]); 2633 schedstat_inc(sd, lb_cnt[idle]);
2622 2634
2623 redo: 2635 redo:
2624 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 2636 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2625 &cpus, balance); 2637 &cpus, balance);
2626 2638
2627 if (*balance == 0) 2639 if (*balance == 0)
2628 goto out_balanced; 2640 goto out_balanced;
2629 2641
2630 if (!group) { 2642 if (!group) {
2631 schedstat_inc(sd, lb_nobusyg[idle]); 2643 schedstat_inc(sd, lb_nobusyg[idle]);
2632 goto out_balanced; 2644 goto out_balanced;
2633 } 2645 }
2634 2646
2635 busiest = find_busiest_queue(group, idle, imbalance, &cpus); 2647 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
2636 if (!busiest) { 2648 if (!busiest) {
2637 schedstat_inc(sd, lb_nobusyq[idle]); 2649 schedstat_inc(sd, lb_nobusyq[idle]);
2638 goto out_balanced; 2650 goto out_balanced;
2639 } 2651 }
2640 2652
2641 BUG_ON(busiest == this_rq); 2653 BUG_ON(busiest == this_rq);
2642 2654
2643 schedstat_add(sd, lb_imbalance[idle], imbalance); 2655 schedstat_add(sd, lb_imbalance[idle], imbalance);
2644 2656
2645 nr_moved = 0; 2657 ld_moved = 0;
2646 if (busiest->nr_running > 1) { 2658 if (busiest->nr_running > 1) {
2647 /* 2659 /*
2648 * Attempt to move tasks. If find_busiest_group has found 2660 * Attempt to move tasks. If find_busiest_group has found
2649 * an imbalance but busiest->nr_running <= 1, the group is 2661 * an imbalance but busiest->nr_running <= 1, the group is
2650 * still unbalanced. nr_moved simply stays zero, so it is 2662 * still unbalanced. ld_moved simply stays zero, so it is
2651 * correctly treated as an imbalance. 2663 * correctly treated as an imbalance.
2652 */ 2664 */
2653 local_irq_save(flags); 2665 local_irq_save(flags);
2654 double_rq_lock(this_rq, busiest); 2666 double_rq_lock(this_rq, busiest);
2655 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2667 ld_moved = move_tasks(this_rq, this_cpu, busiest,
2656 minus_1_or_zero(busiest->nr_running),
2657 imbalance, sd, idle, &all_pinned); 2668 imbalance, sd, idle, &all_pinned);
2658 double_rq_unlock(this_rq, busiest); 2669 double_rq_unlock(this_rq, busiest);
2659 local_irq_restore(flags); 2670 local_irq_restore(flags);
2660 2671
2661 /* 2672 /*
2662 * some other cpu did the load balance for us. 2673 * some other cpu did the load balance for us.
2663 */ 2674 */
2664 if (nr_moved && this_cpu != smp_processor_id()) 2675 if (ld_moved && this_cpu != smp_processor_id())
2665 resched_cpu(this_cpu); 2676 resched_cpu(this_cpu);
2666 2677
2667 /* All tasks on this runqueue were pinned by CPU affinity */ 2678 /* All tasks on this runqueue were pinned by CPU affinity */
2668 if (unlikely(all_pinned)) { 2679 if (unlikely(all_pinned)) {
2669 cpu_clear(cpu_of(busiest), cpus); 2680 cpu_clear(cpu_of(busiest), cpus);
2670 if (!cpus_empty(cpus)) 2681 if (!cpus_empty(cpus))
2671 goto redo; 2682 goto redo;
2672 goto out_balanced; 2683 goto out_balanced;
2673 } 2684 }
2674 } 2685 }
2675 2686
2676 if (!nr_moved) { 2687 if (!ld_moved) {
2677 schedstat_inc(sd, lb_failed[idle]); 2688 schedstat_inc(sd, lb_failed[idle]);
2678 sd->nr_balance_failed++; 2689 sd->nr_balance_failed++;
2679 2690
2680 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 2691 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2681 2692
2682 spin_lock_irqsave(&busiest->lock, flags); 2693 spin_lock_irqsave(&busiest->lock, flags);
2683 2694
2684 /* don't kick the migration_thread, if the curr 2695 /* don't kick the migration_thread, if the curr
2685 * task on busiest cpu can't be moved to this_cpu 2696 * task on busiest cpu can't be moved to this_cpu
2686 */ 2697 */
2687 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { 2698 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2688 spin_unlock_irqrestore(&busiest->lock, flags); 2699 spin_unlock_irqrestore(&busiest->lock, flags);
2689 all_pinned = 1; 2700 all_pinned = 1;
2690 goto out_one_pinned; 2701 goto out_one_pinned;
2691 } 2702 }
2692 2703
2693 if (!busiest->active_balance) { 2704 if (!busiest->active_balance) {
2694 busiest->active_balance = 1; 2705 busiest->active_balance = 1;
2695 busiest->push_cpu = this_cpu; 2706 busiest->push_cpu = this_cpu;
2696 active_balance = 1; 2707 active_balance = 1;
2697 } 2708 }
2698 spin_unlock_irqrestore(&busiest->lock, flags); 2709 spin_unlock_irqrestore(&busiest->lock, flags);
2699 if (active_balance) 2710 if (active_balance)
2700 wake_up_process(busiest->migration_thread); 2711 wake_up_process(busiest->migration_thread);
2701 2712
2702 /* 2713 /*
2703 * We've kicked active balancing, reset the failure 2714 * We've kicked active balancing, reset the failure
2704 * counter. 2715 * counter.
2705 */ 2716 */
2706 sd->nr_balance_failed = sd->cache_nice_tries+1; 2717 sd->nr_balance_failed = sd->cache_nice_tries+1;
2707 } 2718 }
2708 } else 2719 } else
2709 sd->nr_balance_failed = 0; 2720 sd->nr_balance_failed = 0;
2710 2721
2711 if (likely(!active_balance)) { 2722 if (likely(!active_balance)) {
2712 /* We were unbalanced, so reset the balancing interval */ 2723 /* We were unbalanced, so reset the balancing interval */
2713 sd->balance_interval = sd->min_interval; 2724 sd->balance_interval = sd->min_interval;
2714 } else { 2725 } else {
2715 /* 2726 /*
2716 * If we've begun active balancing, start to back off. This 2727 * If we've begun active balancing, start to back off. This
2717 * case may not be covered by the all_pinned logic if there 2728 * case may not be covered by the all_pinned logic if there
2718 * is only 1 task on the busy runqueue (because we don't call 2729 * is only 1 task on the busy runqueue (because we don't call
2719 * move_tasks). 2730 * move_tasks).
2720 */ 2731 */
2721 if (sd->balance_interval < sd->max_interval) 2732 if (sd->balance_interval < sd->max_interval)
2722 sd->balance_interval *= 2; 2733 sd->balance_interval *= 2;
2723 } 2734 }
2724 2735
2725 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2736 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2726 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2737 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2727 return -1; 2738 return -1;
2728 return nr_moved; 2739 return ld_moved;
2729 2740
2730 out_balanced: 2741 out_balanced:
2731 schedstat_inc(sd, lb_balanced[idle]); 2742 schedstat_inc(sd, lb_balanced[idle]);
2732 2743
2733 sd->nr_balance_failed = 0; 2744 sd->nr_balance_failed = 0;
2734 2745
2735 out_one_pinned: 2746 out_one_pinned:
2736 /* tune up the balancing interval */ 2747 /* tune up the balancing interval */
2737 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || 2748 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2738 (sd->balance_interval < sd->max_interval)) 2749 (sd->balance_interval < sd->max_interval))
2739 sd->balance_interval *= 2; 2750 sd->balance_interval *= 2;
2740 2751
2741 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2752 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2742 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2753 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2743 return -1; 2754 return -1;
2744 return 0; 2755 return 0;
2745 } 2756 }
2746 2757
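The interval handling at the end of load_balance() follows a simple backoff pattern: a pass that balanced without needing active balancing resets balance_interval to the domain's minimum, while a pass that had to kick the migration thread, or that found everything pinned, doubles it up to max_interval (or MAX_PINNED_INTERVAL for the pinned case). A deliberately simplified stand-alone sketch of the shape of that policy, with hypothetical names (the real conditions are a bit more involved, as the code above shows):

/* Simplified sketch of the balance_interval policy: poll quickly again
 * after an uneventful successful pass, back off exponentially (up to a
 * ceiling) when balancing keeps failing or everything is pinned. */
static unsigned long next_balance_interval(unsigned long interval,
                                           int back_off,
                                           unsigned long min_interval,
                                           unsigned long max_interval)
{
        if (!back_off)
                return min_interval;    /* balanced without heroics */
        if (interval < max_interval)
                interval *= 2;          /* pinned tasks / active balance */
        return interval;
}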
2747 /* 2758 /*
2748 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2759 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2749 * tasks if there is an imbalance. 2760 * tasks if there is an imbalance.
2750 * 2761 *
2751 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). 2762 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
2752 * this_rq is locked. 2763 * this_rq is locked.
2753 */ 2764 */
2754 static int 2765 static int
2755 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) 2766 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2756 { 2767 {
2757 struct sched_group *group; 2768 struct sched_group *group;
2758 struct rq *busiest = NULL; 2769 struct rq *busiest = NULL;
2759 unsigned long imbalance; 2770 unsigned long imbalance;
2760 int nr_moved = 0; 2771 int ld_moved = 0;
2761 int sd_idle = 0; 2772 int sd_idle = 0;
2762 int all_pinned = 0; 2773 int all_pinned = 0;
2763 cpumask_t cpus = CPU_MASK_ALL; 2774 cpumask_t cpus = CPU_MASK_ALL;
2764 2775
2765 /* 2776 /*
2766 * When power savings policy is enabled for the parent domain, idle 2777 * When power savings policy is enabled for the parent domain, idle
2767 * sibling can pick up load irrespective of busy siblings. In this case, 2778 * sibling can pick up load irrespective of busy siblings. In this case,
2768 * let the state of idle sibling percolate up as IDLE, instead of 2779 * let the state of idle sibling percolate up as IDLE, instead of
2769 * portraying it as CPU_NOT_IDLE. 2780 * portraying it as CPU_NOT_IDLE.
2770 */ 2781 */
2771 if (sd->flags & SD_SHARE_CPUPOWER && 2782 if (sd->flags & SD_SHARE_CPUPOWER &&
2772 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2783 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2773 sd_idle = 1; 2784 sd_idle = 1;
2774 2785
2775 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]); 2786 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
2776 redo: 2787 redo:
2777 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, 2788 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
2778 &sd_idle, &cpus, NULL); 2789 &sd_idle, &cpus, NULL);
2779 if (!group) { 2790 if (!group) {
2780 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); 2791 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
2781 goto out_balanced; 2792 goto out_balanced;
2782 } 2793 }
2783 2794
2784 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, 2795 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
2785 &cpus); 2796 &cpus);
2786 if (!busiest) { 2797 if (!busiest) {
2787 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); 2798 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
2788 goto out_balanced; 2799 goto out_balanced;
2789 } 2800 }
2790 2801
2791 BUG_ON(busiest == this_rq); 2802 BUG_ON(busiest == this_rq);
2792 2803
2793 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); 2804 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
2794 2805
2795 nr_moved = 0; 2806 ld_moved = 0;
2796 if (busiest->nr_running > 1) { 2807 if (busiest->nr_running > 1) {
2797 /* Attempt to move tasks */ 2808 /* Attempt to move tasks */
2798 double_lock_balance(this_rq, busiest); 2809 double_lock_balance(this_rq, busiest);
2799 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2810 ld_moved = move_tasks(this_rq, this_cpu, busiest,
2800 minus_1_or_zero(busiest->nr_running),
2801 imbalance, sd, CPU_NEWLY_IDLE, 2811 imbalance, sd, CPU_NEWLY_IDLE,
2802 &all_pinned); 2812 &all_pinned);
2803 spin_unlock(&busiest->lock); 2813 spin_unlock(&busiest->lock);
2804 2814
2805 if (unlikely(all_pinned)) { 2815 if (unlikely(all_pinned)) {
2806 cpu_clear(cpu_of(busiest), cpus); 2816 cpu_clear(cpu_of(busiest), cpus);
2807 if (!cpus_empty(cpus)) 2817 if (!cpus_empty(cpus))
2808 goto redo; 2818 goto redo;
2809 } 2819 }
2810 } 2820 }
2811 2821
2812 if (!nr_moved) { 2822 if (!ld_moved) {
2813 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); 2823 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
2814 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2824 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2815 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2825 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2816 return -1; 2826 return -1;
2817 } else 2827 } else
2818 sd->nr_balance_failed = 0; 2828 sd->nr_balance_failed = 0;
2819 2829
2820 return nr_moved; 2830 return ld_moved;
2821 2831
2822 out_balanced: 2832 out_balanced:
2823 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); 2833 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
2824 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2834 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2825 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2835 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2826 return -1; 2836 return -1;
2827 sd->nr_balance_failed = 0; 2837 sd->nr_balance_failed = 0;
2828 2838
2829 return 0; 2839 return 0;
2830 } 2840 }
2831 2841
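load_balance_newidle() runs with this_rq already locked, so it uses double_lock_balance() rather than double_rq_lock(); both helpers avoid an ABBA deadlock by imposing an order on the two runqueue locks (dropping and re-taking one if needed). The ordering idea on its own, sketched in user space with POSIX mutexes and an address-based rule (names invented here):

#include <pthread.h>
#include <stdint.h>

/* Take two locks in a globally consistent (address) order so that two
 * threads locking the same pair can never deadlock against each other. */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a == b) {
                pthread_mutex_lock(a);
                return;
        }
        if ((uintptr_t)a < (uintptr_t)b) {
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        if (a != b)
                pthread_mutex_unlock(b);
}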
2832 /* 2842 /*
2833 * idle_balance is called by schedule() if this_cpu is about to become 2843 * idle_balance is called by schedule() if this_cpu is about to become
2834 * idle. Attempts to pull tasks from other CPUs. 2844 * idle. Attempts to pull tasks from other CPUs.
2835 */ 2845 */
2836 static void idle_balance(int this_cpu, struct rq *this_rq) 2846 static void idle_balance(int this_cpu, struct rq *this_rq)
2837 { 2847 {
2838 struct sched_domain *sd; 2848 struct sched_domain *sd;
2839 int pulled_task = -1; 2849 int pulled_task = -1;
2840 unsigned long next_balance = jiffies + HZ; 2850 unsigned long next_balance = jiffies + HZ;
2841 2851
2842 for_each_domain(this_cpu, sd) { 2852 for_each_domain(this_cpu, sd) {
2843 unsigned long interval; 2853 unsigned long interval;
2844 2854
2845 if (!(sd->flags & SD_LOAD_BALANCE)) 2855 if (!(sd->flags & SD_LOAD_BALANCE))
2846 continue; 2856 continue;
2847 2857
2848 if (sd->flags & SD_BALANCE_NEWIDLE) 2858 if (sd->flags & SD_BALANCE_NEWIDLE)
2849 /* If we've pulled tasks over stop searching: */ 2859 /* If we've pulled tasks over stop searching: */
2850 pulled_task = load_balance_newidle(this_cpu, 2860 pulled_task = load_balance_newidle(this_cpu,
2851 this_rq, sd); 2861 this_rq, sd);
2852 2862
2853 interval = msecs_to_jiffies(sd->balance_interval); 2863 interval = msecs_to_jiffies(sd->balance_interval);
2854 if (time_after(next_balance, sd->last_balance + interval)) 2864 if (time_after(next_balance, sd->last_balance + interval))
2855 next_balance = sd->last_balance + interval; 2865 next_balance = sd->last_balance + interval;
2856 if (pulled_task) 2866 if (pulled_task)
2857 break; 2867 break;
2858 } 2868 }
2859 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 2869 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
2860 /* 2870 /*
2861 * We are going idle. next_balance may be set based on 2871 * We are going idle. next_balance may be set based on
2862 * a busy processor. So reset next_balance. 2872 * a busy processor. So reset next_balance.
2863 */ 2873 */
2864 this_rq->next_balance = next_balance; 2874 this_rq->next_balance = next_balance;
2865 } 2875 }
2866 } 2876 }
2867 2877
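idle_balance() also tracks the earliest moment any domain will want balancing again, comparing jiffies values with time_after(). That comparison is wrap-safe because it goes through a signed subtraction rather than a plain '<'. A stand-alone version of the trick for a free-running 32-bit counter (sketch, not the kernel macro):

#include <stdint.h>

/* Wrap-safe "a is later than b" for a free-running counter: the signed
 * difference stays correct as long as the two values are less than half
 * the counter range apart. */
static int tick_after(uint32_t a, uint32_t b)
{
        return (int32_t)(b - a) < 0;
}

/* picking the earlier of two future deadlines, as done for next_balance */
static uint32_t earliest(uint32_t x, uint32_t y)
{
        return tick_after(x, y) ? y : x;
}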
2868 /* 2878 /*
2869 * active_load_balance is run by migration threads. It pushes running tasks 2879 * active_load_balance is run by migration threads. It pushes running tasks
2870 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be 2880 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2871 * running on each physical CPU where possible, and avoids physical / 2881 * running on each physical CPU where possible, and avoids physical /
2872 * logical imbalances. 2882 * logical imbalances.
2873 * 2883 *
2874 * Called with busiest_rq locked. 2884 * Called with busiest_rq locked.
2875 */ 2885 */
2876 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) 2886 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2877 { 2887 {
2878 int target_cpu = busiest_rq->push_cpu; 2888 int target_cpu = busiest_rq->push_cpu;
2879 struct sched_domain *sd; 2889 struct sched_domain *sd;
2880 struct rq *target_rq; 2890 struct rq *target_rq;
2881 2891
2882 /* Is there any task to move? */ 2892 /* Is there any task to move? */
2883 if (busiest_rq->nr_running <= 1) 2893 if (busiest_rq->nr_running <= 1)
2884 return; 2894 return;
2885 2895
2886 target_rq = cpu_rq(target_cpu); 2896 target_rq = cpu_rq(target_cpu);
2887 2897
2888 /* 2898 /*
2889 * This condition is "impossible", if it occurs 2899 * This condition is "impossible", if it occurs
2890 * we need to fix it. Originally reported by 2900 * we need to fix it. Originally reported by
2891 * Bjorn Helgaas on a 128-cpu setup. 2901 * Bjorn Helgaas on a 128-cpu setup.
2892 */ 2902 */
2893 BUG_ON(busiest_rq == target_rq); 2903 BUG_ON(busiest_rq == target_rq);
2894 2904
2895 /* move a task from busiest_rq to target_rq */ 2905 /* move a task from busiest_rq to target_rq */
2896 double_lock_balance(busiest_rq, target_rq); 2906 double_lock_balance(busiest_rq, target_rq);
2897 2907
2898 /* Search for an sd spanning us and the target CPU. */ 2908 /* Search for an sd spanning us and the target CPU. */
2899 for_each_domain(target_cpu, sd) { 2909 for_each_domain(target_cpu, sd) {
2900 if ((sd->flags & SD_LOAD_BALANCE) && 2910 if ((sd->flags & SD_LOAD_BALANCE) &&
2901 cpu_isset(busiest_cpu, sd->span)) 2911 cpu_isset(busiest_cpu, sd->span))
2902 break; 2912 break;
2903 } 2913 }
2904 2914
2905 if (likely(sd)) { 2915 if (likely(sd)) {
2906 schedstat_inc(sd, alb_cnt); 2916 schedstat_inc(sd, alb_cnt);
2907 2917
2908 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, 2918 if (move_one_task(target_rq, target_cpu, busiest_rq,
2909 ULONG_MAX, sd, CPU_IDLE, NULL)) 2919 sd, CPU_IDLE))
2910 schedstat_inc(sd, alb_pushed); 2920 schedstat_inc(sd, alb_pushed);
2911 else 2921 else
2912 schedstat_inc(sd, alb_failed); 2922 schedstat_inc(sd, alb_failed);
2913 } 2923 }
2914 spin_unlock(&target_rq->lock); 2924 spin_unlock(&target_rq->lock);
2915 } 2925 }
2916 2926
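active_load_balance() first has to find a scheduling domain that both allows load balancing and spans the busiest CPU, walking the domain hierarchy from the smallest level outwards. The shape of that search, sketched with a flat array and a plain bitmask standing in for the domain list and cpumask (all names here are invented):

#include <stddef.h>
#include <stdint.h>

#define DOM_LOAD_BALANCE 0x1u   /* stand-in for the SD_LOAD_BALANCE flag */

struct dom {                    /* flattened stand-in for a sched domain */
        unsigned int flags;
        uint64_t span;          /* CPUs covered by this level, as a bitmask */
};

/* Return the first (lowest) level that may balance and contains 'cpu',
 * or NULL if no such level exists. */
static const struct dom *find_spanning_dom(const struct dom *levels, size_t n,
                                           int cpu)
{
        size_t i;

        for (i = 0; i < n; i++)
                if ((levels[i].flags & DOM_LOAD_BALANCE) &&
                    (levels[i].span & (1ULL << cpu)))
                        return &levels[i];
        return NULL;
}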
2917 #ifdef CONFIG_NO_HZ 2927 #ifdef CONFIG_NO_HZ
2918 static struct { 2928 static struct {
2919 atomic_t load_balancer; 2929 atomic_t load_balancer;
2920 cpumask_t cpu_mask; 2930 cpumask_t cpu_mask;
2921 } nohz ____cacheline_aligned = { 2931 } nohz ____cacheline_aligned = {
2922 .load_balancer = ATOMIC_INIT(-1), 2932 .load_balancer = ATOMIC_INIT(-1),
2923 .cpu_mask = CPU_MASK_NONE, 2933 .cpu_mask = CPU_MASK_NONE,
2924 }; 2934 };
2925 2935
2926 /* 2936 /*
2927 * This routine will try to nominate the ilb (idle load balancing) 2937 * This routine will try to nominate the ilb (idle load balancing)
2928 * owner among the cpus whose ticks are stopped. ilb owner will do the idle 2938 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
2929 * load balancing on behalf of all those cpus. If all the cpus in the system 2939 * load balancing on behalf of all those cpus. If all the cpus in the system
2930 * go into this tickless mode, then there will be no ilb owner (as there is 2940 * go into this tickless mode, then there will be no ilb owner (as there is
2931 * no need for one) and all the cpus will sleep till the next wakeup event 2941 * no need for one) and all the cpus will sleep till the next wakeup event
2932 * arrives... 2942 * arrives...
2933 * 2943 *
2934 * For the ilb owner, tick is not stopped. And this tick will be used 2944 * For the ilb owner, tick is not stopped. And this tick will be used
2935 * for idle load balancing. ilb owner will still be part of 2945 * for idle load balancing. ilb owner will still be part of
2936 * nohz.cpu_mask. 2946 * nohz.cpu_mask.
2937 * 2947 *
2938 * While stopping the tick, this cpu will become the ilb owner if there 2948 * While stopping the tick, this cpu will become the ilb owner if there
2939 * is no other owner. And will be the owner till that cpu becomes busy 2949 * is no other owner. And will be the owner till that cpu becomes busy
2940 * or if all cpus in the system stop their ticks at which point 2950 * or if all cpus in the system stop their ticks at which point
2941 * there is no need for ilb owner. 2951 * there is no need for ilb owner.
2942 * 2952 *
2943 * When the ilb owner becomes busy, it nominates another owner, during the 2953 * When the ilb owner becomes busy, it nominates another owner, during the
2944 * next busy scheduler_tick() 2954 * next busy scheduler_tick()
2945 */ 2955 */
2946 int select_nohz_load_balancer(int stop_tick) 2956 int select_nohz_load_balancer(int stop_tick)
2947 { 2957 {
2948 int cpu = smp_processor_id(); 2958 int cpu = smp_processor_id();
2949 2959
2950 if (stop_tick) { 2960 if (stop_tick) {
2951 cpu_set(cpu, nohz.cpu_mask); 2961 cpu_set(cpu, nohz.cpu_mask);
2952 cpu_rq(cpu)->in_nohz_recently = 1; 2962 cpu_rq(cpu)->in_nohz_recently = 1;
2953 2963
2954 /* 2964 /*
2955 * If we are going offline and still the leader, give up! 2965 * If we are going offline and still the leader, give up!
2956 */ 2966 */
2957 if (cpu_is_offline(cpu) && 2967 if (cpu_is_offline(cpu) &&
2958 atomic_read(&nohz.load_balancer) == cpu) { 2968 atomic_read(&nohz.load_balancer) == cpu) {
2959 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 2969 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2960 BUG(); 2970 BUG();
2961 return 0; 2971 return 0;
2962 } 2972 }
2963 2973
2964 /* time for ilb owner also to sleep */ 2974 /* time for ilb owner also to sleep */
2965 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { 2975 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
2966 if (atomic_read(&nohz.load_balancer) == cpu) 2976 if (atomic_read(&nohz.load_balancer) == cpu)
2967 atomic_set(&nohz.load_balancer, -1); 2977 atomic_set(&nohz.load_balancer, -1);
2968 return 0; 2978 return 0;
2969 } 2979 }
2970 2980
2971 if (atomic_read(&nohz.load_balancer) == -1) { 2981 if (atomic_read(&nohz.load_balancer) == -1) {
2972 /* make me the ilb owner */ 2982 /* make me the ilb owner */
2973 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) 2983 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
2974 return 1; 2984 return 1;
2975 } else if (atomic_read(&nohz.load_balancer) == cpu) 2985 } else if (atomic_read(&nohz.load_balancer) == cpu)
2976 return 1; 2986 return 1;
2977 } else { 2987 } else {
2978 if (!cpu_isset(cpu, nohz.cpu_mask)) 2988 if (!cpu_isset(cpu, nohz.cpu_mask))
2979 return 0; 2989 return 0;
2980 2990
2981 cpu_clear(cpu, nohz.cpu_mask); 2991 cpu_clear(cpu, nohz.cpu_mask);
2982 2992
2983 if (atomic_read(&nohz.load_balancer) == cpu) 2993 if (atomic_read(&nohz.load_balancer) == cpu)
2984 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 2994 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2985 BUG(); 2995 BUG();
2986 } 2996 }
2987 return 0; 2997 return 0;
2988 } 2998 }
2989 #endif 2999 #endif
2990 3000
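The ilb owner bookkeeping above relies on atomic compare-and-exchange so that exactly one CPU can claim or give up the role even when several CPUs race to do so. The core claim/release pattern, sketched with C11 atomics in user space (the names are invented; the code above uses atomic_cmpxchg() on nohz.load_balancer):

#include <stdatomic.h>

static atomic_int ilb_owner = ATOMIC_VAR_INIT(-1);      /* -1: nobody owns it */

/* Claim ownership only if the slot is currently free; returns nonzero
 * on success.  Losing the race leaves the existing owner untouched. */
static int ilb_try_claim(int cpu)
{
        int expected = -1;
        return atomic_compare_exchange_strong(&ilb_owner, &expected, cpu);
}

/* Give the role up only if we actually hold it. */
static void ilb_release(int cpu)
{
        int expected = cpu;
        atomic_compare_exchange_strong(&ilb_owner, &expected, -1);
}

In the kernel code a failed release is treated as a bug (the BUG() calls above), since a CPU should never try to drop a role it does not hold.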
2991 static DEFINE_SPINLOCK(balancing); 3001 static DEFINE_SPINLOCK(balancing);
2992 3002
2993 /* 3003 /*
2994 * It checks each scheduling domain to see if it is due to be balanced, 3004 * It checks each scheduling domain to see if it is due to be balanced,
2995 * and initiates a balancing operation if so. 3005 * and initiates a balancing operation if so.
2996 * 3006 *
2997 * Balancing parameters are set up in arch_init_sched_domains. 3007 * Balancing parameters are set up in arch_init_sched_domains.
2998 */ 3008 */
2999 static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) 3009 static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
3000 { 3010 {
3001 int balance = 1; 3011 int balance = 1;
3002 struct rq *rq = cpu_rq(cpu); 3012 struct rq *rq = cpu_rq(cpu);
3003 unsigned long interval; 3013 unsigned long interval;
3004 struct sched_domain *sd; 3014 struct sched_domain *sd;
3005 /* Earliest time when we have to do rebalance again */ 3015 /* Earliest time when we have to do rebalance again */
3006 unsigned long next_balance = jiffies + 60*HZ; 3016 unsigned long next_balance = jiffies + 60*HZ;
3007 3017
3008 for_each_domain(cpu, sd) { 3018 for_each_domain(cpu, sd) {
3009 if (!(sd->flags & SD_LOAD_BALANCE)) 3019 if (!(sd->flags & SD_LOAD_BALANCE))
3010 continue; 3020 continue;
3011 3021
3012 interval = sd->balance_interval; 3022 interval = sd->balance_interval;
3013 if (idle != CPU_IDLE) 3023 if (idle != CPU_IDLE)
3014 interval *= sd->busy_factor; 3024 interval *= sd->busy_factor;
3015 3025
3016 /* scale ms to jiffies */ 3026 /* scale ms to jiffies */
3017 interval = msecs_to_jiffies(interval); 3027 interval = msecs_to_jiffies(interval);
3018 if (unlikely(!interval)) 3028 if (unlikely(!interval))
3019 interval = 1; 3029 interval = 1;
3020 if (interval > HZ*NR_CPUS/10) 3030 if (interval > HZ*NR_CPUS/10)
3021 interval = HZ*NR_CPUS/10; 3031 interval = HZ*NR_CPUS/10;
3022 3032
3023 3033
3024 if (sd->flags & SD_SERIALIZE) { 3034 if (sd->flags & SD_SERIALIZE) {
3025 if (!spin_trylock(&balancing)) 3035 if (!spin_trylock(&balancing))
3026 goto out; 3036 goto out;
3027 } 3037 }
3028 3038
3029 if (time_after_eq(jiffies, sd->last_balance + interval)) { 3039 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3030 if (load_balance(cpu, rq, sd, idle, &balance)) { 3040 if (load_balance(cpu, rq, sd, idle, &balance)) {
3031 /* 3041 /*
3032 * We've pulled tasks over so either we're no 3042 * We've pulled tasks over so either we're no
3033 * longer idle, or one of our SMT siblings is 3043 * longer idle, or one of our SMT siblings is
3034 * not idle. 3044 * not idle.
3035 */ 3045 */
3036 idle = CPU_NOT_IDLE; 3046 idle = CPU_NOT_IDLE;
3037 } 3047 }
3038 sd->last_balance = jiffies; 3048 sd->last_balance = jiffies;
3039 } 3049 }
3040 if (sd->flags & SD_SERIALIZE) 3050 if (sd->flags & SD_SERIALIZE)
3041 spin_unlock(&balancing); 3051 spin_unlock(&balancing);
3042 out: 3052 out:
3043 if (time_after(next_balance, sd->last_balance + interval)) 3053 if (time_after(next_balance, sd->last_balance + interval))
3044 next_balance = sd->last_balance + interval; 3054 next_balance = sd->last_balance + interval;
3045 3055
3046 /* 3056 /*
3047 * Stop the load balance at this level. There is another 3057 * Stop the load balance at this level. There is another
3048 * CPU in our sched group which is doing load balancing more 3058 * CPU in our sched group which is doing load balancing more
3049 * actively. 3059 * actively.
3050 */ 3060 */
3051 if (!balance) 3061 if (!balance)
3052 break; 3062 break;
3053 } 3063 }
3054 rq->next_balance = next_balance; 3064 rq->next_balance = next_balance;
3055 } 3065 }
3056 3066
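For domains flagged SD_SERIALIZE, rebalance_domains() lets only one CPU balance at a time and makes everyone else skip the pass rather than wait on the lock. The same try-lock-or-skip pattern in a user-space sketch (hypothetical names, a POSIX mutex instead of the 'balancing' spinlock):

#include <pthread.h>

static pthread_mutex_t serialize_lock = PTHREAD_MUTEX_INITIALIZER;

/* Run the callback only if nobody else is currently in it; never block.
 * Returns 1 if the work was done, 0 if it was skipped. */
static int run_serialized(void (*fn)(void *), void *arg)
{
        if (pthread_mutex_trylock(&serialize_lock) != 0)
                return 0;
        fn(arg);
        pthread_mutex_unlock(&serialize_lock);
        return 1;
}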
3057 /* 3067 /*
3058 * run_rebalance_domains is triggered when needed from the scheduler tick. 3068 * run_rebalance_domains is triggered when needed from the scheduler tick.
3059 * In CONFIG_NO_HZ case, the idle load balance owner will do the 3069 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3060 * rebalancing for all the cpus for whom scheduler ticks are stopped. 3070 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3061 */ 3071 */
3062 static void run_rebalance_domains(struct softirq_action *h) 3072 static void run_rebalance_domains(struct softirq_action *h)
3063 { 3073 {
3064 int this_cpu = smp_processor_id(); 3074 int this_cpu = smp_processor_id();
3065 struct rq *this_rq = cpu_rq(this_cpu); 3075 struct rq *this_rq = cpu_rq(this_cpu);
3066 enum cpu_idle_type idle = this_rq->idle_at_tick ? 3076 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3067 CPU_IDLE : CPU_NOT_IDLE; 3077 CPU_IDLE : CPU_NOT_IDLE;
3068 3078
3069 rebalance_domains(this_cpu, idle); 3079 rebalance_domains(this_cpu, idle);
3070 3080
3071 #ifdef CONFIG_NO_HZ 3081 #ifdef CONFIG_NO_HZ
3072 /* 3082 /*
3073 * If this cpu is the owner for idle load balancing, then do the 3083 * If this cpu is the owner for idle load balancing, then do the
3074 * balancing on behalf of the other idle cpus whose ticks are 3084 * balancing on behalf of the other idle cpus whose ticks are
3075 * stopped. 3085 * stopped.
3076 */ 3086 */
3077 if (this_rq->idle_at_tick && 3087 if (this_rq->idle_at_tick &&
3078 atomic_read(&nohz.load_balancer) == this_cpu) { 3088 atomic_read(&nohz.load_balancer) == this_cpu) {
3079 cpumask_t cpus = nohz.cpu_mask; 3089 cpumask_t cpus = nohz.cpu_mask;
3080 struct rq *rq; 3090 struct rq *rq;
3081 int balance_cpu; 3091 int balance_cpu;
3082 3092
3083 cpu_clear(this_cpu, cpus); 3093 cpu_clear(this_cpu, cpus);
3084 for_each_cpu_mask(balance_cpu, cpus) { 3094 for_each_cpu_mask(balance_cpu, cpus) {
3085 /* 3095 /*
3086 * If this cpu gets work to do, stop the load balancing 3096 * If this cpu gets work to do, stop the load balancing
3087 * work being done for other cpus. Next load 3097 * work being done for other cpus. Next load
3088 * balancing owner will pick it up. 3098 * balancing owner will pick it up.
3089 */ 3099 */
3090 if (need_resched()) 3100 if (need_resched())
3091 break; 3101 break;
3092 3102
3093 rebalance_domains(balance_cpu, CPU_IDLE); 3103 rebalance_domains(balance_cpu, CPU_IDLE);
3094 3104
3095 rq = cpu_rq(balance_cpu); 3105 rq = cpu_rq(balance_cpu);
3096 if (time_after(this_rq->next_balance, rq->next_balance)) 3106 if (time_after(this_rq->next_balance, rq->next_balance))
3097 this_rq->next_balance = rq->next_balance; 3107 this_rq->next_balance = rq->next_balance;
3098 } 3108 }
3099 } 3109 }
3100 #endif 3110 #endif
3101 } 3111 }
3102 3112
3103 /* 3113 /*
3104 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 3114 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3105 * 3115 *
3106 * In case of CONFIG_NO_HZ, this is the place where we nominate a new 3116 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3107 * idle load balancing owner or decide to stop the periodic load balancing, 3117 * idle load balancing owner or decide to stop the periodic load balancing,
3108 * if the whole system is idle. 3118 * if the whole system is idle.
3109 */ 3119 */
3110 static inline void trigger_load_balance(struct rq *rq, int cpu) 3120 static inline void trigger_load_balance(struct rq *rq, int cpu)
3111 { 3121 {
3112 #ifdef CONFIG_NO_HZ 3122 #ifdef CONFIG_NO_HZ
3113 /* 3123 /*
3114 * If we were in the nohz mode recently and busy at the current 3124 * If we were in the nohz mode recently and busy at the current
3115 * scheduler tick, then check if we need to nominate a new idle 3125 * scheduler tick, then check if we need to nominate a new idle
3116 * load balancer. 3126 * load balancer.
3117 */ 3127 */
3118 if (rq->in_nohz_recently && !rq->idle_at_tick) { 3128 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3119 rq->in_nohz_recently = 0; 3129 rq->in_nohz_recently = 0;
3120 3130
3121 if (atomic_read(&nohz.load_balancer) == cpu) { 3131 if (atomic_read(&nohz.load_balancer) == cpu) {
3122 cpu_clear(cpu, nohz.cpu_mask); 3132 cpu_clear(cpu, nohz.cpu_mask);
3123 atomic_set(&nohz.load_balancer, -1); 3133 atomic_set(&nohz.load_balancer, -1);
3124 } 3134 }
3125 3135
3126 if (atomic_read(&nohz.load_balancer) == -1) { 3136 if (atomic_read(&nohz.load_balancer) == -1) {
3127 /* 3137 /*
3128 * simple selection for now: Nominate the 3138 * simple selection for now: Nominate the
3129 * first cpu in the nohz list to be the next 3139 * first cpu in the nohz list to be the next
3130 * ilb owner. 3140 * ilb owner.
3131 * 3141 *
3132 * TBD: Traverse the sched domains and nominate 3142 * TBD: Traverse the sched domains and nominate
3133 * the nearest cpu in the nohz.cpu_mask. 3143 * the nearest cpu in the nohz.cpu_mask.
3134 */ 3144 */
3135 int ilb = first_cpu(nohz.cpu_mask); 3145 int ilb = first_cpu(nohz.cpu_mask);
3136 3146
3137 if (ilb != NR_CPUS) 3147 if (ilb != NR_CPUS)
3138 resched_cpu(ilb); 3148 resched_cpu(ilb);
3139 } 3149 }
3140 } 3150 }
3141 3151
3142 /* 3152 /*
3143 * If this cpu is idle and doing idle load balancing for all the 3153 * If this cpu is idle and doing idle load balancing for all the
3144 * cpus with ticks stopped, is it time for that to stop? 3154 * cpus with ticks stopped, is it time for that to stop?
3145 */ 3155 */
3146 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && 3156 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3147 cpus_weight(nohz.cpu_mask) == num_online_cpus()) { 3157 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3148 resched_cpu(cpu); 3158 resched_cpu(cpu);
3149 return; 3159 return;
3150 } 3160 }
3151 3161
3152 /* 3162 /*
3153 * If this cpu is idle and the idle load balancing is done by 3163 * If this cpu is idle and the idle load balancing is done by
3154 * someone else, then there is no need to raise the SCHED_SOFTIRQ 3164 * someone else, then there is no need to raise the SCHED_SOFTIRQ
3155 */ 3165 */
3156 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && 3166 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3157 cpu_isset(cpu, nohz.cpu_mask)) 3167 cpu_isset(cpu, nohz.cpu_mask))
3158 return; 3168 return;
3159 #endif 3169 #endif
3160 if (time_after_eq(jiffies, rq->next_balance)) 3170 if (time_after_eq(jiffies, rq->next_balance))
3161 raise_softirq(SCHED_SOFTIRQ); 3171 raise_softirq(SCHED_SOFTIRQ);
3162 } 3172 }
3163 3173
3164 #else /* CONFIG_SMP */ 3174 #else /* CONFIG_SMP */
3165 3175
3166 /* 3176 /*
3167 * on UP we do not need to balance between CPUs: 3177 * on UP we do not need to balance between CPUs:
3168 */ 3178 */
3169 static inline void idle_balance(int cpu, struct rq *rq) 3179 static inline void idle_balance(int cpu, struct rq *rq)
3170 { 3180 {
3171 } 3181 }
3172 3182
3173 /* Avoid "used but not defined" warning on UP */ 3183 /* Avoid "used but not defined" warning on UP */
3174 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3184 static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3175 unsigned long max_nr_move, unsigned long max_load_move, 3185 unsigned long max_nr_move, unsigned long max_load_move,
3176 struct sched_domain *sd, enum cpu_idle_type idle, 3186 struct sched_domain *sd, enum cpu_idle_type idle,
3177 int *all_pinned, unsigned long *load_moved, 3187 int *all_pinned, unsigned long *load_moved,
3178 int this_best_prio, int best_prio, int best_prio_seen, 3188 int this_best_prio, int best_prio, int best_prio_seen,
3179 struct rq_iterator *iterator) 3189 struct rq_iterator *iterator)
3180 { 3190 {
3181 *load_moved = 0; 3191 *load_moved = 0;
3182 3192
3183 return 0; 3193 return 0;
3184 } 3194 }
3185 3195
3186 #endif 3196 #endif
3187 3197
3188 DEFINE_PER_CPU(struct kernel_stat, kstat); 3198 DEFINE_PER_CPU(struct kernel_stat, kstat);
3189 3199
3190 EXPORT_PER_CPU_SYMBOL(kstat); 3200 EXPORT_PER_CPU_SYMBOL(kstat);
3191 3201
3192 /* 3202 /*
3193 * Return p->sum_exec_runtime plus any more ns on the sched_clock 3203 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3194 * that have not yet been banked in case the task is currently running. 3204 * that have not yet been banked in case the task is currently running.
3195 */ 3205 */
3196 unsigned long long task_sched_runtime(struct task_struct *p) 3206 unsigned long long task_sched_runtime(struct task_struct *p)
3197 { 3207 {
3198 unsigned long flags; 3208 unsigned long flags;
3199 u64 ns, delta_exec; 3209 u64 ns, delta_exec;
3200 struct rq *rq; 3210 struct rq *rq;
3201 3211
3202 rq = task_rq_lock(p, &flags); 3212 rq = task_rq_lock(p, &flags);
3203 ns = p->se.sum_exec_runtime; 3213 ns = p->se.sum_exec_runtime;
3204 if (rq->curr == p) { 3214 if (rq->curr == p) {
3205 delta_exec = rq_clock(rq) - p->se.exec_start; 3215 delta_exec = rq_clock(rq) - p->se.exec_start;
3206 if ((s64)delta_exec > 0) 3216 if ((s64)delta_exec > 0)
3207 ns += delta_exec; 3217 ns += delta_exec;
3208 } 3218 }
3209 task_rq_unlock(rq, &flags); 3219 task_rq_unlock(rq, &flags);
3210 3220
3211 return ns; 3221 return ns;
3212 } 3222 }
3213 3223
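task_sched_runtime() returns the banked execution time plus whatever the task has accumulated since it last banked, but only if it is running right now; a negative clock delta is ignored. The arithmetic in isolation, with invented argument names:

#include <stdint.h>

/* Banked runtime plus the not-yet-banked portion of the current slice. */
static uint64_t total_runtime_ns(uint64_t banked_ns, int is_running,
                                 uint64_t clock_now_ns, uint64_t exec_start_ns)
{
        if (is_running) {
                int64_t delta = (int64_t)(clock_now_ns - exec_start_ns);

                if (delta > 0)          /* guard against clock skew, as above */
                        banked_ns += (uint64_t)delta;
        }
        return banked_ns;
}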
3214 /* 3224 /*
3215 * Account user cpu time to a process. 3225 * Account user cpu time to a process.
3216 * @p: the process that the cpu time gets accounted to 3226 * @p: the process that the cpu time gets accounted to
3217 * @hardirq_offset: the offset to subtract from hardirq_count() 3227 * @hardirq_offset: the offset to subtract from hardirq_count()
3218 * @cputime: the cpu time spent in user space since the last update 3228 * @cputime: the cpu time spent in user space since the last update
3219 */ 3229 */
3220 void account_user_time(struct task_struct *p, cputime_t cputime) 3230 void account_user_time(struct task_struct *p, cputime_t cputime)
3221 { 3231 {
3222 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3232 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3223 cputime64_t tmp; 3233 cputime64_t tmp;
3224 3234
3225 p->utime = cputime_add(p->utime, cputime); 3235 p->utime = cputime_add(p->utime, cputime);
3226 3236
3227 /* Add user time to cpustat. */ 3237 /* Add user time to cpustat. */
3228 tmp = cputime_to_cputime64(cputime); 3238 tmp = cputime_to_cputime64(cputime);
3229 if (TASK_NICE(p) > 0) 3239 if (TASK_NICE(p) > 0)
3230 cpustat->nice = cputime64_add(cpustat->nice, tmp); 3240 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3231 else 3241 else
3232 cpustat->user = cputime64_add(cpustat->user, tmp); 3242 cpustat->user = cputime64_add(cpustat->user, tmp);
3233 } 3243 }
3234 3244
3235 /* 3245 /*
3236 * Account system cpu time to a process. 3246 * Account system cpu time to a process.
3237 * @p: the process that the cpu time gets accounted to 3247 * @p: the process that the cpu time gets accounted to
3238 * @hardirq_offset: the offset to subtract from hardirq_count() 3248 * @hardirq_offset: the offset to subtract from hardirq_count()
3239 * @cputime: the cpu time spent in kernel space since the last update 3249 * @cputime: the cpu time spent in kernel space since the last update
3240 */ 3250 */
3241 void account_system_time(struct task_struct *p, int hardirq_offset, 3251 void account_system_time(struct task_struct *p, int hardirq_offset,
3242 cputime_t cputime) 3252 cputime_t cputime)
3243 { 3253 {
3244 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3254 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3245 struct rq *rq = this_rq(); 3255 struct rq *rq = this_rq();
3246 cputime64_t tmp; 3256 cputime64_t tmp;
3247 3257
3248 p->stime = cputime_add(p->stime, cputime); 3258 p->stime = cputime_add(p->stime, cputime);
3249 3259
3250 /* Add system time to cpustat. */ 3260 /* Add system time to cpustat. */
3251 tmp = cputime_to_cputime64(cputime); 3261 tmp = cputime_to_cputime64(cputime);
3252 if (hardirq_count() - hardirq_offset) 3262 if (hardirq_count() - hardirq_offset)
3253 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3263 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3254 else if (softirq_count()) 3264 else if (softirq_count())
3255 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3265 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3256 else if (p != rq->idle) 3266 else if (p != rq->idle)
3257 cpustat->system = cputime64_add(cpustat->system, tmp); 3267 cpustat->system = cputime64_add(cpustat->system, tmp);
3258 else if (atomic_read(&rq->nr_iowait) > 0) 3268 else if (atomic_read(&rq->nr_iowait) > 0)
3259 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 3269 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3260 else 3270 else
3261 cpustat->idle = cputime64_add(cpustat->idle, tmp); 3271 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3262 /* Account for system time used */ 3272 /* Account for system time used */
3263 acct_update_integrals(p); 3273 acct_update_integrals(p);
3264 } 3274 }
3265 3275
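account_system_time() charges each tick to exactly one bucket, tested in a fixed priority order: hard IRQ first, then softirq, then ordinary system time, and only for the idle task does it fall through to iowait or idle. The decision tree on its own, as a small sketch with invented names:

/* Which cpustat bucket a tick of system time lands in, mirroring the
 * if/else cascade in account_system_time() above. */
enum tick_bucket {
        TICK_IRQ, TICK_SOFTIRQ, TICK_SYSTEM, TICK_IOWAIT, TICK_IDLE
};

static enum tick_bucket classify_system_tick(int in_hardirq, int in_softirq,
                                             int curr_is_idle_task,
                                             int nr_iowaiting)
{
        if (in_hardirq)
                return TICK_IRQ;
        if (in_softirq)
                return TICK_SOFTIRQ;
        if (!curr_is_idle_task)
                return TICK_SYSTEM;
        if (nr_iowaiting > 0)
                return TICK_IOWAIT;
        return TICK_IDLE;
}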
3266 /* 3276 /*
3267 * Account for involuntary wait time. 3277 * Account for involuntary wait time.
3268 * @p: the process from which the cpu time has been stolen 3278 * @p: the process from which the cpu time has been stolen
3269 * @steal: the cpu time spent in involuntary wait 3279 * @steal: the cpu time spent in involuntary wait
3270 */ 3280 */
3271 void account_steal_time(struct task_struct *p, cputime_t steal) 3281 void account_steal_time(struct task_struct *p, cputime_t steal)
3272 { 3282 {
3273 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3283 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3274 cputime64_t tmp = cputime_to_cputime64(steal); 3284 cputime64_t tmp = cputime_to_cputime64(steal);
3275 struct rq *rq = this_rq(); 3285 struct rq *rq = this_rq();
3276 3286
3277 if (p == rq->idle) { 3287 if (p == rq->idle) {
3278 p->stime = cputime_add(p->stime, steal); 3288 p->stime = cputime_add(p->stime, steal);
3279 if (atomic_read(&rq->nr_iowait) > 0) 3289 if (atomic_read(&rq->nr_iowait) > 0)
3280 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 3290 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3281 else 3291 else
3282 cpustat->idle = cputime64_add(cpustat->idle, tmp); 3292 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3283 } else 3293 } else
3284 cpustat->steal = cputime64_add(cpustat->steal, tmp); 3294 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3285 } 3295 }
3286 3296
3287 /* 3297 /*
3288 * This function gets called by the timer code, with HZ frequency. 3298 * This function gets called by the timer code, with HZ frequency.
3289 * We call it with interrupts disabled. 3299 * We call it with interrupts disabled.
3290 * 3300 *
3291 * It also gets called by the fork code, when changing the parent's 3301 * It also gets called by the fork code, when changing the parent's
3292 * timeslices. 3302 * timeslices.
3293 */ 3303 */
3294 void scheduler_tick(void) 3304 void scheduler_tick(void)
3295 { 3305 {
3296 int cpu = smp_processor_id(); 3306 int cpu = smp_processor_id();
3297 struct rq *rq = cpu_rq(cpu); 3307 struct rq *rq = cpu_rq(cpu);
3298 struct task_struct *curr = rq->curr; 3308 struct task_struct *curr = rq->curr;
3299 3309
3300 spin_lock(&rq->lock); 3310 spin_lock(&rq->lock);
3301 update_cpu_load(rq); 3311 update_cpu_load(rq);
3302 if (curr != rq->idle) /* FIXME: needed? */ 3312 if (curr != rq->idle) /* FIXME: needed? */
3303 curr->sched_class->task_tick(rq, curr); 3313 curr->sched_class->task_tick(rq, curr);
3304 spin_unlock(&rq->lock); 3314 spin_unlock(&rq->lock);
3305 3315
3306 #ifdef CONFIG_SMP 3316 #ifdef CONFIG_SMP
3307 rq->idle_at_tick = idle_cpu(cpu); 3317 rq->idle_at_tick = idle_cpu(cpu);
3308 trigger_load_balance(rq, cpu); 3318 trigger_load_balance(rq, cpu);
3309 #endif 3319 #endif
3310 } 3320 }
3311 3321
3312 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) 3322 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3313 3323
3314 void fastcall add_preempt_count(int val) 3324 void fastcall add_preempt_count(int val)
3315 { 3325 {
3316 /* 3326 /*
3317 * Underflow? 3327 * Underflow?
3318 */ 3328 */
3319 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 3329 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3320 return; 3330 return;
3321 preempt_count() += val; 3331 preempt_count() += val;
3322 /* 3332 /*
3323 * Spinlock count overflowing soon? 3333 * Spinlock count overflowing soon?
3324 */ 3334 */
3325 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 3335 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3326 PREEMPT_MASK - 10); 3336 PREEMPT_MASK - 10);
3327 } 3337 }
3328 EXPORT_SYMBOL(add_preempt_count); 3338 EXPORT_SYMBOL(add_preempt_count);
3329 3339
3330 void fastcall sub_preempt_count(int val) 3340 void fastcall sub_preempt_count(int val)
3331 { 3341 {
3332 /* 3342 /*
3333 * Underflow? 3343 * Underflow?
3334 */ 3344 */
3335 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 3345 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3336 return; 3346 return;
3337 /* 3347 /*
3338 * Is the spinlock portion underflowing? 3348 * Is the spinlock portion underflowing?
3339 */ 3349 */
3340 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 3350 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3341 !(preempt_count() & PREEMPT_MASK))) 3351 !(preempt_count() & PREEMPT_MASK)))
3342 return; 3352 return;
3343 3353
3344 preempt_count() -= val; 3354 preempt_count() -= val;
3345 } 3355 }
3346 EXPORT_SYMBOL(sub_preempt_count); 3356 EXPORT_SYMBOL(sub_preempt_count);
3347 3357
3348 #endif 3358 #endif
3349 3359
3350 /* 3360 /*
3351 * Print scheduling while atomic bug: 3361 * Print scheduling while atomic bug:
3352 */ 3362 */
3353 static noinline void __schedule_bug(struct task_struct *prev) 3363 static noinline void __schedule_bug(struct task_struct *prev)
3354 { 3364 {
3355 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n", 3365 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3356 prev->comm, preempt_count(), prev->pid); 3366 prev->comm, preempt_count(), prev->pid);
3357 debug_show_held_locks(prev); 3367 debug_show_held_locks(prev);
3358 if (irqs_disabled()) 3368 if (irqs_disabled())
3359 print_irqtrace_events(prev); 3369 print_irqtrace_events(prev);
3360 dump_stack(); 3370 dump_stack();
3361 } 3371 }
3362 3372
3363 /* 3373 /*
3364 * Various schedule()-time debugging checks and statistics: 3374 * Various schedule()-time debugging checks and statistics:
3365 */ 3375 */
3366 static inline void schedule_debug(struct task_struct *prev) 3376 static inline void schedule_debug(struct task_struct *prev)
3367 { 3377 {
3368 /* 3378 /*
3369 * Test if we are atomic. Since do_exit() needs to call into 3379 * Test if we are atomic. Since do_exit() needs to call into
3370 * schedule() atomically, we ignore that path for now. 3380 * schedule() atomically, we ignore that path for now.
3371 * Otherwise, whine if we are scheduling when we should not be. 3381 * Otherwise, whine if we are scheduling when we should not be.
3372 */ 3382 */
3373 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state)) 3383 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3374 __schedule_bug(prev); 3384 __schedule_bug(prev);
3375 3385
3376 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3386 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3377 3387
3378 schedstat_inc(this_rq(), sched_cnt); 3388 schedstat_inc(this_rq(), sched_cnt);
3379 } 3389 }
3380 3390
3381 /* 3391 /*
3382 * Pick up the highest-prio task: 3392 * Pick up the highest-prio task:
3383 */ 3393 */
3384 static inline struct task_struct * 3394 static inline struct task_struct *
3385 pick_next_task(struct rq *rq, struct task_struct *prev, u64 now) 3395 pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3386 { 3396 {
3387 struct sched_class *class; 3397 struct sched_class *class;
3388 struct task_struct *p; 3398 struct task_struct *p;
3389 3399
3390 /* 3400 /*
3391 * Optimization: we know that if all tasks are in 3401 * Optimization: we know that if all tasks are in
3392 * the fair class we can call that function directly: 3402 * the fair class we can call that function directly:
3393 */ 3403 */
3394 if (likely(rq->nr_running == rq->cfs.nr_running)) { 3404 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3395 p = fair_sched_class.pick_next_task(rq, now); 3405 p = fair_sched_class.pick_next_task(rq, now);
3396 if (likely(p)) 3406 if (likely(p))
3397 return p; 3407 return p;
3398 } 3408 }
3399 3409
3400 class = sched_class_highest; 3410 class = sched_class_highest;
3401 for ( ; ; ) { 3411 for ( ; ; ) {
3402 p = class->pick_next_task(rq, now); 3412 p = class->pick_next_task(rq, now);
3403 if (p) 3413 if (p)
3404 return p; 3414 return p;
3405 /* 3415 /*
3406 * Will never be NULL as the idle class always 3416 * Will never be NULL as the idle class always
3407 * returns a non-NULL p: 3417 * returns a non-NULL p:
3408 */ 3418 */
3409 class = class->next; 3419 class = class->next;
3410 } 3420 }
3411 } 3421 }
3412 3422
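pick_next_task() walks the scheduling classes from highest to lowest priority and returns the first task any of them offers; the walk always terminates because the idle class at the end of the chain never returns NULL. A stripped-down sketch of that walk (types and names invented for illustration):

#include <stddef.h>

struct task;                                    /* opaque here */

struct class_sketch {
        const struct class_sketch *next;        /* next lower-priority class */
        struct task *(*pick_next)(void *rq);
};

/* First class that has something runnable wins; with an "idle" class at
 * the tail this never returns NULL in practice. */
static struct task *pick_next(const struct class_sketch *highest, void *rq)
{
        const struct class_sketch *class;

        for (class = highest; class != NULL; class = class->next) {
                struct task *p = class->pick_next(rq);

                if (p)
                        return p;
        }
        return NULL;
}

The fast path above, which calls the fair class directly when rq->nr_running == rq->cfs.nr_running, simply skips this walk for the common case.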
3413 /* 3423 /*
3414 * schedule() is the main scheduler function. 3424 * schedule() is the main scheduler function.
3415 */ 3425 */
3416 asmlinkage void __sched schedule(void) 3426 asmlinkage void __sched schedule(void)
3417 { 3427 {
3418 struct task_struct *prev, *next; 3428 struct task_struct *prev, *next;
3419 long *switch_count; 3429 long *switch_count;
3420 struct rq *rq; 3430 struct rq *rq;
3421 u64 now; 3431 u64 now;
3422 int cpu; 3432 int cpu;
3423 3433
3424 need_resched: 3434 need_resched:
3425 preempt_disable(); 3435 preempt_disable();
3426 cpu = smp_processor_id(); 3436 cpu = smp_processor_id();
3427 rq = cpu_rq(cpu); 3437 rq = cpu_rq(cpu);
3428 rcu_qsctr_inc(cpu); 3438 rcu_qsctr_inc(cpu);
3429 prev = rq->curr; 3439 prev = rq->curr;
3430 switch_count = &prev->nivcsw; 3440 switch_count = &prev->nivcsw;
3431 3441
3432 release_kernel_lock(prev); 3442 release_kernel_lock(prev);
3433 need_resched_nonpreemptible: 3443 need_resched_nonpreemptible:
3434 3444
3435 schedule_debug(prev); 3445 schedule_debug(prev);
3436 3446
3437 spin_lock_irq(&rq->lock); 3447 spin_lock_irq(&rq->lock);
3438 clear_tsk_need_resched(prev); 3448 clear_tsk_need_resched(prev);
3439 3449
3440 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3450 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3441 if (unlikely((prev->state & TASK_INTERRUPTIBLE) && 3451 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3442 unlikely(signal_pending(prev)))) { 3452 unlikely(signal_pending(prev)))) {
3443 prev->state = TASK_RUNNING; 3453 prev->state = TASK_RUNNING;
3444 } else { 3454 } else {
3445 deactivate_task(rq, prev, 1); 3455 deactivate_task(rq, prev, 1);
3446 } 3456 }
3447 switch_count = &prev->nvcsw; 3457 switch_count = &prev->nvcsw;
3448 } 3458 }
3449 3459
3450 if (unlikely(!rq->nr_running)) 3460 if (unlikely(!rq->nr_running))
3451 idle_balance(cpu, rq); 3461 idle_balance(cpu, rq);
3452 3462
3453 now = __rq_clock(rq); 3463 now = __rq_clock(rq);
3454 prev->sched_class->put_prev_task(rq, prev, now); 3464 prev->sched_class->put_prev_task(rq, prev, now);
3455 next = pick_next_task(rq, prev, now); 3465 next = pick_next_task(rq, prev, now);
3456 3466
3457 sched_info_switch(prev, next); 3467 sched_info_switch(prev, next);
3458 3468
3459 if (likely(prev != next)) { 3469 if (likely(prev != next)) {
3460 rq->nr_switches++; 3470 rq->nr_switches++;
3461 rq->curr = next; 3471 rq->curr = next;
3462 ++*switch_count; 3472 ++*switch_count;
3463 3473
3464 context_switch(rq, prev, next); /* unlocks the rq */ 3474 context_switch(rq, prev, next); /* unlocks the rq */
3465 } else 3475 } else
3466 spin_unlock_irq(&rq->lock); 3476 spin_unlock_irq(&rq->lock);
3467 3477
3468 if (unlikely(reacquire_kernel_lock(current) < 0)) { 3478 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3469 cpu = smp_processor_id(); 3479 cpu = smp_processor_id();
3470 rq = cpu_rq(cpu); 3480 rq = cpu_rq(cpu);
3471 goto need_resched_nonpreemptible; 3481 goto need_resched_nonpreemptible;
3472 } 3482 }
3473 preempt_enable_no_resched(); 3483 preempt_enable_no_resched();
3474 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3484 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3475 goto need_resched; 3485 goto need_resched;
3476 } 3486 }
3477 EXPORT_SYMBOL(schedule); 3487 EXPORT_SYMBOL(schedule);
3478 3488
3479 #ifdef CONFIG_PREEMPT 3489 #ifdef CONFIG_PREEMPT
3480 /* 3490 /*
3481 * this is the entry point to schedule() from in-kernel preemption 3491 * this is the entry point to schedule() from in-kernel preemption
3482 * off of preempt_enable. Kernel preemptions off return from interrupt 3492 * off of preempt_enable. Kernel preemptions off return from interrupt
3483 * occur there and call schedule directly. 3493 * occur there and call schedule directly.
3484 */ 3494 */
3485 asmlinkage void __sched preempt_schedule(void) 3495 asmlinkage void __sched preempt_schedule(void)
3486 { 3496 {
3487 struct thread_info *ti = current_thread_info(); 3497 struct thread_info *ti = current_thread_info();
3488 #ifdef CONFIG_PREEMPT_BKL 3498 #ifdef CONFIG_PREEMPT_BKL
3489 struct task_struct *task = current; 3499 struct task_struct *task = current;
3490 int saved_lock_depth; 3500 int saved_lock_depth;
3491 #endif 3501 #endif
3492 /* 3502 /*
3493 * If there is a non-zero preempt_count or interrupts are disabled, 3503 * If there is a non-zero preempt_count or interrupts are disabled,
3494 * we do not want to preempt the current task. Just return.. 3504 * we do not want to preempt the current task. Just return..
3495 */ 3505 */
3496 if (likely(ti->preempt_count || irqs_disabled())) 3506 if (likely(ti->preempt_count || irqs_disabled()))
3497 return; 3507 return;
3498 3508
3499 need_resched: 3509 need_resched:
3500 add_preempt_count(PREEMPT_ACTIVE); 3510 add_preempt_count(PREEMPT_ACTIVE);
3501 /* 3511 /*
3502 * We keep the big kernel semaphore locked, but we 3512 * We keep the big kernel semaphore locked, but we
3503 * clear ->lock_depth so that schedule() doesn't 3513 * clear ->lock_depth so that schedule() doesn't
3504 * auto-release the semaphore: 3514 * auto-release the semaphore:
3505 */ 3515 */
3506 #ifdef CONFIG_PREEMPT_BKL 3516 #ifdef CONFIG_PREEMPT_BKL
3507 saved_lock_depth = task->lock_depth; 3517 saved_lock_depth = task->lock_depth;
3508 task->lock_depth = -1; 3518 task->lock_depth = -1;
3509 #endif 3519 #endif
3510 schedule(); 3520 schedule();
3511 #ifdef CONFIG_PREEMPT_BKL 3521 #ifdef CONFIG_PREEMPT_BKL
3512 task->lock_depth = saved_lock_depth; 3522 task->lock_depth = saved_lock_depth;
3513 #endif 3523 #endif
3514 sub_preempt_count(PREEMPT_ACTIVE); 3524 sub_preempt_count(PREEMPT_ACTIVE);
3515 3525
3516 /* we could miss a preemption opportunity between schedule and now */ 3526 /* we could miss a preemption opportunity between schedule and now */
3517 barrier(); 3527 barrier();
3518 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3528 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3519 goto need_resched; 3529 goto need_resched;
3520 } 3530 }
3521 EXPORT_SYMBOL(preempt_schedule); 3531 EXPORT_SYMBOL(preempt_schedule);
3522 3532
3523 /* 3533 /*
3524 * this is the entry point to schedule() from kernel preemption 3534 * this is the entry point to schedule() from kernel preemption
3525 * off of irq context. 3535 * off of irq context.
3526 * Note that this is called and returns with irqs disabled. This will 3536 * Note that this is called and returns with irqs disabled. This will
3527 * protect us against recursive calling from irq. 3537 * protect us against recursive calling from irq.
3528 */ 3538 */
3529 asmlinkage void __sched preempt_schedule_irq(void) 3539 asmlinkage void __sched preempt_schedule_irq(void)
3530 { 3540 {
3531 struct thread_info *ti = current_thread_info(); 3541 struct thread_info *ti = current_thread_info();
3532 #ifdef CONFIG_PREEMPT_BKL 3542 #ifdef CONFIG_PREEMPT_BKL
3533 struct task_struct *task = current; 3543 struct task_struct *task = current;
3534 int saved_lock_depth; 3544 int saved_lock_depth;
3535 #endif 3545 #endif
3536 /* Catch callers which need to be fixed */ 3546 /* Catch callers which need to be fixed */
3537 BUG_ON(ti->preempt_count || !irqs_disabled()); 3547 BUG_ON(ti->preempt_count || !irqs_disabled());
3538 3548
3539 need_resched: 3549 need_resched:
3540 add_preempt_count(PREEMPT_ACTIVE); 3550 add_preempt_count(PREEMPT_ACTIVE);
3541 /* 3551 /*
3542 * We keep the big kernel semaphore locked, but we 3552 * We keep the big kernel semaphore locked, but we
3543 * clear ->lock_depth so that schedule() doesn't 3553 * clear ->lock_depth so that schedule() doesn't
3544 * auto-release the semaphore: 3554 * auto-release the semaphore:
3545 */ 3555 */
3546 #ifdef CONFIG_PREEMPT_BKL 3556 #ifdef CONFIG_PREEMPT_BKL
3547 saved_lock_depth = task->lock_depth; 3557 saved_lock_depth = task->lock_depth;
3548 task->lock_depth = -1; 3558 task->lock_depth = -1;
3549 #endif 3559 #endif
3550 local_irq_enable(); 3560 local_irq_enable();
3551 schedule(); 3561 schedule();
3552 local_irq_disable(); 3562 local_irq_disable();
3553 #ifdef CONFIG_PREEMPT_BKL 3563 #ifdef CONFIG_PREEMPT_BKL
3554 task->lock_depth = saved_lock_depth; 3564 task->lock_depth = saved_lock_depth;
3555 #endif 3565 #endif
3556 sub_preempt_count(PREEMPT_ACTIVE); 3566 sub_preempt_count(PREEMPT_ACTIVE);
3557 3567
3558 /* we could miss a preemption opportunity between schedule and now */ 3568 /* we could miss a preemption opportunity between schedule and now */
3559 barrier(); 3569 barrier();
3560 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3570 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3561 goto need_resched; 3571 goto need_resched;
3562 } 3572 }
3563 3573
3564 #endif /* CONFIG_PREEMPT */ 3574 #endif /* CONFIG_PREEMPT */
3565 3575
3566 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, 3576 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3567 void *key) 3577 void *key)
3568 { 3578 {
3569 return try_to_wake_up(curr->private, mode, sync); 3579 return try_to_wake_up(curr->private, mode, sync);
3570 } 3580 }
3571 EXPORT_SYMBOL(default_wake_function); 3581 EXPORT_SYMBOL(default_wake_function);
3572 3582
3573 /* 3583 /*
3574 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 3584 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3575 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 3585 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3576 * number) then we wake all the non-exclusive tasks and one exclusive task. 3586 * number) then we wake all the non-exclusive tasks and one exclusive task.
3577 * 3587 *
3578 * There are circumstances in which we can try to wake a task which has already 3588 * There are circumstances in which we can try to wake a task which has already
3579 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 3589 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3580 * zero in this (rare) case, and we handle it by continuing to scan the queue. 3590 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3581 */ 3591 */
3582 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 3592 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3583 int nr_exclusive, int sync, void *key) 3593 int nr_exclusive, int sync, void *key)
3584 { 3594 {
3585 struct list_head *tmp, *next; 3595 struct list_head *tmp, *next;
3586 3596
3587 list_for_each_safe(tmp, next, &q->task_list) { 3597 list_for_each_safe(tmp, next, &q->task_list) {
3588 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); 3598 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3589 unsigned flags = curr->flags; 3599 unsigned flags = curr->flags;
3590 3600
3591 if (curr->func(curr, mode, sync, key) && 3601 if (curr->func(curr, mode, sync, key) &&
3592 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 3602 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3593 break; 3603 break;
3594 } 3604 }
3595 } 3605 }
3596 3606
3597 /** 3607 /**
3598 * __wake_up - wake up threads blocked on a waitqueue. 3608 * __wake_up - wake up threads blocked on a waitqueue.
3599 * @q: the waitqueue 3609 * @q: the waitqueue
3600 * @mode: which threads 3610 * @mode: which threads
3601 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3611 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3602 * @key: is directly passed to the wakeup function 3612 * @key: is directly passed to the wakeup function
3603 */ 3613 */
3604 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, 3614 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3605 int nr_exclusive, void *key) 3615 int nr_exclusive, void *key)
3606 { 3616 {
3607 unsigned long flags; 3617 unsigned long flags;
3608 3618
3609 spin_lock_irqsave(&q->lock, flags); 3619 spin_lock_irqsave(&q->lock, flags);
3610 __wake_up_common(q, mode, nr_exclusive, 0, key); 3620 __wake_up_common(q, mode, nr_exclusive, 0, key);
3611 spin_unlock_irqrestore(&q->lock, flags); 3621 spin_unlock_irqrestore(&q->lock, flags);
3612 } 3622 }
3613 EXPORT_SYMBOL(__wake_up); 3623 EXPORT_SYMBOL(__wake_up);
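As a hedged illustration of the exclusive-wakeup rule documented above (everything here is invented for the example and is not part of this patch): a waiter that registers with WQ_FLAG_EXCLUSIVE via prepare_to_wait_exclusive() is woken at most one-per-wake_up(), while non-exclusive waiters are all woken.

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static int demo_ready;

/* Sleep as an *exclusive* waiter: one wake_up() wakes only one of these. */
static void demo_wait(void)
{
	DEFINE_WAIT(wait);

	prepare_to_wait_exclusive(&demo_wq, &wait, TASK_INTERRUPTIBLE);
	if (!demo_ready)
		schedule();
	finish_wait(&demo_wq, &wait);
}

static void demo_post(void)
{
	demo_ready = 1;
	wake_up(&demo_wq);	/* ends up in __wake_up(q, ..., nr_exclusive = 1, NULL) */
}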
3614 3624
3615 /* 3625 /*
3616 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 3626 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3617 */ 3627 */
3618 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 3628 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3619 { 3629 {
3620 __wake_up_common(q, mode, 1, 0, NULL); 3630 __wake_up_common(q, mode, 1, 0, NULL);
3621 } 3631 }
3622 3632
3623 /** 3633 /**
3624 * __wake_up_sync - wake up threads blocked on a waitqueue. 3634 * __wake_up_sync - wake up threads blocked on a waitqueue.
3625 * @q: the waitqueue 3635 * @q: the waitqueue
3626 * @mode: which threads 3636 * @mode: which threads
3627 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3637 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3628 * 3638 *
3629 * The sync wakeup differs in that the waker knows that it will schedule 3639 * The sync wakeup differs in that the waker knows that it will schedule
3630 * away soon, so while the target thread will be woken up, it will not 3640 * away soon, so while the target thread will be woken up, it will not
3631 * be migrated to another CPU - ie. the two threads are 'synchronized' 3641 * be migrated to another CPU - ie. the two threads are 'synchronized'
3632 * with each other. This can prevent needless bouncing between CPUs. 3642 * with each other. This can prevent needless bouncing between CPUs.
3633 * 3643 *
3634 * On UP it can prevent extra preemption. 3644 * On UP it can prevent extra preemption.
3635 */ 3645 */
3636 void fastcall 3646 void fastcall
3637 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 3647 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3638 { 3648 {
3639 unsigned long flags; 3649 unsigned long flags;
3640 int sync = 1; 3650 int sync = 1;
3641 3651
3642 if (unlikely(!q)) 3652 if (unlikely(!q))
3643 return; 3653 return;
3644 3654
3645 if (unlikely(!nr_exclusive)) 3655 if (unlikely(!nr_exclusive))
3646 sync = 0; 3656 sync = 0;
3647 3657
3648 spin_lock_irqsave(&q->lock, flags); 3658 spin_lock_irqsave(&q->lock, flags);
3649 __wake_up_common(q, mode, nr_exclusive, sync, NULL); 3659 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3650 spin_unlock_irqrestore(&q->lock, flags); 3660 spin_unlock_irqrestore(&q->lock, flags);
3651 } 3661 }
3652 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 3662 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3653 3663
3654 void fastcall complete(struct completion *x) 3664 void fastcall complete(struct completion *x)
3655 { 3665 {
3656 unsigned long flags; 3666 unsigned long flags;
3657 3667
3658 spin_lock_irqsave(&x->wait.lock, flags); 3668 spin_lock_irqsave(&x->wait.lock, flags);
3659 x->done++; 3669 x->done++;
3660 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 3670 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3661 1, 0, NULL); 3671 1, 0, NULL);
3662 spin_unlock_irqrestore(&x->wait.lock, flags); 3672 spin_unlock_irqrestore(&x->wait.lock, flags);
3663 } 3673 }
3664 EXPORT_SYMBOL(complete); 3674 EXPORT_SYMBOL(complete);
3665 3675
3666 void fastcall complete_all(struct completion *x) 3676 void fastcall complete_all(struct completion *x)
3667 { 3677 {
3668 unsigned long flags; 3678 unsigned long flags;
3669 3679
3670 spin_lock_irqsave(&x->wait.lock, flags); 3680 spin_lock_irqsave(&x->wait.lock, flags);
3671 x->done += UINT_MAX/2; 3681 x->done += UINT_MAX/2;
3672 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 3682 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3673 0, 0, NULL); 3683 0, 0, NULL);
3674 spin_unlock_irqrestore(&x->wait.lock, flags); 3684 spin_unlock_irqrestore(&x->wait.lock, flags);
3675 } 3685 }
3676 EXPORT_SYMBOL(complete_all); 3686 EXPORT_SYMBOL(complete_all);
3677 3687
3678 void fastcall __sched wait_for_completion(struct completion *x) 3688 void fastcall __sched wait_for_completion(struct completion *x)
3679 { 3689 {
3680 might_sleep(); 3690 might_sleep();
3681 3691
3682 spin_lock_irq(&x->wait.lock); 3692 spin_lock_irq(&x->wait.lock);
3683 if (!x->done) { 3693 if (!x->done) {
3684 DECLARE_WAITQUEUE(wait, current); 3694 DECLARE_WAITQUEUE(wait, current);
3685 3695
3686 wait.flags |= WQ_FLAG_EXCLUSIVE; 3696 wait.flags |= WQ_FLAG_EXCLUSIVE;
3687 __add_wait_queue_tail(&x->wait, &wait); 3697 __add_wait_queue_tail(&x->wait, &wait);
3688 do { 3698 do {
3689 __set_current_state(TASK_UNINTERRUPTIBLE); 3699 __set_current_state(TASK_UNINTERRUPTIBLE);
3690 spin_unlock_irq(&x->wait.lock); 3700 spin_unlock_irq(&x->wait.lock);
3691 schedule(); 3701 schedule();
3692 spin_lock_irq(&x->wait.lock); 3702 spin_lock_irq(&x->wait.lock);
3693 } while (!x->done); 3703 } while (!x->done);
3694 __remove_wait_queue(&x->wait, &wait); 3704 __remove_wait_queue(&x->wait, &wait);
3695 } 3705 }
3696 x->done--; 3706 x->done--;
3697 spin_unlock_irq(&x->wait.lock); 3707 spin_unlock_irq(&x->wait.lock);
3698 } 3708 }
3699 EXPORT_SYMBOL(wait_for_completion); 3709 EXPORT_SYMBOL(wait_for_completion);
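A minimal completion handshake, as a sketch of how the complete()/wait_for_completion() pair above is normally used (illustrative names only):

#include <linux/completion.h>

static DECLARE_COMPLETION(demo_done);

static int demo_worker(void *unused)
{
	/* ... do the actual work ... */
	complete(&demo_done);		/* bumps ->done and wakes one exclusive waiter */
	return 0;
}

static void demo_wait_for_worker(void)
{
	/* Blocks in TASK_UNINTERRUPTIBLE until ->done becomes non-zero. */
	wait_for_completion(&demo_done);
}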
3700 3710
3701 unsigned long fastcall __sched 3711 unsigned long fastcall __sched
3702 wait_for_completion_timeout(struct completion *x, unsigned long timeout) 3712 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3703 { 3713 {
3704 might_sleep(); 3714 might_sleep();
3705 3715
3706 spin_lock_irq(&x->wait.lock); 3716 spin_lock_irq(&x->wait.lock);
3707 if (!x->done) { 3717 if (!x->done) {
3708 DECLARE_WAITQUEUE(wait, current); 3718 DECLARE_WAITQUEUE(wait, current);
3709 3719
3710 wait.flags |= WQ_FLAG_EXCLUSIVE; 3720 wait.flags |= WQ_FLAG_EXCLUSIVE;
3711 __add_wait_queue_tail(&x->wait, &wait); 3721 __add_wait_queue_tail(&x->wait, &wait);
3712 do { 3722 do {
3713 __set_current_state(TASK_UNINTERRUPTIBLE); 3723 __set_current_state(TASK_UNINTERRUPTIBLE);
3714 spin_unlock_irq(&x->wait.lock); 3724 spin_unlock_irq(&x->wait.lock);
3715 timeout = schedule_timeout(timeout); 3725 timeout = schedule_timeout(timeout);
3716 spin_lock_irq(&x->wait.lock); 3726 spin_lock_irq(&x->wait.lock);
3717 if (!timeout) { 3727 if (!timeout) {
3718 __remove_wait_queue(&x->wait, &wait); 3728 __remove_wait_queue(&x->wait, &wait);
3719 goto out; 3729 goto out;
3720 } 3730 }
3721 } while (!x->done); 3731 } while (!x->done);
3722 __remove_wait_queue(&x->wait, &wait); 3732 __remove_wait_queue(&x->wait, &wait);
3723 } 3733 }
3724 x->done--; 3734 x->done--;
3725 out: 3735 out:
3726 spin_unlock_irq(&x->wait.lock); 3736 spin_unlock_irq(&x->wait.lock);
3727 return timeout; 3737 return timeout;
3728 } 3738 }
3729 EXPORT_SYMBOL(wait_for_completion_timeout); 3739 EXPORT_SYMBOL(wait_for_completion_timeout);
3730 3740
3731 int fastcall __sched wait_for_completion_interruptible(struct completion *x) 3741 int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3732 { 3742 {
3733 int ret = 0; 3743 int ret = 0;
3734 3744
3735 might_sleep(); 3745 might_sleep();
3736 3746
3737 spin_lock_irq(&x->wait.lock); 3747 spin_lock_irq(&x->wait.lock);
3738 if (!x->done) { 3748 if (!x->done) {
3739 DECLARE_WAITQUEUE(wait, current); 3749 DECLARE_WAITQUEUE(wait, current);
3740 3750
3741 wait.flags |= WQ_FLAG_EXCLUSIVE; 3751 wait.flags |= WQ_FLAG_EXCLUSIVE;
3742 __add_wait_queue_tail(&x->wait, &wait); 3752 __add_wait_queue_tail(&x->wait, &wait);
3743 do { 3753 do {
3744 if (signal_pending(current)) { 3754 if (signal_pending(current)) {
3745 ret = -ERESTARTSYS; 3755 ret = -ERESTARTSYS;
3746 __remove_wait_queue(&x->wait, &wait); 3756 __remove_wait_queue(&x->wait, &wait);
3747 goto out; 3757 goto out;
3748 } 3758 }
3749 __set_current_state(TASK_INTERRUPTIBLE); 3759 __set_current_state(TASK_INTERRUPTIBLE);
3750 spin_unlock_irq(&x->wait.lock); 3760 spin_unlock_irq(&x->wait.lock);
3751 schedule(); 3761 schedule();
3752 spin_lock_irq(&x->wait.lock); 3762 spin_lock_irq(&x->wait.lock);
3753 } while (!x->done); 3763 } while (!x->done);
3754 __remove_wait_queue(&x->wait, &wait); 3764 __remove_wait_queue(&x->wait, &wait);
3755 } 3765 }
3756 x->done--; 3766 x->done--;
3757 out: 3767 out:
3758 spin_unlock_irq(&x->wait.lock); 3768 spin_unlock_irq(&x->wait.lock);
3759 3769
3760 return ret; 3770 return ret;
3761 } 3771 }
3762 EXPORT_SYMBOL(wait_for_completion_interruptible); 3772 EXPORT_SYMBOL(wait_for_completion_interruptible);
3763 3773
3764 unsigned long fastcall __sched 3774 unsigned long fastcall __sched
3765 wait_for_completion_interruptible_timeout(struct completion *x, 3775 wait_for_completion_interruptible_timeout(struct completion *x,
3766 unsigned long timeout) 3776 unsigned long timeout)
3767 { 3777 {
3768 might_sleep(); 3778 might_sleep();
3769 3779
3770 spin_lock_irq(&x->wait.lock); 3780 spin_lock_irq(&x->wait.lock);
3771 if (!x->done) { 3781 if (!x->done) {
3772 DECLARE_WAITQUEUE(wait, current); 3782 DECLARE_WAITQUEUE(wait, current);
3773 3783
3774 wait.flags |= WQ_FLAG_EXCLUSIVE; 3784 wait.flags |= WQ_FLAG_EXCLUSIVE;
3775 __add_wait_queue_tail(&x->wait, &wait); 3785 __add_wait_queue_tail(&x->wait, &wait);
3776 do { 3786 do {
3777 if (signal_pending(current)) { 3787 if (signal_pending(current)) {
3778 timeout = -ERESTARTSYS; 3788 timeout = -ERESTARTSYS;
3779 __remove_wait_queue(&x->wait, &wait); 3789 __remove_wait_queue(&x->wait, &wait);
3780 goto out; 3790 goto out;
3781 } 3791 }
3782 __set_current_state(TASK_INTERRUPTIBLE); 3792 __set_current_state(TASK_INTERRUPTIBLE);
3783 spin_unlock_irq(&x->wait.lock); 3793 spin_unlock_irq(&x->wait.lock);
3784 timeout = schedule_timeout(timeout); 3794 timeout = schedule_timeout(timeout);
3785 spin_lock_irq(&x->wait.lock); 3795 spin_lock_irq(&x->wait.lock);
3786 if (!timeout) { 3796 if (!timeout) {
3787 __remove_wait_queue(&x->wait, &wait); 3797 __remove_wait_queue(&x->wait, &wait);
3788 goto out; 3798 goto out;
3789 } 3799 }
3790 } while (!x->done); 3800 } while (!x->done);
3791 __remove_wait_queue(&x->wait, &wait); 3801 __remove_wait_queue(&x->wait, &wait);
3792 } 3802 }
3793 x->done--; 3803 x->done--;
3794 out: 3804 out:
3795 spin_unlock_irq(&x->wait.lock); 3805 spin_unlock_irq(&x->wait.lock);
3796 return timeout; 3806 return timeout;
3797 } 3807 }
3798 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 3808 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
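The *_timeout variants take and return jiffies; a sketch (not from this patch) of handling the three possible outcomes of the interruptible flavour, i.e. timeout, signal and normal completion:

#include <linux/completion.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>

static void demo_wait_with_timeout(struct completion *x)
{
	long remaining;

	remaining = wait_for_completion_interruptible_timeout(x,
						msecs_to_jiffies(500));
	if (remaining == 0)
		pr_debug("timed out\n");
	else if (remaining < 0)
		pr_debug("interrupted by a signal (%ld)\n", remaining);
	else
		pr_debug("completed with %ld jiffies to spare\n", remaining);
}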
3799 3809
3800 static inline void 3810 static inline void
3801 sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) 3811 sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
3802 { 3812 {
3803 spin_lock_irqsave(&q->lock, *flags); 3813 spin_lock_irqsave(&q->lock, *flags);
3804 __add_wait_queue(q, wait); 3814 __add_wait_queue(q, wait);
3805 spin_unlock(&q->lock); 3815 spin_unlock(&q->lock);
3806 } 3816 }
3807 3817
3808 static inline void 3818 static inline void
3809 sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) 3819 sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
3810 { 3820 {
3811 spin_lock_irq(&q->lock); 3821 spin_lock_irq(&q->lock);
3812 __remove_wait_queue(q, wait); 3822 __remove_wait_queue(q, wait);
3813 spin_unlock_irqrestore(&q->lock, *flags); 3823 spin_unlock_irqrestore(&q->lock, *flags);
3814 } 3824 }
3815 3825
3816 void __sched interruptible_sleep_on(wait_queue_head_t *q) 3826 void __sched interruptible_sleep_on(wait_queue_head_t *q)
3817 { 3827 {
3818 unsigned long flags; 3828 unsigned long flags;
3819 wait_queue_t wait; 3829 wait_queue_t wait;
3820 3830
3821 init_waitqueue_entry(&wait, current); 3831 init_waitqueue_entry(&wait, current);
3822 3832
3823 current->state = TASK_INTERRUPTIBLE; 3833 current->state = TASK_INTERRUPTIBLE;
3824 3834
3825 sleep_on_head(q, &wait, &flags); 3835 sleep_on_head(q, &wait, &flags);
3826 schedule(); 3836 schedule();
3827 sleep_on_tail(q, &wait, &flags); 3837 sleep_on_tail(q, &wait, &flags);
3828 } 3838 }
3829 EXPORT_SYMBOL(interruptible_sleep_on); 3839 EXPORT_SYMBOL(interruptible_sleep_on);
3830 3840
3831 long __sched 3841 long __sched
3832 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3842 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3833 { 3843 {
3834 unsigned long flags; 3844 unsigned long flags;
3835 wait_queue_t wait; 3845 wait_queue_t wait;
3836 3846
3837 init_waitqueue_entry(&wait, current); 3847 init_waitqueue_entry(&wait, current);
3838 3848
3839 current->state = TASK_INTERRUPTIBLE; 3849 current->state = TASK_INTERRUPTIBLE;
3840 3850
3841 sleep_on_head(q, &wait, &flags); 3851 sleep_on_head(q, &wait, &flags);
3842 timeout = schedule_timeout(timeout); 3852 timeout = schedule_timeout(timeout);
3843 sleep_on_tail(q, &wait, &flags); 3853 sleep_on_tail(q, &wait, &flags);
3844 3854
3845 return timeout; 3855 return timeout;
3846 } 3856 }
3847 EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3857 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3848 3858
3849 void __sched sleep_on(wait_queue_head_t *q) 3859 void __sched sleep_on(wait_queue_head_t *q)
3850 { 3860 {
3851 unsigned long flags; 3861 unsigned long flags;
3852 wait_queue_t wait; 3862 wait_queue_t wait;
3853 3863
3854 init_waitqueue_entry(&wait, current); 3864 init_waitqueue_entry(&wait, current);
3855 3865
3856 current->state = TASK_UNINTERRUPTIBLE; 3866 current->state = TASK_UNINTERRUPTIBLE;
3857 3867
3858 sleep_on_head(q, &wait, &flags); 3868 sleep_on_head(q, &wait, &flags);
3859 schedule(); 3869 schedule();
3860 sleep_on_tail(q, &wait, &flags); 3870 sleep_on_tail(q, &wait, &flags);
3861 } 3871 }
3862 EXPORT_SYMBOL(sleep_on); 3872 EXPORT_SYMBOL(sleep_on);
3863 3873
3864 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3874 long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3865 { 3875 {
3866 unsigned long flags; 3876 unsigned long flags;
3867 wait_queue_t wait; 3877 wait_queue_t wait;
3868 3878
3869 init_waitqueue_entry(&wait, current); 3879 init_waitqueue_entry(&wait, current);
3870 3880
3871 current->state = TASK_UNINTERRUPTIBLE; 3881 current->state = TASK_UNINTERRUPTIBLE;
3872 3882
3873 sleep_on_head(q, &wait, &flags); 3883 sleep_on_head(q, &wait, &flags);
3874 timeout = schedule_timeout(timeout); 3884 timeout = schedule_timeout(timeout);
3875 sleep_on_tail(q, &wait, &flags); 3885 sleep_on_tail(q, &wait, &flags);
3876 3886
3877 return timeout; 3887 return timeout;
3878 } 3888 }
3879 EXPORT_SYMBOL(sleep_on_timeout); 3889 EXPORT_SYMBOL(sleep_on_timeout);
3880 3890
3881 #ifdef CONFIG_RT_MUTEXES 3891 #ifdef CONFIG_RT_MUTEXES
3882 3892
3883 /* 3893 /*
3884 * rt_mutex_setprio - set the current priority of a task 3894 * rt_mutex_setprio - set the current priority of a task
3885 * @p: task 3895 * @p: task
3886 * @prio: prio value (kernel-internal form) 3896 * @prio: prio value (kernel-internal form)
3887 * 3897 *
3888 * This function changes the 'effective' priority of a task. It does 3898 * This function changes the 'effective' priority of a task. It does
3889 * not touch ->normal_prio like __setscheduler(). 3899 * not touch ->normal_prio like __setscheduler().
3890 * 3900 *
3891 * Used by the rt_mutex code to implement priority inheritance logic. 3901 * Used by the rt_mutex code to implement priority inheritance logic.
3892 */ 3902 */
3893 void rt_mutex_setprio(struct task_struct *p, int prio) 3903 void rt_mutex_setprio(struct task_struct *p, int prio)
3894 { 3904 {
3895 unsigned long flags; 3905 unsigned long flags;
3896 int oldprio, on_rq; 3906 int oldprio, on_rq;
3897 struct rq *rq; 3907 struct rq *rq;
3898 u64 now; 3908 u64 now;
3899 3909
3900 BUG_ON(prio < 0 || prio > MAX_PRIO); 3910 BUG_ON(prio < 0 || prio > MAX_PRIO);
3901 3911
3902 rq = task_rq_lock(p, &flags); 3912 rq = task_rq_lock(p, &flags);
3903 now = rq_clock(rq); 3913 now = rq_clock(rq);
3904 3914
3905 oldprio = p->prio; 3915 oldprio = p->prio;
3906 on_rq = p->se.on_rq; 3916 on_rq = p->se.on_rq;
3907 if (on_rq) 3917 if (on_rq)
3908 dequeue_task(rq, p, 0, now); 3918 dequeue_task(rq, p, 0, now);
3909 3919
3910 if (rt_prio(prio)) 3920 if (rt_prio(prio))
3911 p->sched_class = &rt_sched_class; 3921 p->sched_class = &rt_sched_class;
3912 else 3922 else
3913 p->sched_class = &fair_sched_class; 3923 p->sched_class = &fair_sched_class;
3914 3924
3915 p->prio = prio; 3925 p->prio = prio;
3916 3926
3917 if (on_rq) { 3927 if (on_rq) {
3918 enqueue_task(rq, p, 0, now); 3928 enqueue_task(rq, p, 0, now);
3919 /* 3929 /*
3920 * Reschedule if we are currently running on this runqueue and 3930 * Reschedule if we are currently running on this runqueue and
3921 * our priority decreased, or if we are not currently running on 3931 * our priority decreased, or if we are not currently running on
3922 * this runqueue and our priority is higher than the current's 3932 * this runqueue and our priority is higher than the current's
3923 */ 3933 */
3924 if (task_running(rq, p)) { 3934 if (task_running(rq, p)) {
3925 if (p->prio > oldprio) 3935 if (p->prio > oldprio)
3926 resched_task(rq->curr); 3936 resched_task(rq->curr);
3927 } else { 3937 } else {
3928 check_preempt_curr(rq, p); 3938 check_preempt_curr(rq, p);
3929 } 3939 }
3930 } 3940 }
3931 task_rq_unlock(rq, &flags); 3941 task_rq_unlock(rq, &flags);
3932 } 3942 }
3933 3943
3934 #endif 3944 #endif
3935 3945
3936 void set_user_nice(struct task_struct *p, long nice) 3946 void set_user_nice(struct task_struct *p, long nice)
3937 { 3947 {
3938 int old_prio, delta, on_rq; 3948 int old_prio, delta, on_rq;
3939 unsigned long flags; 3949 unsigned long flags;
3940 struct rq *rq; 3950 struct rq *rq;
3941 u64 now; 3951 u64 now;
3942 3952
3943 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3953 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3944 return; 3954 return;
3945 /* 3955 /*
3946 * We have to be careful, if called from sys_setpriority(), 3956 * We have to be careful, if called from sys_setpriority(),
3947 * the task might be in the middle of scheduling on another CPU. 3957 * the task might be in the middle of scheduling on another CPU.
3948 */ 3958 */
3949 rq = task_rq_lock(p, &flags); 3959 rq = task_rq_lock(p, &flags);
3950 now = rq_clock(rq); 3960 now = rq_clock(rq);
3951 /* 3961 /*
3952 * The RT priorities are set via sched_setscheduler(), but we still 3962 * The RT priorities are set via sched_setscheduler(), but we still
3953 * allow the 'normal' nice value to be set - but as expected 3963 * allow the 'normal' nice value to be set - but as expected
3954 * it won't have any effect on scheduling until the task is 3964 * it won't have any effect on scheduling until the task is
3955 * SCHED_FIFO/SCHED_RR: 3965 * SCHED_FIFO/SCHED_RR:
3956 */ 3966 */
3957 if (task_has_rt_policy(p)) { 3967 if (task_has_rt_policy(p)) {
3958 p->static_prio = NICE_TO_PRIO(nice); 3968 p->static_prio = NICE_TO_PRIO(nice);
3959 goto out_unlock; 3969 goto out_unlock;
3960 } 3970 }
3961 on_rq = p->se.on_rq; 3971 on_rq = p->se.on_rq;
3962 if (on_rq) { 3972 if (on_rq) {
3963 dequeue_task(rq, p, 0, now); 3973 dequeue_task(rq, p, 0, now);
3964 dec_load(rq, p, now); 3974 dec_load(rq, p, now);
3965 } 3975 }
3966 3976
3967 p->static_prio = NICE_TO_PRIO(nice); 3977 p->static_prio = NICE_TO_PRIO(nice);
3968 set_load_weight(p); 3978 set_load_weight(p);
3969 old_prio = p->prio; 3979 old_prio = p->prio;
3970 p->prio = effective_prio(p); 3980 p->prio = effective_prio(p);
3971 delta = p->prio - old_prio; 3981 delta = p->prio - old_prio;
3972 3982
3973 if (on_rq) { 3983 if (on_rq) {
3974 enqueue_task(rq, p, 0, now); 3984 enqueue_task(rq, p, 0, now);
3975 inc_load(rq, p, now); 3985 inc_load(rq, p, now);
3976 /* 3986 /*
3977 * If the task increased its priority or is running and 3987 * If the task increased its priority or is running and
3978 * lowered its priority, then reschedule its CPU: 3988 * lowered its priority, then reschedule its CPU:
3979 */ 3989 */
3980 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3990 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3981 resched_task(rq->curr); 3991 resched_task(rq->curr);
3982 } 3992 }
3983 out_unlock: 3993 out_unlock:
3984 task_rq_unlock(rq, &flags); 3994 task_rq_unlock(rq, &flags);
3985 } 3995 }
3986 EXPORT_SYMBOL(set_user_nice); 3996 EXPORT_SYMBOL(set_user_nice);
3987 3997
3988 /* 3998 /*
3989 * can_nice - check if a task can reduce its nice value 3999 * can_nice - check if a task can reduce its nice value
3990 * @p: task 4000 * @p: task
3991 * @nice: nice value 4001 * @nice: nice value
3992 */ 4002 */
3993 int can_nice(const struct task_struct *p, const int nice) 4003 int can_nice(const struct task_struct *p, const int nice)
3994 { 4004 {
3995 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4005 /* convert nice value [19,-20] to rlimit style value [1,40] */
3996 int nice_rlim = 20 - nice; 4006 int nice_rlim = 20 - nice;
3997 4007
3998 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 4008 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3999 capable(CAP_SYS_NICE)); 4009 capable(CAP_SYS_NICE));
4000 } 4010 }
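To make the comment's mapping concrete, the 20 - nice conversion in can_nice() works out as follows (worked values only, nothing here is part of the patch):

	nice = -20  ->  nice_rlim = 20 - (-20) = 40	(strongest request)
	nice =   0  ->  nice_rlim = 20 -    0  = 20
	nice = +19  ->  nice_rlim = 20 -   19  =  1	(weakest request)

So a task may set nice to N without CAP_SYS_NICE only if 20 - N <= RLIMIT_NICE.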
4001 4011
4002 #ifdef __ARCH_WANT_SYS_NICE 4012 #ifdef __ARCH_WANT_SYS_NICE
4003 4013
4004 /* 4014 /*
4005 * sys_nice - change the priority of the current process. 4015 * sys_nice - change the priority of the current process.
4006 * @increment: priority increment 4016 * @increment: priority increment
4007 * 4017 *
4008 * sys_setpriority is a more generic, but much slower function that 4018 * sys_setpriority is a more generic, but much slower function that
4009 * does similar things. 4019 * does similar things.
4010 */ 4020 */
4011 asmlinkage long sys_nice(int increment) 4021 asmlinkage long sys_nice(int increment)
4012 { 4022 {
4013 long nice, retval; 4023 long nice, retval;
4014 4024
4015 /* 4025 /*
4016 * Setpriority might change our priority at the same moment. 4026 * Setpriority might change our priority at the same moment.
4017 * We don't have to worry. Conceptually one call occurs first 4027 * We don't have to worry. Conceptually one call occurs first
4018 * and we have a single winner. 4028 * and we have a single winner.
4019 */ 4029 */
4020 if (increment < -40) 4030 if (increment < -40)
4021 increment = -40; 4031 increment = -40;
4022 if (increment > 40) 4032 if (increment > 40)
4023 increment = 40; 4033 increment = 40;
4024 4034
4025 nice = PRIO_TO_NICE(current->static_prio) + increment; 4035 nice = PRIO_TO_NICE(current->static_prio) + increment;
4026 if (nice < -20) 4036 if (nice < -20)
4027 nice = -20; 4037 nice = -20;
4028 if (nice > 19) 4038 if (nice > 19)
4029 nice = 19; 4039 nice = 19;
4030 4040
4031 if (increment < 0 && !can_nice(current, nice)) 4041 if (increment < 0 && !can_nice(current, nice))
4032 return -EPERM; 4042 return -EPERM;
4033 4043
4034 retval = security_task_setnice(current, nice); 4044 retval = security_task_setnice(current, nice);
4035 if (retval) 4045 if (retval)
4036 return retval; 4046 return retval;
4037 4047
4038 set_user_nice(current, nice); 4048 set_user_nice(current, nice);
4039 return 0; 4049 return 0;
4040 } 4050 }
4041 4051
4042 #endif 4052 #endif
4043 4053
4044 /** 4054 /**
4045 * task_prio - return the priority value of a given task. 4055 * task_prio - return the priority value of a given task.
4046 * @p: the task in question. 4056 * @p: the task in question.
4047 * 4057 *
4048 * This is the priority value as seen by users in /proc. 4058 * This is the priority value as seen by users in /proc.
4049 * RT tasks are offset by -200. Normal tasks are centered 4059 * RT tasks are offset by -200. Normal tasks are centered
4050 * around 0, value goes from -16 to +15. 4060 * around 0, value goes from -16 to +15.
4051 */ 4061 */
4052 int task_prio(const struct task_struct *p) 4062 int task_prio(const struct task_struct *p)
4053 { 4063 {
4054 return p->prio - MAX_RT_PRIO; 4064 return p->prio - MAX_RT_PRIO;
4055 } 4065 }
4056 4066
4057 /** 4067 /**
4058 * task_nice - return the nice value of a given task. 4068 * task_nice - return the nice value of a given task.
4059 * @p: the task in question. 4069 * @p: the task in question.
4060 */ 4070 */
4061 int task_nice(const struct task_struct *p) 4071 int task_nice(const struct task_struct *p)
4062 { 4072 {
4063 return TASK_NICE(p); 4073 return TASK_NICE(p);
4064 } 4074 }
4065 EXPORT_SYMBOL_GPL(task_nice); 4075 EXPORT_SYMBOL_GPL(task_nice);
4066 4076
4067 /** 4077 /**
4068 * idle_cpu - is a given cpu idle currently? 4078 * idle_cpu - is a given cpu idle currently?
4069 * @cpu: the processor in question. 4079 * @cpu: the processor in question.
4070 */ 4080 */
4071 int idle_cpu(int cpu) 4081 int idle_cpu(int cpu)
4072 { 4082 {
4073 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 4083 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4074 } 4084 }
4075 4085
4076 /** 4086 /**
4077 * idle_task - return the idle task for a given cpu. 4087 * idle_task - return the idle task for a given cpu.
4078 * @cpu: the processor in question. 4088 * @cpu: the processor in question.
4079 */ 4089 */
4080 struct task_struct *idle_task(int cpu) 4090 struct task_struct *idle_task(int cpu)
4081 { 4091 {
4082 return cpu_rq(cpu)->idle; 4092 return cpu_rq(cpu)->idle;
4083 } 4093 }
4084 4094
4085 /** 4095 /**
4086 * find_process_by_pid - find a process with a matching PID value. 4096 * find_process_by_pid - find a process with a matching PID value.
4087 * @pid: the pid in question. 4097 * @pid: the pid in question.
4088 */ 4098 */
4089 static inline struct task_struct *find_process_by_pid(pid_t pid) 4099 static inline struct task_struct *find_process_by_pid(pid_t pid)
4090 { 4100 {
4091 return pid ? find_task_by_pid(pid) : current; 4101 return pid ? find_task_by_pid(pid) : current;
4092 } 4102 }
4093 4103
4094 /* Actually do priority change: must hold rq lock. */ 4104 /* Actually do priority change: must hold rq lock. */
4095 static void 4105 static void
4096 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 4106 __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4097 { 4107 {
4098 BUG_ON(p->se.on_rq); 4108 BUG_ON(p->se.on_rq);
4099 4109
4100 p->policy = policy; 4110 p->policy = policy;
4101 switch (p->policy) { 4111 switch (p->policy) {
4102 case SCHED_NORMAL: 4112 case SCHED_NORMAL:
4103 case SCHED_BATCH: 4113 case SCHED_BATCH:
4104 case SCHED_IDLE: 4114 case SCHED_IDLE:
4105 p->sched_class = &fair_sched_class; 4115 p->sched_class = &fair_sched_class;
4106 break; 4116 break;
4107 case SCHED_FIFO: 4117 case SCHED_FIFO:
4108 case SCHED_RR: 4118 case SCHED_RR:
4109 p->sched_class = &rt_sched_class; 4119 p->sched_class = &rt_sched_class;
4110 break; 4120 break;
4111 } 4121 }
4112 4122
4113 p->rt_priority = prio; 4123 p->rt_priority = prio;
4114 p->normal_prio = normal_prio(p); 4124 p->normal_prio = normal_prio(p);
4115 /* we are holding p->pi_lock already */ 4125 /* we are holding p->pi_lock already */
4116 p->prio = rt_mutex_getprio(p); 4126 p->prio = rt_mutex_getprio(p);
4117 set_load_weight(p); 4127 set_load_weight(p);
4118 } 4128 }
4119 4129
4120 /** 4130 /**
4121 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 4131 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4122 * @p: the task in question. 4132 * @p: the task in question.
4123 * @policy: new policy. 4133 * @policy: new policy.
4124 * @param: structure containing the new RT priority. 4134 * @param: structure containing the new RT priority.
4125 * 4135 *
4126 * NOTE that the task may be already dead. 4136 * NOTE that the task may be already dead.
4127 */ 4137 */
4128 int sched_setscheduler(struct task_struct *p, int policy, 4138 int sched_setscheduler(struct task_struct *p, int policy,
4129 struct sched_param *param) 4139 struct sched_param *param)
4130 { 4140 {
4131 int retval, oldprio, oldpolicy = -1, on_rq; 4141 int retval, oldprio, oldpolicy = -1, on_rq;
4132 unsigned long flags; 4142 unsigned long flags;
4133 struct rq *rq; 4143 struct rq *rq;
4134 4144
4135 /* may grab non-irq protected spin_locks */ 4145 /* may grab non-irq protected spin_locks */
4136 BUG_ON(in_interrupt()); 4146 BUG_ON(in_interrupt());
4137 recheck: 4147 recheck:
4138 /* double check policy once rq lock held */ 4148 /* double check policy once rq lock held */
4139 if (policy < 0) 4149 if (policy < 0)
4140 policy = oldpolicy = p->policy; 4150 policy = oldpolicy = p->policy;
4141 else if (policy != SCHED_FIFO && policy != SCHED_RR && 4151 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4142 policy != SCHED_NORMAL && policy != SCHED_BATCH && 4152 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4143 policy != SCHED_IDLE) 4153 policy != SCHED_IDLE)
4144 return -EINVAL; 4154 return -EINVAL;
4145 /* 4155 /*
4146 * Valid priorities for SCHED_FIFO and SCHED_RR are 4156 * Valid priorities for SCHED_FIFO and SCHED_RR are
4147 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 4157 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4148 * SCHED_BATCH and SCHED_IDLE is 0. 4158 * SCHED_BATCH and SCHED_IDLE is 0.
4149 */ 4159 */
4150 if (param->sched_priority < 0 || 4160 if (param->sched_priority < 0 ||
4151 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 4161 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4152 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 4162 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4153 return -EINVAL; 4163 return -EINVAL;
4154 if (rt_policy(policy) != (param->sched_priority != 0)) 4164 if (rt_policy(policy) != (param->sched_priority != 0))
4155 return -EINVAL; 4165 return -EINVAL;
4156 4166
4157 /* 4167 /*
4158 * Allow unprivileged RT tasks to decrease priority: 4168 * Allow unprivileged RT tasks to decrease priority:
4159 */ 4169 */
4160 if (!capable(CAP_SYS_NICE)) { 4170 if (!capable(CAP_SYS_NICE)) {
4161 if (rt_policy(policy)) { 4171 if (rt_policy(policy)) {
4162 unsigned long rlim_rtprio; 4172 unsigned long rlim_rtprio;
4163 4173
4164 if (!lock_task_sighand(p, &flags)) 4174 if (!lock_task_sighand(p, &flags))
4165 return -ESRCH; 4175 return -ESRCH;
4166 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; 4176 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4167 unlock_task_sighand(p, &flags); 4177 unlock_task_sighand(p, &flags);
4168 4178
4169 /* can't set/change the rt policy */ 4179 /* can't set/change the rt policy */
4170 if (policy != p->policy && !rlim_rtprio) 4180 if (policy != p->policy && !rlim_rtprio)
4171 return -EPERM; 4181 return -EPERM;
4172 4182
4173 /* can't increase priority */ 4183 /* can't increase priority */
4174 if (param->sched_priority > p->rt_priority && 4184 if (param->sched_priority > p->rt_priority &&
4175 param->sched_priority > rlim_rtprio) 4185 param->sched_priority > rlim_rtprio)
4176 return -EPERM; 4186 return -EPERM;
4177 } 4187 }
4178 /* 4188 /*
4179 * Like positive nice levels, don't allow tasks to 4189 * Like positive nice levels, don't allow tasks to
4180 * move out of SCHED_IDLE either: 4190 * move out of SCHED_IDLE either:
4181 */ 4191 */
4182 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) 4192 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4183 return -EPERM; 4193 return -EPERM;
4184 4194
4185 /* can't change other user's priorities */ 4195 /* can't change other user's priorities */
4186 if ((current->euid != p->euid) && 4196 if ((current->euid != p->euid) &&
4187 (current->euid != p->uid)) 4197 (current->euid != p->uid))
4188 return -EPERM; 4198 return -EPERM;
4189 } 4199 }
4190 4200
4191 retval = security_task_setscheduler(p, policy, param); 4201 retval = security_task_setscheduler(p, policy, param);
4192 if (retval) 4202 if (retval)
4193 return retval; 4203 return retval;
4194 /* 4204 /*
4195 * make sure no PI-waiters arrive (or leave) while we are 4205 * make sure no PI-waiters arrive (or leave) while we are
4196 * changing the priority of the task: 4206 * changing the priority of the task:
4197 */ 4207 */
4198 spin_lock_irqsave(&p->pi_lock, flags); 4208 spin_lock_irqsave(&p->pi_lock, flags);
4199 /* 4209 /*
4200 * To be able to change p->policy safely, the appropriate 4210 * To be able to change p->policy safely, the appropriate
4201 * runqueue lock must be held. 4211 * runqueue lock must be held.
4202 */ 4212 */
4203 rq = __task_rq_lock(p); 4213 rq = __task_rq_lock(p);
4204 /* recheck policy now with rq lock held */ 4214 /* recheck policy now with rq lock held */
4205 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4215 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4206 policy = oldpolicy = -1; 4216 policy = oldpolicy = -1;
4207 __task_rq_unlock(rq); 4217 __task_rq_unlock(rq);
4208 spin_unlock_irqrestore(&p->pi_lock, flags); 4218 spin_unlock_irqrestore(&p->pi_lock, flags);
4209 goto recheck; 4219 goto recheck;
4210 } 4220 }
4211 on_rq = p->se.on_rq; 4221 on_rq = p->se.on_rq;
4212 if (on_rq) 4222 if (on_rq)
4213 deactivate_task(rq, p, 0); 4223 deactivate_task(rq, p, 0);
4214 oldprio = p->prio; 4224 oldprio = p->prio;
4215 __setscheduler(rq, p, policy, param->sched_priority); 4225 __setscheduler(rq, p, policy, param->sched_priority);
4216 if (on_rq) { 4226 if (on_rq) {
4217 activate_task(rq, p, 0); 4227 activate_task(rq, p, 0);
4218 /* 4228 /*
4219 * Reschedule if we are currently running on this runqueue and 4229 * Reschedule if we are currently running on this runqueue and
4220 * our priority decreased, or if we are not currently running on 4230 * our priority decreased, or if we are not currently running on
4221 * this runqueue and our priority is higher than the current's 4231 * this runqueue and our priority is higher than the current's
4222 */ 4232 */
4223 if (task_running(rq, p)) { 4233 if (task_running(rq, p)) {
4224 if (p->prio > oldprio) 4234 if (p->prio > oldprio)
4225 resched_task(rq->curr); 4235 resched_task(rq->curr);
4226 } else { 4236 } else {
4227 check_preempt_curr(rq, p); 4237 check_preempt_curr(rq, p);
4228 } 4238 }
4229 } 4239 }
4230 __task_rq_unlock(rq); 4240 __task_rq_unlock(rq);
4231 spin_unlock_irqrestore(&p->pi_lock, flags); 4241 spin_unlock_irqrestore(&p->pi_lock, flags);
4232 4242
4233 rt_mutex_adjust_pi(p); 4243 rt_mutex_adjust_pi(p);
4234 4244
4235 return 0; 4245 return 0;
4236 } 4246 }
4237 EXPORT_SYMBOL_GPL(sched_setscheduler); 4247 EXPORT_SYMBOL_GPL(sched_setscheduler);
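An in-kernel caller sketch (names invented for illustration): switching a task to SCHED_FIFO through the exported sched_setscheduler() above.

#include <linux/sched.h>

static int demo_make_fifo(struct task_struct *tsk)
{
	struct sched_param param = { .sched_priority = 50 };

	/* Must not be called from interrupt context (see the BUG_ON above). */
	return sched_setscheduler(tsk, SCHED_FIFO, &param);
}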
4238 4248
4239 static int 4249 static int
4240 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 4250 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4241 { 4251 {
4242 struct sched_param lparam; 4252 struct sched_param lparam;
4243 struct task_struct *p; 4253 struct task_struct *p;
4244 int retval; 4254 int retval;
4245 4255
4246 if (!param || pid < 0) 4256 if (!param || pid < 0)
4247 return -EINVAL; 4257 return -EINVAL;
4248 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 4258 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4249 return -EFAULT; 4259 return -EFAULT;
4250 4260
4251 rcu_read_lock(); 4261 rcu_read_lock();
4252 retval = -ESRCH; 4262 retval = -ESRCH;
4253 p = find_process_by_pid(pid); 4263 p = find_process_by_pid(pid);
4254 if (p != NULL) 4264 if (p != NULL)
4255 retval = sched_setscheduler(p, policy, &lparam); 4265 retval = sched_setscheduler(p, policy, &lparam);
4256 rcu_read_unlock(); 4266 rcu_read_unlock();
4257 4267
4258 return retval; 4268 return retval;
4259 } 4269 }
4260 4270
4261 /** 4271 /**
4262 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 4272 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4263 * @pid: the pid in question. 4273 * @pid: the pid in question.
4264 * @policy: new policy. 4274 * @policy: new policy.
4265 * @param: structure containing the new RT priority. 4275 * @param: structure containing the new RT priority.
4266 */ 4276 */
4267 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, 4277 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4268 struct sched_param __user *param) 4278 struct sched_param __user *param)
4269 { 4279 {
4270 /* negative values for policy are not valid */ 4280 /* negative values for policy are not valid */
4271 if (policy < 0) 4281 if (policy < 0)
4272 return -EINVAL; 4282 return -EINVAL;
4273 4283
4274 return do_sched_setscheduler(pid, policy, param); 4284 return do_sched_setscheduler(pid, policy, param);
4275 } 4285 }
4276 4286
4277 /** 4287 /**
4278 * sys_sched_setparam - set/change the RT priority of a thread 4288 * sys_sched_setparam - set/change the RT priority of a thread
4279 * @pid: the pid in question. 4289 * @pid: the pid in question.
4280 * @param: structure containing the new RT priority. 4290 * @param: structure containing the new RT priority.
4281 */ 4291 */
4282 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) 4292 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4283 { 4293 {
4284 return do_sched_setscheduler(pid, -1, param); 4294 return do_sched_setscheduler(pid, -1, param);
4285 } 4295 }
4286 4296
4287 /** 4297 /**
4288 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 4298 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4289 * @pid: the pid in question. 4299 * @pid: the pid in question.
4290 */ 4300 */
4291 asmlinkage long sys_sched_getscheduler(pid_t pid) 4301 asmlinkage long sys_sched_getscheduler(pid_t pid)
4292 { 4302 {
4293 struct task_struct *p; 4303 struct task_struct *p;
4294 int retval = -EINVAL; 4304 int retval = -EINVAL;
4295 4305
4296 if (pid < 0) 4306 if (pid < 0)
4297 goto out_nounlock; 4307 goto out_nounlock;
4298 4308
4299 retval = -ESRCH; 4309 retval = -ESRCH;
4300 read_lock(&tasklist_lock); 4310 read_lock(&tasklist_lock);
4301 p = find_process_by_pid(pid); 4311 p = find_process_by_pid(pid);
4302 if (p) { 4312 if (p) {
4303 retval = security_task_getscheduler(p); 4313 retval = security_task_getscheduler(p);
4304 if (!retval) 4314 if (!retval)
4305 retval = p->policy; 4315 retval = p->policy;
4306 } 4316 }
4307 read_unlock(&tasklist_lock); 4317 read_unlock(&tasklist_lock);
4308 4318
4309 out_nounlock: 4319 out_nounlock:
4310 return retval; 4320 return retval;
4311 } 4321 }
4312 4322
4313 /** 4323 /**
4314 * sys_sched_getparam - get the RT priority of a thread 4324 * sys_sched_getparam - get the RT priority of a thread
4315 * @pid: the pid in question. 4325 * @pid: the pid in question.
4316 * @param: structure containing the RT priority. 4326 * @param: structure containing the RT priority.
4317 */ 4327 */
4318 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) 4328 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4319 { 4329 {
4320 struct sched_param lp; 4330 struct sched_param lp;
4321 struct task_struct *p; 4331 struct task_struct *p;
4322 int retval = -EINVAL; 4332 int retval = -EINVAL;
4323 4333
4324 if (!param || pid < 0) 4334 if (!param || pid < 0)
4325 goto out_nounlock; 4335 goto out_nounlock;
4326 4336
4327 read_lock(&tasklist_lock); 4337 read_lock(&tasklist_lock);
4328 p = find_process_by_pid(pid); 4338 p = find_process_by_pid(pid);
4329 retval = -ESRCH; 4339 retval = -ESRCH;
4330 if (!p) 4340 if (!p)
4331 goto out_unlock; 4341 goto out_unlock;
4332 4342
4333 retval = security_task_getscheduler(p); 4343 retval = security_task_getscheduler(p);
4334 if (retval) 4344 if (retval)
4335 goto out_unlock; 4345 goto out_unlock;
4336 4346
4337 lp.sched_priority = p->rt_priority; 4347 lp.sched_priority = p->rt_priority;
4338 read_unlock(&tasklist_lock); 4348 read_unlock(&tasklist_lock);
4339 4349
4340 /* 4350 /*
4341 * This one might sleep, we cannot do it with a spinlock held ... 4351 * This one might sleep, we cannot do it with a spinlock held ...
4342 */ 4352 */
4343 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 4353 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4344 4354
4345 out_nounlock: 4355 out_nounlock:
4346 return retval; 4356 return retval;
4347 4357
4348 out_unlock: 4358 out_unlock:
4349 read_unlock(&tasklist_lock); 4359 read_unlock(&tasklist_lock);
4350 return retval; 4360 return retval;
4351 } 4361 }
4352 4362
4353 long sched_setaffinity(pid_t pid, cpumask_t new_mask) 4363 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4354 { 4364 {
4355 cpumask_t cpus_allowed; 4365 cpumask_t cpus_allowed;
4356 struct task_struct *p; 4366 struct task_struct *p;
4357 int retval; 4367 int retval;
4358 4368
4359 mutex_lock(&sched_hotcpu_mutex); 4369 mutex_lock(&sched_hotcpu_mutex);
4360 read_lock(&tasklist_lock); 4370 read_lock(&tasklist_lock);
4361 4371
4362 p = find_process_by_pid(pid); 4372 p = find_process_by_pid(pid);
4363 if (!p) { 4373 if (!p) {
4364 read_unlock(&tasklist_lock); 4374 read_unlock(&tasklist_lock);
4365 mutex_unlock(&sched_hotcpu_mutex); 4375 mutex_unlock(&sched_hotcpu_mutex);
4366 return -ESRCH; 4376 return -ESRCH;
4367 } 4377 }
4368 4378
4369 /* 4379 /*
4370 * It is not safe to call set_cpus_allowed with the 4380 * It is not safe to call set_cpus_allowed with the
4371 * tasklist_lock held. We will bump the task_struct's 4381 * tasklist_lock held. We will bump the task_struct's
4372 * usage count and then drop tasklist_lock. 4382 * usage count and then drop tasklist_lock.
4373 */ 4383 */
4374 get_task_struct(p); 4384 get_task_struct(p);
4375 read_unlock(&tasklist_lock); 4385 read_unlock(&tasklist_lock);
4376 4386
4377 retval = -EPERM; 4387 retval = -EPERM;
4378 if ((current->euid != p->euid) && (current->euid != p->uid) && 4388 if ((current->euid != p->euid) && (current->euid != p->uid) &&
4379 !capable(CAP_SYS_NICE)) 4389 !capable(CAP_SYS_NICE))
4380 goto out_unlock; 4390 goto out_unlock;
4381 4391
4382 retval = security_task_setscheduler(p, 0, NULL); 4392 retval = security_task_setscheduler(p, 0, NULL);
4383 if (retval) 4393 if (retval)
4384 goto out_unlock; 4394 goto out_unlock;
4385 4395
4386 cpus_allowed = cpuset_cpus_allowed(p); 4396 cpus_allowed = cpuset_cpus_allowed(p);
4387 cpus_and(new_mask, new_mask, cpus_allowed); 4397 cpus_and(new_mask, new_mask, cpus_allowed);
4388 retval = set_cpus_allowed(p, new_mask); 4398 retval = set_cpus_allowed(p, new_mask);
4389 4399
4390 out_unlock: 4400 out_unlock:
4391 put_task_struct(p); 4401 put_task_struct(p);
4392 mutex_unlock(&sched_hotcpu_mutex); 4402 mutex_unlock(&sched_hotcpu_mutex);
4393 return retval; 4403 return retval;
4394 } 4404 }
4395 4405
4396 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 4406 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4397 cpumask_t *new_mask) 4407 cpumask_t *new_mask)
4398 { 4408 {
4399 if (len < sizeof(cpumask_t)) { 4409 if (len < sizeof(cpumask_t)) {
4400 memset(new_mask, 0, sizeof(cpumask_t)); 4410 memset(new_mask, 0, sizeof(cpumask_t));
4401 } else if (len > sizeof(cpumask_t)) { 4411 } else if (len > sizeof(cpumask_t)) {
4402 len = sizeof(cpumask_t); 4412 len = sizeof(cpumask_t);
4403 } 4413 }
4404 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 4414 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4405 } 4415 }
4406 4416
4407 /** 4417 /**
4408 * sys_sched_setaffinity - set the cpu affinity of a process 4418 * sys_sched_setaffinity - set the cpu affinity of a process
4409 * @pid: pid of the process 4419 * @pid: pid of the process
4410 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4420 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4411 * @user_mask_ptr: user-space pointer to the new cpu mask 4421 * @user_mask_ptr: user-space pointer to the new cpu mask
4412 */ 4422 */
4413 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 4423 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4414 unsigned long __user *user_mask_ptr) 4424 unsigned long __user *user_mask_ptr)
4415 { 4425 {
4416 cpumask_t new_mask; 4426 cpumask_t new_mask;
4417 int retval; 4427 int retval;
4418 4428
4419 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); 4429 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4420 if (retval) 4430 if (retval)
4421 return retval; 4431 return retval;
4422 4432
4423 return sched_setaffinity(pid, new_mask); 4433 return sched_setaffinity(pid, new_mask);
4424 } 4434 }
4425 4435
4426 /* 4436 /*
4427 * Represents all CPUs present in the system 4437 * Represents all CPUs present in the system
4428 * In systems capable of hotplug, this map could dynamically grow 4438 * In systems capable of hotplug, this map could dynamically grow
4429 * as new CPUs are detected in the system via any platform-specific 4439 * as new CPUs are detected in the system via any platform-specific
4430 * method, such as ACPI, for example. 4440 * method, such as ACPI, for example.
4431 */ 4441 */
4432 4442
4433 cpumask_t cpu_present_map __read_mostly; 4443 cpumask_t cpu_present_map __read_mostly;
4434 EXPORT_SYMBOL(cpu_present_map); 4444 EXPORT_SYMBOL(cpu_present_map);
4435 4445
4436 #ifndef CONFIG_SMP 4446 #ifndef CONFIG_SMP
4437 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; 4447 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
4438 EXPORT_SYMBOL(cpu_online_map); 4448 EXPORT_SYMBOL(cpu_online_map);
4439 4449
4440 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; 4450 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
4441 EXPORT_SYMBOL(cpu_possible_map); 4451 EXPORT_SYMBOL(cpu_possible_map);
4442 #endif 4452 #endif
4443 4453
4444 long sched_getaffinity(pid_t pid, cpumask_t *mask) 4454 long sched_getaffinity(pid_t pid, cpumask_t *mask)
4445 { 4455 {
4446 struct task_struct *p; 4456 struct task_struct *p;
4447 int retval; 4457 int retval;
4448 4458
4449 mutex_lock(&sched_hotcpu_mutex); 4459 mutex_lock(&sched_hotcpu_mutex);
4450 read_lock(&tasklist_lock); 4460 read_lock(&tasklist_lock);
4451 4461
4452 retval = -ESRCH; 4462 retval = -ESRCH;
4453 p = find_process_by_pid(pid); 4463 p = find_process_by_pid(pid);
4454 if (!p) 4464 if (!p)
4455 goto out_unlock; 4465 goto out_unlock;
4456 4466
4457 retval = security_task_getscheduler(p); 4467 retval = security_task_getscheduler(p);
4458 if (retval) 4468 if (retval)
4459 goto out_unlock; 4469 goto out_unlock;
4460 4470
4461 cpus_and(*mask, p->cpus_allowed, cpu_online_map); 4471 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
4462 4472
4463 out_unlock: 4473 out_unlock:
4464 read_unlock(&tasklist_lock); 4474 read_unlock(&tasklist_lock);
4465 mutex_unlock(&sched_hotcpu_mutex); 4475 mutex_unlock(&sched_hotcpu_mutex);
4466 if (retval) 4476 if (retval)
4467 return retval; 4477 return retval;
4468 4478
4469 return 0; 4479 return 0;
4470 } 4480 }
4471 4481
4472 /** 4482 /**
4473 * sys_sched_getaffinity - get the cpu affinity of a process 4483 * sys_sched_getaffinity - get the cpu affinity of a process
4474 * @pid: pid of the process 4484 * @pid: pid of the process
4475 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4485 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4476 * @user_mask_ptr: user-space pointer to hold the current cpu mask 4486 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4477 */ 4487 */
4478 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, 4488 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4479 unsigned long __user *user_mask_ptr) 4489 unsigned long __user *user_mask_ptr)
4480 { 4490 {
4481 int ret; 4491 int ret;
4482 cpumask_t mask; 4492 cpumask_t mask;
4483 4493
4484 if (len < sizeof(cpumask_t)) 4494 if (len < sizeof(cpumask_t))
4485 return -EINVAL; 4495 return -EINVAL;
4486 4496
4487 ret = sched_getaffinity(pid, &mask); 4497 ret = sched_getaffinity(pid, &mask);
4488 if (ret < 0) 4498 if (ret < 0)
4489 return ret; 4499 return ret;
4490 4500
4491 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) 4501 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4492 return -EFAULT; 4502 return -EFAULT;
4493 4503
4494 return sizeof(cpumask_t); 4504 return sizeof(cpumask_t);
4495 } 4505 }
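From user space these two syscalls are reached through the glibc sched_setaffinity()/sched_getaffinity() wrappers; a hedged user-space sketch that pins the calling task to CPU 0 and reads the mask back:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int pin_self_to_cpu0(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);
	if (sched_setaffinity(0, sizeof(set), &set))	/* pid 0 == calling task */
		return -1;

	CPU_ZERO(&set);
	if (sched_getaffinity(0, sizeof(set), &set))
		return -1;

	printf("pinned to CPU0: %s\n", CPU_ISSET(0, &set) ? "yes" : "no");
	return 0;
}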
4496 4506
4497 /** 4507 /**
4498 * sys_sched_yield - yield the current processor to other threads. 4508 * sys_sched_yield - yield the current processor to other threads.
4499 * 4509 *
4500 * This function yields the current CPU to other tasks. If there are no 4510 * This function yields the current CPU to other tasks. If there are no
4501 * other threads running on this CPU then this function will return. 4511 * other threads running on this CPU then this function will return.
4502 */ 4512 */
4503 asmlinkage long sys_sched_yield(void) 4513 asmlinkage long sys_sched_yield(void)
4504 { 4514 {
4505 struct rq *rq = this_rq_lock(); 4515 struct rq *rq = this_rq_lock();
4506 4516
4507 schedstat_inc(rq, yld_cnt); 4517 schedstat_inc(rq, yld_cnt);
4508 if (unlikely(rq->nr_running == 1)) 4518 if (unlikely(rq->nr_running == 1))
4509 schedstat_inc(rq, yld_act_empty); 4519 schedstat_inc(rq, yld_act_empty);
4510 else 4520 else
4511 current->sched_class->yield_task(rq, current); 4521 current->sched_class->yield_task(rq, current);
4512 4522
4513 /* 4523 /*
4514 * Since we are going to call schedule() anyway, there's 4524 * Since we are going to call schedule() anyway, there's
4515 * no need to preempt or enable interrupts: 4525 * no need to preempt or enable interrupts:
4516 */ 4526 */
4517 __release(rq->lock); 4527 __release(rq->lock);
4518 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 4528 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4519 _raw_spin_unlock(&rq->lock); 4529 _raw_spin_unlock(&rq->lock);
4520 preempt_enable_no_resched(); 4530 preempt_enable_no_resched();
4521 4531
4522 schedule(); 4532 schedule();
4523 4533
4524 return 0; 4534 return 0;
4525 } 4535 }
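User space reaches this through sched_yield(2); a tiny, illustrative polling loop that yields between checks rather than spinning flat out:

#include <sched.h>

void wait_for_flag(volatile int *flag)
{
	while (!*flag)
		sched_yield();	/* let other runnable tasks on this CPU go first */
}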
4526 4536
4527 static void __cond_resched(void) 4537 static void __cond_resched(void)
4528 { 4538 {
4529 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 4539 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4530 __might_sleep(__FILE__, __LINE__); 4540 __might_sleep(__FILE__, __LINE__);
4531 #endif 4541 #endif
4532 /* 4542 /*
4533 * The BKS might be reacquired before we have dropped 4543 * The BKS might be reacquired before we have dropped
4534 * PREEMPT_ACTIVE, which could trigger a second 4544 * PREEMPT_ACTIVE, which could trigger a second
4535 * cond_resched() call. 4545 * cond_resched() call.
4536 */ 4546 */
4537 do { 4547 do {
4538 add_preempt_count(PREEMPT_ACTIVE); 4548 add_preempt_count(PREEMPT_ACTIVE);
4539 schedule(); 4549 schedule();
4540 sub_preempt_count(PREEMPT_ACTIVE); 4550 sub_preempt_count(PREEMPT_ACTIVE);
4541 } while (need_resched()); 4551 } while (need_resched());
4542 } 4552 }
4543 4553
4544 int __sched cond_resched(void) 4554 int __sched cond_resched(void)
4545 { 4555 {
4546 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && 4556 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4547 system_state == SYSTEM_RUNNING) { 4557 system_state == SYSTEM_RUNNING) {
4548 __cond_resched(); 4558 __cond_resched();
4549 return 1; 4559 return 1;
4550 } 4560 }
4551 return 0; 4561 return 0;
4552 } 4562 }
4553 EXPORT_SYMBOL(cond_resched); 4563 EXPORT_SYMBOL(cond_resched);
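Typical in-kernel use of cond_resched(): a sketch of a long loop that voluntarily gives up the CPU on non-preemptible kernels (names invented):

#include <linux/sched.h>
#include <linux/list.h>

static void demo_process_many(struct list_head *items)
{
	struct list_head *pos;

	list_for_each(pos, items) {
		/* ... potentially expensive per-item work ... */
		cond_resched();		/* reschedules here if TIF_NEED_RESCHED is set */
	}
}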
4554 4564
4555 /* 4565 /*
4556 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 4566 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4557 * call schedule, and on return reacquire the lock. 4567 * call schedule, and on return reacquire the lock.
4558 * 4568 *
4559 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 4569 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4560 * operations here to prevent schedule() from being called twice (once via 4570 * operations here to prevent schedule() from being called twice (once via
4561 * spin_unlock(), once by hand). 4571 * spin_unlock(), once by hand).
4562 */ 4572 */
4563 int cond_resched_lock(spinlock_t *lock) 4573 int cond_resched_lock(spinlock_t *lock)
4564 { 4574 {
4565 int ret = 0; 4575 int ret = 0;
4566 4576
4567 if (need_lockbreak(lock)) { 4577 if (need_lockbreak(lock)) {
4568 spin_unlock(lock); 4578 spin_unlock(lock);
4569 cpu_relax(); 4579 cpu_relax();
4570 ret = 1; 4580 ret = 1;
4571 spin_lock(lock); 4581 spin_lock(lock);
4572 } 4582 }
4573 if (need_resched() && system_state == SYSTEM_RUNNING) { 4583 if (need_resched() && system_state == SYSTEM_RUNNING) {
4574 spin_release(&lock->dep_map, 1, _THIS_IP_); 4584 spin_release(&lock->dep_map, 1, _THIS_IP_);
4575 _raw_spin_unlock(lock); 4585 _raw_spin_unlock(lock);
4576 preempt_enable_no_resched(); 4586 preempt_enable_no_resched();
4577 __cond_resched(); 4587 __cond_resched();
4578 ret = 1; 4588 ret = 1;
4579 spin_lock(lock); 4589 spin_lock(lock);
4580 } 4590 }
4581 return ret; 4591 return ret;
4582 } 4592 }
4583 EXPORT_SYMBOL(cond_resched_lock); 4593 EXPORT_SYMBOL(cond_resched_lock);
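And the lock-holding variant; a sketch of scanning a list under a spinlock while still honouring pending reschedules and lock contention (illustrative only; real code must cope with the list changing whenever the lock was dropped):

#include <linux/spinlock.h>
#include <linux/list.h>

static void demo_scan_locked(spinlock_t *lock, struct list_head *items)
{
	struct list_head *pos;

	spin_lock(lock);
	list_for_each(pos, items) {
		/* ... examine one item ... */
		if (cond_resched_lock(lock)) {
			/* Lock was dropped and retaken: bail out, let the caller retry. */
			break;
		}
	}
	spin_unlock(lock);
}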
4584 4594
4585 int __sched cond_resched_softirq(void) 4595 int __sched cond_resched_softirq(void)
4586 { 4596 {
4587 BUG_ON(!in_softirq()); 4597 BUG_ON(!in_softirq());
4588 4598
4589 if (need_resched() && system_state == SYSTEM_RUNNING) { 4599 if (need_resched() && system_state == SYSTEM_RUNNING) {
4590 local_bh_enable(); 4600 local_bh_enable();
4591 __cond_resched(); 4601 __cond_resched();
4592 local_bh_disable(); 4602 local_bh_disable();
4593 return 1; 4603 return 1;
4594 } 4604 }
4595 return 0; 4605 return 0;
4596 } 4606 }
4597 EXPORT_SYMBOL(cond_resched_softirq); 4607 EXPORT_SYMBOL(cond_resched_softirq);
4598 4608
4599 /** 4609 /**
4600 * yield - yield the current processor to other threads. 4610 * yield - yield the current processor to other threads.
4601 * 4611 *
4602 * This is a shortcut for kernel-space yielding - it marks the 4612 * This is a shortcut for kernel-space yielding - it marks the
4603 * thread runnable and calls sys_sched_yield(). 4613 * thread runnable and calls sys_sched_yield().
4604 */ 4614 */
4605 void __sched yield(void) 4615 void __sched yield(void)
4606 { 4616 {
4607 set_current_state(TASK_RUNNING); 4617 set_current_state(TASK_RUNNING);
4608 sys_sched_yield(); 4618 sys_sched_yield();
4609 } 4619 }
4610 EXPORT_SYMBOL(yield); 4620 EXPORT_SYMBOL(yield);
4611 4621
4612 /* 4622 /*
4613 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 4623 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4614 * that process accounting knows that this is a task in IO wait state. 4624 * that process accounting knows that this is a task in IO wait state.
4615 * 4625 *
4616 * But don't do that if it is a deliberate, throttling IO wait (this task 4626 * But don't do that if it is a deliberate, throttling IO wait (this task
4617 * has set its backing_dev_info: the queue against which it should throttle) 4627 * has set its backing_dev_info: the queue against which it should throttle)
4618 */ 4628 */
4619 void __sched io_schedule(void) 4629 void __sched io_schedule(void)
4620 { 4630 {
4621 struct rq *rq = &__raw_get_cpu_var(runqueues); 4631 struct rq *rq = &__raw_get_cpu_var(runqueues);
4622 4632
4623 delayacct_blkio_start(); 4633 delayacct_blkio_start();
4624 atomic_inc(&rq->nr_iowait); 4634 atomic_inc(&rq->nr_iowait);
4625 schedule(); 4635 schedule();
4626 atomic_dec(&rq->nr_iowait); 4636 atomic_dec(&rq->nr_iowait);
4627 delayacct_blkio_end(); 4637 delayacct_blkio_end();
4628 } 4638 }
4629 EXPORT_SYMBOL(io_schedule); 4639 EXPORT_SYMBOL(io_schedule);
4630 4640
4631 long __sched io_schedule_timeout(long timeout) 4641 long __sched io_schedule_timeout(long timeout)
4632 { 4642 {
4633 struct rq *rq = &__raw_get_cpu_var(runqueues); 4643 struct rq *rq = &__raw_get_cpu_var(runqueues);
4634 long ret; 4644 long ret;
4635 4645
4636 delayacct_blkio_start(); 4646 delayacct_blkio_start();
4637 atomic_inc(&rq->nr_iowait); 4647 atomic_inc(&rq->nr_iowait);
4638 ret = schedule_timeout(timeout); 4648 ret = schedule_timeout(timeout);
4639 atomic_dec(&rq->nr_iowait); 4649 atomic_dec(&rq->nr_iowait);
4640 delayacct_blkio_end(); 4650 delayacct_blkio_end();
4641 return ret; 4651 return ret;
4642 } 4652 }
4643 4653
4644 /** 4654 /**
4645 * sys_sched_get_priority_max - return maximum RT priority. 4655 * sys_sched_get_priority_max - return maximum RT priority.
4646 * @policy: scheduling class. 4656 * @policy: scheduling class.
4647 * 4657 *
4648 * this syscall returns the maximum rt_priority that can be used 4658 * this syscall returns the maximum rt_priority that can be used
4649 * by a given scheduling class. 4659 * by a given scheduling class.
4650 */ 4660 */
4651 asmlinkage long sys_sched_get_priority_max(int policy) 4661 asmlinkage long sys_sched_get_priority_max(int policy)
4652 { 4662 {
4653 int ret = -EINVAL; 4663 int ret = -EINVAL;
4654 4664
4655 switch (policy) { 4665 switch (policy) {
4656 case SCHED_FIFO: 4666 case SCHED_FIFO:
4657 case SCHED_RR: 4667 case SCHED_RR:
4658 ret = MAX_USER_RT_PRIO-1; 4668 ret = MAX_USER_RT_PRIO-1;
4659 break; 4669 break;
4660 case SCHED_NORMAL: 4670 case SCHED_NORMAL:
4661 case SCHED_BATCH: 4671 case SCHED_BATCH:
4662 case SCHED_IDLE: 4672 case SCHED_IDLE:
4663 ret = 0; 4673 ret = 0;
4664 break; 4674 break;
4665 } 4675 }
4666 return ret; 4676 return ret;
4667 } 4677 }
4668 4678
4669 /** 4679 /**
4670 * sys_sched_get_priority_min - return minimum RT priority. 4680 * sys_sched_get_priority_min - return minimum RT priority.
4671 * @policy: scheduling class. 4681 * @policy: scheduling class.
4672 * 4682 *
4673 * this syscall returns the minimum rt_priority that can be used 4683 * this syscall returns the minimum rt_priority that can be used
4674 * by a given scheduling class. 4684 * by a given scheduling class.
4675 */ 4685 */
4676 asmlinkage long sys_sched_get_priority_min(int policy) 4686 asmlinkage long sys_sched_get_priority_min(int policy)
4677 { 4687 {
4678 int ret = -EINVAL; 4688 int ret = -EINVAL;
4679 4689
4680 switch (policy) { 4690 switch (policy) {
4681 case SCHED_FIFO: 4691 case SCHED_FIFO:
4682 case SCHED_RR: 4692 case SCHED_RR:
4683 ret = 1; 4693 ret = 1;
4684 break; 4694 break;
4685 case SCHED_NORMAL: 4695 case SCHED_NORMAL:
4686 case SCHED_BATCH: 4696 case SCHED_BATCH:
4687 case SCHED_IDLE: 4697 case SCHED_IDLE:
4688 ret = 0; 4698 ret = 0;
4689 } 4699 }
4690 return ret; 4700 return ret;
4691 } 4701 }
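Together these two syscalls let userspace discover the valid static-priority range for a policy before switching to it. A minimal sketch using the standard POSIX wrappers (illustrative only, not taken from this change):

#include <sched.h>

/* Move the calling process to SCHED_FIFO at a mid-range priority. */
static int go_fifo(void)
{
        struct sched_param sp;
        int max = sched_get_priority_max(SCHED_FIFO);   /* MAX_USER_RT_PRIO-1, i.e. 99 */
        int min = sched_get_priority_min(SCHED_FIFO);   /* 1 */

        if (max < 0 || min < 0)
                return -1;

        sp.sched_priority = min + (max - min) / 2;
        return sched_setscheduler(0, SCHED_FIFO, &sp);  /* pid 0 == self */
}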
4692 4702
4693 /** 4703 /**
4694 * sys_sched_rr_get_interval - return the default timeslice of a process. 4704 * sys_sched_rr_get_interval - return the default timeslice of a process.
4695 * @pid: pid of the process. 4705 * @pid: pid of the process.
4696 * @interval: userspace pointer to the timeslice value. 4706 * @interval: userspace pointer to the timeslice value.
4697 * 4707 *
4698 * this syscall writes the default timeslice value of a given process 4708 * this syscall writes the default timeslice value of a given process
4699 * into the user-space timespec buffer. A value of '0' means infinity. 4709 * into the user-space timespec buffer. A value of '0' means infinity.
4700 */ 4710 */
4701 asmlinkage 4711 asmlinkage
4702 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) 4712 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4703 { 4713 {
4704 struct task_struct *p; 4714 struct task_struct *p;
4705 int retval = -EINVAL; 4715 int retval = -EINVAL;
4706 struct timespec t; 4716 struct timespec t;
4707 4717
4708 if (pid < 0) 4718 if (pid < 0)
4709 goto out_nounlock; 4719 goto out_nounlock;
4710 4720
4711 retval = -ESRCH; 4721 retval = -ESRCH;
4712 read_lock(&tasklist_lock); 4722 read_lock(&tasklist_lock);
4713 p = find_process_by_pid(pid); 4723 p = find_process_by_pid(pid);
4714 if (!p) 4724 if (!p)
4715 goto out_unlock; 4725 goto out_unlock;
4716 4726
4717 retval = security_task_getscheduler(p); 4727 retval = security_task_getscheduler(p);
4718 if (retval) 4728 if (retval)
4719 goto out_unlock; 4729 goto out_unlock;
4720 4730
4721 jiffies_to_timespec(p->policy == SCHED_FIFO ? 4731 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4722 0 : static_prio_timeslice(p->static_prio), &t); 4732 0 : static_prio_timeslice(p->static_prio), &t);
4723 read_unlock(&tasklist_lock); 4733 read_unlock(&tasklist_lock);
4724 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4734 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4725 out_nounlock: 4735 out_nounlock:
4726 return retval; 4736 return retval;
4727 out_unlock: 4737 out_unlock:
4728 read_unlock(&tasklist_lock); 4738 read_unlock(&tasklist_lock);
4729 return retval; 4739 return retval;
4730 } 4740 }
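From userspace the same value is read through the corresponding POSIX wrapper; a zeroed timespec means an infinite slice (e.g. for SCHED_FIFO). A short illustrative sketch, not taken from this change:

#include <sched.h>
#include <stdio.h>
#include <time.h>
#include <sys/types.h>

static void print_timeslice(pid_t pid)
{
        struct timespec ts;

        if (sched_rr_get_interval(pid, &ts) == 0)
                printf("pid %d timeslice: %ld.%09ld s\n",
                       (int)pid, (long)ts.tv_sec, ts.tv_nsec);
        else
                perror("sched_rr_get_interval");
}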
4731 4741
4732 static const char stat_nam[] = "RSDTtZX"; 4742 static const char stat_nam[] = "RSDTtZX";
4733 4743
4734 static void show_task(struct task_struct *p) 4744 static void show_task(struct task_struct *p)
4735 { 4745 {
4736 unsigned long free = 0; 4746 unsigned long free = 0;
4737 unsigned state; 4747 unsigned state;
4738 4748
4739 state = p->state ? __ffs(p->state) + 1 : 0; 4749 state = p->state ? __ffs(p->state) + 1 : 0;
4740 printk("%-13.13s %c", p->comm, 4750 printk("%-13.13s %c", p->comm,
4741 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4751 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4742 #if BITS_PER_LONG == 32 4752 #if BITS_PER_LONG == 32
4743 if (state == TASK_RUNNING) 4753 if (state == TASK_RUNNING)
4744 printk(" running "); 4754 printk(" running ");
4745 else 4755 else
4746 printk(" %08lx ", thread_saved_pc(p)); 4756 printk(" %08lx ", thread_saved_pc(p));
4747 #else 4757 #else
4748 if (state == TASK_RUNNING) 4758 if (state == TASK_RUNNING)
4749 printk(" running task "); 4759 printk(" running task ");
4750 else 4760 else
4751 printk(" %016lx ", thread_saved_pc(p)); 4761 printk(" %016lx ", thread_saved_pc(p));
4752 #endif 4762 #endif
4753 #ifdef CONFIG_DEBUG_STACK_USAGE 4763 #ifdef CONFIG_DEBUG_STACK_USAGE
4754 { 4764 {
4755 unsigned long *n = end_of_stack(p); 4765 unsigned long *n = end_of_stack(p);
4756 while (!*n) 4766 while (!*n)
4757 n++; 4767 n++;
4758 free = (unsigned long)n - (unsigned long)end_of_stack(p); 4768 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4759 } 4769 }
4760 #endif 4770 #endif
4761 printk("%5lu %5d %6d\n", free, p->pid, p->parent->pid); 4771 printk("%5lu %5d %6d\n", free, p->pid, p->parent->pid);
4762 4772
4763 if (state != TASK_RUNNING) 4773 if (state != TASK_RUNNING)
4764 show_stack(p, NULL); 4774 show_stack(p, NULL);
4765 } 4775 }
4766 4776
4767 void show_state_filter(unsigned long state_filter) 4777 void show_state_filter(unsigned long state_filter)
4768 { 4778 {
4769 struct task_struct *g, *p; 4779 struct task_struct *g, *p;
4770 4780
4771 #if BITS_PER_LONG == 32 4781 #if BITS_PER_LONG == 32
4772 printk(KERN_INFO 4782 printk(KERN_INFO
4773 " task PC stack pid father\n"); 4783 " task PC stack pid father\n");
4774 #else 4784 #else
4775 printk(KERN_INFO 4785 printk(KERN_INFO
4776 " task PC stack pid father\n"); 4786 " task PC stack pid father\n");
4777 #endif 4787 #endif
4778 read_lock(&tasklist_lock); 4788 read_lock(&tasklist_lock);
4779 do_each_thread(g, p) { 4789 do_each_thread(g, p) {
4780 /* 4790 /*
4781 * reset the NMI-timeout, listing all files on a slow 4791 * reset the NMI-timeout, listing all files on a slow
4782 * console might take a lot of time: 4792 * console might take a lot of time:
4783 */ 4793 */
4784 touch_nmi_watchdog(); 4794 touch_nmi_watchdog();
4785 if (!state_filter || (p->state & state_filter)) 4795 if (!state_filter || (p->state & state_filter))
4786 show_task(p); 4796 show_task(p);
4787 } while_each_thread(g, p); 4797 } while_each_thread(g, p);
4788 4798
4789 touch_all_softlockup_watchdogs(); 4799 touch_all_softlockup_watchdogs();
4790 4800
4791 #ifdef CONFIG_SCHED_DEBUG 4801 #ifdef CONFIG_SCHED_DEBUG
4792 sysrq_sched_debug_show(); 4802 sysrq_sched_debug_show();
4793 #endif 4803 #endif
4794 read_unlock(&tasklist_lock); 4804 read_unlock(&tasklist_lock);
4795 /* 4805 /*
4796 * Only show locks if all tasks are dumped: 4806 * Only show locks if all tasks are dumped:
4797 */ 4807 */
4798 if (state_filter == -1) 4808 if (state_filter == -1)
4799 debug_show_all_locks(); 4809 debug_show_all_locks();
4800 } 4810 }
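The state_filter argument is a mask of task-state bits: 0 disables filtering, and -1 (all bits set) also triggers the lock dump above while still skipping TASK_RUNNING tasks, whose state is 0. A tiny illustrative caller (the SysRq handlers are the real users; dump_tasks_example() is hypothetical):

static void dump_tasks_example(void)
{
        show_state_filter(0);                    /* no filter: every task */
        show_state_filter(TASK_UNINTERRUPTIBLE); /* blocked (D-state) tasks only */
        show_state_filter(-1);                   /* every non-running task + lock dump */
}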
4801 4811
4802 void __cpuinit init_idle_bootup_task(struct task_struct *idle) 4812 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4803 { 4813 {
4804 idle->sched_class = &idle_sched_class; 4814 idle->sched_class = &idle_sched_class;
4805 } 4815 }
4806 4816
4807 /** 4817 /**
4808 * init_idle - set up an idle thread for a given CPU 4818 * init_idle - set up an idle thread for a given CPU
4809 * @idle: task in question 4819 * @idle: task in question
4810 * @cpu: cpu the idle task belongs to 4820 * @cpu: cpu the idle task belongs to
4811 * 4821 *
4812 * NOTE: this function does not set the idle thread's NEED_RESCHED 4822 * NOTE: this function does not set the idle thread's NEED_RESCHED
4813 * flag, to make booting more robust. 4823 * flag, to make booting more robust.
4814 */ 4824 */
4815 void __cpuinit init_idle(struct task_struct *idle, int cpu) 4825 void __cpuinit init_idle(struct task_struct *idle, int cpu)
4816 { 4826 {
4817 struct rq *rq = cpu_rq(cpu); 4827 struct rq *rq = cpu_rq(cpu);
4818 unsigned long flags; 4828 unsigned long flags;
4819 4829
4820 __sched_fork(idle); 4830 __sched_fork(idle);
4821 idle->se.exec_start = sched_clock(); 4831 idle->se.exec_start = sched_clock();
4822 4832
4823 idle->prio = idle->normal_prio = MAX_PRIO; 4833 idle->prio = idle->normal_prio = MAX_PRIO;
4824 idle->cpus_allowed = cpumask_of_cpu(cpu); 4834 idle->cpus_allowed = cpumask_of_cpu(cpu);
4825 __set_task_cpu(idle, cpu); 4835 __set_task_cpu(idle, cpu);
4826 4836
4827 spin_lock_irqsave(&rq->lock, flags); 4837 spin_lock_irqsave(&rq->lock, flags);
4828 rq->curr = rq->idle = idle; 4838 rq->curr = rq->idle = idle;
4829 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 4839 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4830 idle->oncpu = 1; 4840 idle->oncpu = 1;
4831 #endif 4841 #endif
4832 spin_unlock_irqrestore(&rq->lock, flags); 4842 spin_unlock_irqrestore(&rq->lock, flags);
4833 4843
4834 /* Set the preempt count _outside_ the spinlocks! */ 4844 /* Set the preempt count _outside_ the spinlocks! */
4835 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) 4845 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4836 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); 4846 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4837 #else 4847 #else
4838 task_thread_info(idle)->preempt_count = 0; 4848 task_thread_info(idle)->preempt_count = 0;
4839 #endif 4849 #endif
4840 /* 4850 /*
4841 * The idle tasks have their own, simple scheduling class: 4851 * The idle tasks have their own, simple scheduling class:
4842 */ 4852 */
4843 idle->sched_class = &idle_sched_class; 4853 idle->sched_class = &idle_sched_class;
4844 } 4854 }
4845 4855
4846 /* 4856 /*
4847 * In a system that switches off the HZ timer nohz_cpu_mask 4857 * In a system that switches off the HZ timer nohz_cpu_mask
4848 * indicates which cpus entered this state. This is used 4858 * indicates which cpus entered this state. This is used
4849 * in the rcu update to wait only for active cpus. For systems 4859 * in the rcu update to wait only for active cpus. For systems
4850 * which do not switch off the HZ timer nohz_cpu_mask should 4860 * which do not switch off the HZ timer nohz_cpu_mask should
4851 * always be CPU_MASK_NONE. 4861 * always be CPU_MASK_NONE.
4852 */ 4862 */
4853 cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 4863 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4854 4864
4855 /* 4865 /*
4856 * Increase the granularity value when there are more CPUs, 4866 * Increase the granularity value when there are more CPUs,
4857 * because with more CPUs the 'effective latency' as visible 4867 * because with more CPUs the 'effective latency' as visible
4858 * to users decreases. But the relationship is not linear, 4868 * to users decreases. But the relationship is not linear,
4859 * so pick a second-best guess by going with the log2 of the 4869 * so pick a second-best guess by going with the log2 of the
4860 * number of CPUs. 4870 * number of CPUs.
4861 * 4871 *
4862 * This idea comes from the SD scheduler of Con Kolivas: 4872 * This idea comes from the SD scheduler of Con Kolivas:
4863 */ 4873 */
4864 static inline void sched_init_granularity(void) 4874 static inline void sched_init_granularity(void)
4865 { 4875 {
4866 unsigned int factor = 1 + ilog2(num_online_cpus()); 4876 unsigned int factor = 1 + ilog2(num_online_cpus());
4867 const unsigned long gran_limit = 100000000; 4877 const unsigned long gran_limit = 100000000;
4868 4878
4869 sysctl_sched_granularity *= factor; 4879 sysctl_sched_granularity *= factor;
4870 if (sysctl_sched_granularity > gran_limit) 4880 if (sysctl_sched_granularity > gran_limit)
4871 sysctl_sched_granularity = gran_limit; 4881 sysctl_sched_granularity = gran_limit;
4872 4882
4873 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4; 4883 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4874 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; 4884 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4875 } 4885 }
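A worked example of the scaling above, assuming a base granularity of 10 ms (the boot-time default may differ): with 8 online CPUs, factor = 1 + ilog2(8) = 4, so the granularity becomes 40 ms (under the 100 ms cap), the runtime limit 4 * 40 = 160 ms and the wakeup granularity 40 / 2 = 20 ms. The factor grows only logarithmically with the CPU count:

        factor(1 CPU)   = 1 + ilog2(1)  = 1
        factor(2 CPUs)  = 1 + ilog2(2)  = 2
        factor(8 CPUs)  = 1 + ilog2(8)  = 4
        factor(64 CPUs) = 1 + ilog2(64) = 7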
4876 4886
4877 #ifdef CONFIG_SMP 4887 #ifdef CONFIG_SMP
4878 /* 4888 /*
4879 * This is how migration works: 4889 * This is how migration works:
4880 * 4890 *
4881 * 1) we queue a struct migration_req structure in the source CPU's 4891 * 1) we queue a struct migration_req structure in the source CPU's
4882 * runqueue and wake up that CPU's migration thread. 4892 * runqueue and wake up that CPU's migration thread.
4883 * 2) we down() the locked semaphore => thread blocks. 4893 * 2) we down() the locked semaphore => thread blocks.
4884 * 3) migration thread wakes up (implicitly it forces the migrated 4894 * 3) migration thread wakes up (implicitly it forces the migrated
4885 * thread off the CPU) 4895 * thread off the CPU)
4886 * 4) it gets the migration request and checks whether the migrated 4896 * 4) it gets the migration request and checks whether the migrated
4887 * task is still in the wrong runqueue. 4897 * task is still in the wrong runqueue.
4888 * 5) if it's in the wrong runqueue then the migration thread removes 4898 * 5) if it's in the wrong runqueue then the migration thread removes
4889 * it and puts it into the right queue. 4899 * it and puts it into the right queue.
4890 * 6) migration thread up()s the semaphore. 4900 * 6) migration thread up()s the semaphore.
4891 * 7) we wake up and the migration is done. 4901 * 7) we wake up and the migration is done.
4892 */ 4902 */
4893 4903
4894 /* 4904 /*
4895 * Change a given task's CPU affinity. Migrate the thread to a 4905 * Change a given task's CPU affinity. Migrate the thread to a
4896 * proper CPU and schedule it away if the CPU it's executing on 4906 * proper CPU and schedule it away if the CPU it's executing on
4897 * is removed from the allowed bitmask. 4907 * is removed from the allowed bitmask.
4898 * 4908 *
4899 * NOTE: the caller must have a valid reference to the task, the 4909 * NOTE: the caller must have a valid reference to the task, the
4900 * task must not exit() & deallocate itself prematurely. The 4910 * task must not exit() & deallocate itself prematurely. The
4901 * call is not atomic; no spinlocks may be held. 4911 * call is not atomic; no spinlocks may be held.
4902 */ 4912 */
4903 int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) 4913 int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
4904 { 4914 {
4905 struct migration_req req; 4915 struct migration_req req;
4906 unsigned long flags; 4916 unsigned long flags;
4907 struct rq *rq; 4917 struct rq *rq;
4908 int ret = 0; 4918 int ret = 0;
4909 4919
4910 rq = task_rq_lock(p, &flags); 4920 rq = task_rq_lock(p, &flags);
4911 if (!cpus_intersects(new_mask, cpu_online_map)) { 4921 if (!cpus_intersects(new_mask, cpu_online_map)) {
4912 ret = -EINVAL; 4922 ret = -EINVAL;
4913 goto out; 4923 goto out;
4914 } 4924 }
4915 4925
4916 p->cpus_allowed = new_mask; 4926 p->cpus_allowed = new_mask;
4917 /* Can the task run on the task's current CPU? If so, we're done */ 4927 /* Can the task run on the task's current CPU? If so, we're done */
4918 if (cpu_isset(task_cpu(p), new_mask)) 4928 if (cpu_isset(task_cpu(p), new_mask))
4919 goto out; 4929 goto out;
4920 4930
4921 if (migrate_task(p, any_online_cpu(new_mask), &req)) { 4931 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4922 /* Need help from migration thread: drop lock and wait. */ 4932 /* Need help from migration thread: drop lock and wait. */
4923 task_rq_unlock(rq, &flags); 4933 task_rq_unlock(rq, &flags);
4924 wake_up_process(rq->migration_thread); 4934 wake_up_process(rq->migration_thread);
4925 wait_for_completion(&req.done); 4935 wait_for_completion(&req.done);
4926 tlb_migrate_finish(p->mm); 4936 tlb_migrate_finish(p->mm);
4927 return 0; 4937 return 0;
4928 } 4938 }
4929 out: 4939 out:
4930 task_rq_unlock(rq, &flags); 4940 task_rq_unlock(rq, &flags);
4931 4941
4932 return ret; 4942 return ret;
4933 } 4943 }
4934 EXPORT_SYMBOL_GPL(set_cpus_allowed); 4944 EXPORT_SYMBOL_GPL(set_cpus_allowed);
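A minimal kernel-side sketch of the exported interface, pinning a hypothetical worker kthread to one CPU (pin_worker() is illustrative only, not real code):

static int pin_worker(struct task_struct *worker, int cpu)
{
        int err;

        /* Fails with -EINVAL if the new mask contains no online CPU. */
        err = set_cpus_allowed(worker, cpumask_of_cpu(cpu));
        if (err)
                printk(KERN_WARNING "cannot pin worker to cpu%d: %d\n",
                       cpu, err);
        return err;
}

For a kthread that has not started running yet, kthread_bind() (as used by migration_call() further down) is the more common way to achieve the same effect.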
4935 4945
4936 /* 4946 /*
4937 * Move (not current) task off this cpu, onto dest cpu. We're doing 4947 * Move (not current) task off this cpu, onto dest cpu. We're doing
4938 * this because either it can't run here any more (set_cpus_allowed() 4948 * this because either it can't run here any more (set_cpus_allowed()
4939 * away from this CPU, or CPU going down), or because we're 4949 * away from this CPU, or CPU going down), or because we're
4940 * attempting to rebalance this task on exec (sched_exec). 4950 * attempting to rebalance this task on exec (sched_exec).
4941 * 4951 *
4942 * So we race with normal scheduler movements, but that's OK, as long 4952 * So we race with normal scheduler movements, but that's OK, as long
4943 * as the task is no longer on this CPU. 4953 * as the task is no longer on this CPU.
4944 * 4954 *
4945 * Returns non-zero if task was successfully migrated. 4955 * Returns non-zero if task was successfully migrated.
4946 */ 4956 */
4947 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4957 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4948 { 4958 {
4949 struct rq *rq_dest, *rq_src; 4959 struct rq *rq_dest, *rq_src;
4950 int ret = 0, on_rq; 4960 int ret = 0, on_rq;
4951 4961
4952 if (unlikely(cpu_is_offline(dest_cpu))) 4962 if (unlikely(cpu_is_offline(dest_cpu)))
4953 return ret; 4963 return ret;
4954 4964
4955 rq_src = cpu_rq(src_cpu); 4965 rq_src = cpu_rq(src_cpu);
4956 rq_dest = cpu_rq(dest_cpu); 4966 rq_dest = cpu_rq(dest_cpu);
4957 4967
4958 double_rq_lock(rq_src, rq_dest); 4968 double_rq_lock(rq_src, rq_dest);
4959 /* Already moved. */ 4969 /* Already moved. */
4960 if (task_cpu(p) != src_cpu) 4970 if (task_cpu(p) != src_cpu)
4961 goto out; 4971 goto out;
4962 /* Affinity changed (again). */ 4972 /* Affinity changed (again). */
4963 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 4973 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4964 goto out; 4974 goto out;
4965 4975
4966 on_rq = p->se.on_rq; 4976 on_rq = p->se.on_rq;
4967 if (on_rq) 4977 if (on_rq)
4968 deactivate_task(rq_src, p, 0); 4978 deactivate_task(rq_src, p, 0);
4969 set_task_cpu(p, dest_cpu); 4979 set_task_cpu(p, dest_cpu);
4970 if (on_rq) { 4980 if (on_rq) {
4971 activate_task(rq_dest, p, 0); 4981 activate_task(rq_dest, p, 0);
4972 check_preempt_curr(rq_dest, p); 4982 check_preempt_curr(rq_dest, p);
4973 } 4983 }
4974 ret = 1; 4984 ret = 1;
4975 out: 4985 out:
4976 double_rq_unlock(rq_src, rq_dest); 4986 double_rq_unlock(rq_src, rq_dest);
4977 return ret; 4987 return ret;
4978 } 4988 }
4979 4989
4980 /* 4990 /*
4981 * migration_thread - this is a highprio system thread that performs 4991 * migration_thread - this is a highprio system thread that performs
4982 * thread migration by bumping thread off CPU then 'pushing' onto 4992 * thread migration by bumping thread off CPU then 'pushing' onto
4983 * another runqueue. 4993 * another runqueue.
4984 */ 4994 */
4985 static int migration_thread(void *data) 4995 static int migration_thread(void *data)
4986 { 4996 {
4987 int cpu = (long)data; 4997 int cpu = (long)data;
4988 struct rq *rq; 4998 struct rq *rq;
4989 4999
4990 rq = cpu_rq(cpu); 5000 rq = cpu_rq(cpu);
4991 BUG_ON(rq->migration_thread != current); 5001 BUG_ON(rq->migration_thread != current);
4992 5002
4993 set_current_state(TASK_INTERRUPTIBLE); 5003 set_current_state(TASK_INTERRUPTIBLE);
4994 while (!kthread_should_stop()) { 5004 while (!kthread_should_stop()) {
4995 struct migration_req *req; 5005 struct migration_req *req;
4996 struct list_head *head; 5006 struct list_head *head;
4997 5007
4998 spin_lock_irq(&rq->lock); 5008 spin_lock_irq(&rq->lock);
4999 5009
5000 if (cpu_is_offline(cpu)) { 5010 if (cpu_is_offline(cpu)) {
5001 spin_unlock_irq(&rq->lock); 5011 spin_unlock_irq(&rq->lock);
5002 goto wait_to_die; 5012 goto wait_to_die;
5003 } 5013 }
5004 5014
5005 if (rq->active_balance) { 5015 if (rq->active_balance) {
5006 active_load_balance(rq, cpu); 5016 active_load_balance(rq, cpu);
5007 rq->active_balance = 0; 5017 rq->active_balance = 0;
5008 } 5018 }
5009 5019
5010 head = &rq->migration_queue; 5020 head = &rq->migration_queue;
5011 5021
5012 if (list_empty(head)) { 5022 if (list_empty(head)) {
5013 spin_unlock_irq(&rq->lock); 5023 spin_unlock_irq(&rq->lock);
5014 schedule(); 5024 schedule();
5015 set_current_state(TASK_INTERRUPTIBLE); 5025 set_current_state(TASK_INTERRUPTIBLE);
5016 continue; 5026 continue;
5017 } 5027 }
5018 req = list_entry(head->next, struct migration_req, list); 5028 req = list_entry(head->next, struct migration_req, list);
5019 list_del_init(head->next); 5029 list_del_init(head->next);
5020 5030
5021 spin_unlock(&rq->lock); 5031 spin_unlock(&rq->lock);
5022 __migrate_task(req->task, cpu, req->dest_cpu); 5032 __migrate_task(req->task, cpu, req->dest_cpu);
5023 local_irq_enable(); 5033 local_irq_enable();
5024 5034
5025 complete(&req->done); 5035 complete(&req->done);
5026 } 5036 }
5027 __set_current_state(TASK_RUNNING); 5037 __set_current_state(TASK_RUNNING);
5028 return 0; 5038 return 0;
5029 5039
5030 wait_to_die: 5040 wait_to_die:
5031 /* Wait for kthread_stop */ 5041 /* Wait for kthread_stop */
5032 set_current_state(TASK_INTERRUPTIBLE); 5042 set_current_state(TASK_INTERRUPTIBLE);
5033 while (!kthread_should_stop()) { 5043 while (!kthread_should_stop()) {
5034 schedule(); 5044 schedule();
5035 set_current_state(TASK_INTERRUPTIBLE); 5045 set_current_state(TASK_INTERRUPTIBLE);
5036 } 5046 }
5037 __set_current_state(TASK_RUNNING); 5047 __set_current_state(TASK_RUNNING);
5038 return 0; 5048 return 0;
5039 } 5049 }
5040 5050
5041 #ifdef CONFIG_HOTPLUG_CPU 5051 #ifdef CONFIG_HOTPLUG_CPU
5042 /* 5052 /*
5043 * Figure out where task on dead CPU should go, use force if necessary. 5053 * Figure out where task on dead CPU should go, use force if necessary.
5044 * NOTE: interrupts should be disabled by the caller 5054 * NOTE: interrupts should be disabled by the caller
5045 */ 5055 */
5046 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5056 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5047 { 5057 {
5048 unsigned long flags; 5058 unsigned long flags;
5049 cpumask_t mask; 5059 cpumask_t mask;
5050 struct rq *rq; 5060 struct rq *rq;
5051 int dest_cpu; 5061 int dest_cpu;
5052 5062
5053 restart: 5063 restart:
5054 /* On same node? */ 5064 /* On same node? */
5055 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 5065 mask = node_to_cpumask(cpu_to_node(dead_cpu));
5056 cpus_and(mask, mask, p->cpus_allowed); 5066 cpus_and(mask, mask, p->cpus_allowed);
5057 dest_cpu = any_online_cpu(mask); 5067 dest_cpu = any_online_cpu(mask);
5058 5068
5059 /* On any allowed CPU? */ 5069 /* On any allowed CPU? */
5060 if (dest_cpu == NR_CPUS) 5070 if (dest_cpu == NR_CPUS)
5061 dest_cpu = any_online_cpu(p->cpus_allowed); 5071 dest_cpu = any_online_cpu(p->cpus_allowed);
5062 5072
5063 /* No more Mr. Nice Guy. */ 5073 /* No more Mr. Nice Guy. */
5064 if (dest_cpu == NR_CPUS) { 5074 if (dest_cpu == NR_CPUS) {
5065 rq = task_rq_lock(p, &flags); 5075 rq = task_rq_lock(p, &flags);
5066 cpus_setall(p->cpus_allowed); 5076 cpus_setall(p->cpus_allowed);
5067 dest_cpu = any_online_cpu(p->cpus_allowed); 5077 dest_cpu = any_online_cpu(p->cpus_allowed);
5068 task_rq_unlock(rq, &flags); 5078 task_rq_unlock(rq, &flags);
5069 5079
5070 /* 5080 /*
5071 * Don't tell them about moving exiting tasks or 5081 * Don't tell them about moving exiting tasks or
5072 * kernel threads (both mm NULL), since they never 5082 * kernel threads (both mm NULL), since they never
5073 * leave kernel. 5083 * leave kernel.
5074 */ 5084 */
5075 if (p->mm && printk_ratelimit()) 5085 if (p->mm && printk_ratelimit())
5076 printk(KERN_INFO "process %d (%s) no " 5086 printk(KERN_INFO "process %d (%s) no "
5077 "longer affine to cpu%d\n", 5087 "longer affine to cpu%d\n",
5078 p->pid, p->comm, dead_cpu); 5088 p->pid, p->comm, dead_cpu);
5079 } 5089 }
5080 if (!__migrate_task(p, dead_cpu, dest_cpu)) 5090 if (!__migrate_task(p, dead_cpu, dest_cpu))
5081 goto restart; 5091 goto restart;
5082 } 5092 }
5083 5093
5084 /* 5094 /*
5085 * While a dead CPU has no uninterruptible tasks queued at this point, 5095 * While a dead CPU has no uninterruptible tasks queued at this point,
5086 * it might still have a nonzero ->nr_uninterruptible counter, because 5096 * it might still have a nonzero ->nr_uninterruptible counter, because
5087 * for performance reasons the counter is not strictly tracking tasks to 5097 * for performance reasons the counter is not strictly tracking tasks to
5088 * their home CPUs. So we just add the counter to another CPU's counter, 5098 * their home CPUs. So we just add the counter to another CPU's counter,
5089 * to keep the global sum constant after CPU-down: 5099 * to keep the global sum constant after CPU-down:
5090 */ 5100 */
5091 static void migrate_nr_uninterruptible(struct rq *rq_src) 5101 static void migrate_nr_uninterruptible(struct rq *rq_src)
5092 { 5102 {
5093 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); 5103 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
5094 unsigned long flags; 5104 unsigned long flags;
5095 5105
5096 local_irq_save(flags); 5106 local_irq_save(flags);
5097 double_rq_lock(rq_src, rq_dest); 5107 double_rq_lock(rq_src, rq_dest);
5098 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5108 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5099 rq_src->nr_uninterruptible = 0; 5109 rq_src->nr_uninterruptible = 0;
5100 double_rq_unlock(rq_src, rq_dest); 5110 double_rq_unlock(rq_src, rq_dest);
5101 local_irq_restore(flags); 5111 local_irq_restore(flags);
5102 } 5112 }
5103 5113
5104 /* Run through task list and migrate tasks from the dead cpu. */ 5114 /* Run through task list and migrate tasks from the dead cpu. */
5105 static void migrate_live_tasks(int src_cpu) 5115 static void migrate_live_tasks(int src_cpu)
5106 { 5116 {
5107 struct task_struct *p, *t; 5117 struct task_struct *p, *t;
5108 5118
5109 write_lock_irq(&tasklist_lock); 5119 write_lock_irq(&tasklist_lock);
5110 5120
5111 do_each_thread(t, p) { 5121 do_each_thread(t, p) {
5112 if (p == current) 5122 if (p == current)
5113 continue; 5123 continue;
5114 5124
5115 if (task_cpu(p) == src_cpu) 5125 if (task_cpu(p) == src_cpu)
5116 move_task_off_dead_cpu(src_cpu, p); 5126 move_task_off_dead_cpu(src_cpu, p);
5117 } while_each_thread(t, p); 5127 } while_each_thread(t, p);
5118 5128
5119 write_unlock_irq(&tasklist_lock); 5129 write_unlock_irq(&tasklist_lock);
5120 } 5130 }
5121 5131
5122 /* 5132 /*
5123 * Schedules idle task to be the next runnable task on current CPU. 5133 * Schedules idle task to be the next runnable task on current CPU.
5124 * It does so by boosting its priority to highest possible and adding it to 5134 * It does so by boosting its priority to highest possible and adding it to
5125 * the _front_ of the runqueue. Used by CPU offline code. 5135 * the _front_ of the runqueue. Used by CPU offline code.
5126 */ 5136 */
5127 void sched_idle_next(void) 5137 void sched_idle_next(void)
5128 { 5138 {
5129 int this_cpu = smp_processor_id(); 5139 int this_cpu = smp_processor_id();
5130 struct rq *rq = cpu_rq(this_cpu); 5140 struct rq *rq = cpu_rq(this_cpu);
5131 struct task_struct *p = rq->idle; 5141 struct task_struct *p = rq->idle;
5132 unsigned long flags; 5142 unsigned long flags;
5133 5143
5134 /* cpu has to be offline */ 5144 /* cpu has to be offline */
5135 BUG_ON(cpu_online(this_cpu)); 5145 BUG_ON(cpu_online(this_cpu));
5136 5146
5137 /* 5147 /*
5138 * Strictly not necessary since rest of the CPUs are stopped by now 5148 * Strictly not necessary since rest of the CPUs are stopped by now
5139 * and interrupts disabled on the current cpu. 5149 * and interrupts disabled on the current cpu.
5140 */ 5150 */
5141 spin_lock_irqsave(&rq->lock, flags); 5151 spin_lock_irqsave(&rq->lock, flags);
5142 5152
5143 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5153 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5144 5154
5145 /* Add idle task to the _front_ of its priority queue: */ 5155 /* Add idle task to the _front_ of its priority queue: */
5146 activate_idle_task(p, rq); 5156 activate_idle_task(p, rq);
5147 5157
5148 spin_unlock_irqrestore(&rq->lock, flags); 5158 spin_unlock_irqrestore(&rq->lock, flags);
5149 } 5159 }
5150 5160
5151 /* 5161 /*
5152 * Ensures that the idle task is using init_mm right before its cpu goes 5162 * Ensures that the idle task is using init_mm right before its cpu goes
5153 * offline. 5163 * offline.
5154 */ 5164 */
5155 void idle_task_exit(void) 5165 void idle_task_exit(void)
5156 { 5166 {
5157 struct mm_struct *mm = current->active_mm; 5167 struct mm_struct *mm = current->active_mm;
5158 5168
5159 BUG_ON(cpu_online(smp_processor_id())); 5169 BUG_ON(cpu_online(smp_processor_id()));
5160 5170
5161 if (mm != &init_mm) 5171 if (mm != &init_mm)
5162 switch_mm(mm, &init_mm, current); 5172 switch_mm(mm, &init_mm, current);
5163 mmdrop(mm); 5173 mmdrop(mm);
5164 } 5174 }
5165 5175
5166 /* called under rq->lock with disabled interrupts */ 5176 /* called under rq->lock with disabled interrupts */
5167 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) 5177 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5168 { 5178 {
5169 struct rq *rq = cpu_rq(dead_cpu); 5179 struct rq *rq = cpu_rq(dead_cpu);
5170 5180
5171 /* Must be exiting, otherwise would be on tasklist. */ 5181 /* Must be exiting, otherwise would be on tasklist. */
5172 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); 5182 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
5173 5183
5174 /* Cannot have done final schedule yet: would have vanished. */ 5184 /* Cannot have done final schedule yet: would have vanished. */
5175 BUG_ON(p->state == TASK_DEAD); 5185 BUG_ON(p->state == TASK_DEAD);
5176 5186
5177 get_task_struct(p); 5187 get_task_struct(p);
5178 5188
5179 /* 5189 /*
5180 * Drop lock around migration; if someone else moves it, 5190 * Drop lock around migration; if someone else moves it,
5181 * that's OK. No task can be added to this CPU, so iteration is 5191 * that's OK. No task can be added to this CPU, so iteration is
5182 * fine. 5192 * fine.
5183 * NOTE: interrupts should be left disabled --dev@ 5193 * NOTE: interrupts should be left disabled --dev@
5184 */ 5194 */
5185 spin_unlock(&rq->lock); 5195 spin_unlock(&rq->lock);
5186 move_task_off_dead_cpu(dead_cpu, p); 5196 move_task_off_dead_cpu(dead_cpu, p);
5187 spin_lock(&rq->lock); 5197 spin_lock(&rq->lock);
5188 5198
5189 put_task_struct(p); 5199 put_task_struct(p);
5190 } 5200 }
5191 5201
5192 /* release_task() removes task from tasklist, so we won't find dead tasks. */ 5202 /* release_task() removes task from tasklist, so we won't find dead tasks. */
5193 static void migrate_dead_tasks(unsigned int dead_cpu) 5203 static void migrate_dead_tasks(unsigned int dead_cpu)
5194 { 5204 {
5195 struct rq *rq = cpu_rq(dead_cpu); 5205 struct rq *rq = cpu_rq(dead_cpu);
5196 struct task_struct *next; 5206 struct task_struct *next;
5197 5207
5198 for ( ; ; ) { 5208 for ( ; ; ) {
5199 if (!rq->nr_running) 5209 if (!rq->nr_running)
5200 break; 5210 break;
5201 next = pick_next_task(rq, rq->curr, rq_clock(rq)); 5211 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5202 if (!next) 5212 if (!next)
5203 break; 5213 break;
5204 migrate_dead(dead_cpu, next); 5214 migrate_dead(dead_cpu, next);
5205 5215
5206 } 5216 }
5207 } 5217 }
5208 #endif /* CONFIG_HOTPLUG_CPU */ 5218 #endif /* CONFIG_HOTPLUG_CPU */
5209 5219
5210 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5220 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
5211 5221
5212 static struct ctl_table sd_ctl_dir[] = { 5222 static struct ctl_table sd_ctl_dir[] = {
5213 {CTL_UNNUMBERED, "sched_domain", NULL, 0, 0755, NULL, }, 5223 {CTL_UNNUMBERED, "sched_domain", NULL, 0, 0755, NULL, },
5214 {0,}, 5224 {0,},
5215 }; 5225 };
5216 5226
5217 static struct ctl_table sd_ctl_root[] = { 5227 static struct ctl_table sd_ctl_root[] = {
5218 {CTL_UNNUMBERED, "kernel", NULL, 0, 0755, sd_ctl_dir, }, 5228 {CTL_UNNUMBERED, "kernel", NULL, 0, 0755, sd_ctl_dir, },
5219 {0,}, 5229 {0,},
5220 }; 5230 };
5221 5231
5222 static struct ctl_table *sd_alloc_ctl_entry(int n) 5232 static struct ctl_table *sd_alloc_ctl_entry(int n)
5223 { 5233 {
5224 struct ctl_table *entry = 5234 struct ctl_table *entry =
5225 kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL); 5235 kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL);
5226 5236
5227 BUG_ON(!entry); 5237 BUG_ON(!entry);
5228 memset(entry, 0, n * sizeof(struct ctl_table)); 5238 memset(entry, 0, n * sizeof(struct ctl_table));
5229 5239
5230 return entry; 5240 return entry;
5231 } 5241 }
5232 5242
5233 static void 5243 static void
5234 set_table_entry(struct ctl_table *entry, int ctl_name, 5244 set_table_entry(struct ctl_table *entry, int ctl_name,
5235 const char *procname, void *data, int maxlen, 5245 const char *procname, void *data, int maxlen,
5236 mode_t mode, proc_handler *proc_handler) 5246 mode_t mode, proc_handler *proc_handler)
5237 { 5247 {
5238 entry->ctl_name = ctl_name; 5248 entry->ctl_name = ctl_name;
5239 entry->procname = procname; 5249 entry->procname = procname;
5240 entry->data = data; 5250 entry->data = data;
5241 entry->maxlen = maxlen; 5251 entry->maxlen = maxlen;
5242 entry->mode = mode; 5252 entry->mode = mode;
5243 entry->proc_handler = proc_handler; 5253 entry->proc_handler = proc_handler;
5244 } 5254 }
5245 5255
5246 static struct ctl_table * 5256 static struct ctl_table *
5247 sd_alloc_ctl_domain_table(struct sched_domain *sd) 5257 sd_alloc_ctl_domain_table(struct sched_domain *sd)
5248 { 5258 {
5249 struct ctl_table *table = sd_alloc_ctl_entry(14); 5259 struct ctl_table *table = sd_alloc_ctl_entry(14);
5250 5260
5251 set_table_entry(&table[0], 1, "min_interval", &sd->min_interval, 5261 set_table_entry(&table[0], 1, "min_interval", &sd->min_interval,
5252 sizeof(long), 0644, proc_doulongvec_minmax); 5262 sizeof(long), 0644, proc_doulongvec_minmax);
5253 set_table_entry(&table[1], 2, "max_interval", &sd->max_interval, 5263 set_table_entry(&table[1], 2, "max_interval", &sd->max_interval,
5254 sizeof(long), 0644, proc_doulongvec_minmax); 5264 sizeof(long), 0644, proc_doulongvec_minmax);
5255 set_table_entry(&table[2], 3, "busy_idx", &sd->busy_idx, 5265 set_table_entry(&table[2], 3, "busy_idx", &sd->busy_idx,
5256 sizeof(int), 0644, proc_dointvec_minmax); 5266 sizeof(int), 0644, proc_dointvec_minmax);
5257 set_table_entry(&table[3], 4, "idle_idx", &sd->idle_idx, 5267 set_table_entry(&table[3], 4, "idle_idx", &sd->idle_idx,
5258 sizeof(int), 0644, proc_dointvec_minmax); 5268 sizeof(int), 0644, proc_dointvec_minmax);
5259 set_table_entry(&table[4], 5, "newidle_idx", &sd->newidle_idx, 5269 set_table_entry(&table[4], 5, "newidle_idx", &sd->newidle_idx,
5260 sizeof(int), 0644, proc_dointvec_minmax); 5270 sizeof(int), 0644, proc_dointvec_minmax);
5261 set_table_entry(&table[5], 6, "wake_idx", &sd->wake_idx, 5271 set_table_entry(&table[5], 6, "wake_idx", &sd->wake_idx,
5262 sizeof(int), 0644, proc_dointvec_minmax); 5272 sizeof(int), 0644, proc_dointvec_minmax);
5263 set_table_entry(&table[6], 7, "forkexec_idx", &sd->forkexec_idx, 5273 set_table_entry(&table[6], 7, "forkexec_idx", &sd->forkexec_idx,
5264 sizeof(int), 0644, proc_dointvec_minmax); 5274 sizeof(int), 0644, proc_dointvec_minmax);
5265 set_table_entry(&table[7], 8, "busy_factor", &sd->busy_factor, 5275 set_table_entry(&table[7], 8, "busy_factor", &sd->busy_factor,
5266 sizeof(int), 0644, proc_dointvec_minmax); 5276 sizeof(int), 0644, proc_dointvec_minmax);
5267 set_table_entry(&table[8], 9, "imbalance_pct", &sd->imbalance_pct, 5277 set_table_entry(&table[8], 9, "imbalance_pct", &sd->imbalance_pct,
5268 sizeof(int), 0644, proc_dointvec_minmax); 5278 sizeof(int), 0644, proc_dointvec_minmax);
5269 set_table_entry(&table[10], 11, "cache_nice_tries", 5279 set_table_entry(&table[10], 11, "cache_nice_tries",
5270 &sd->cache_nice_tries, 5280 &sd->cache_nice_tries,
5271 sizeof(int), 0644, proc_dointvec_minmax); 5281 sizeof(int), 0644, proc_dointvec_minmax);
5272 set_table_entry(&table[12], 13, "flags", &sd->flags, 5282 set_table_entry(&table[12], 13, "flags", &sd->flags,
5273 sizeof(int), 0644, proc_dointvec_minmax); 5283 sizeof(int), 0644, proc_dointvec_minmax);
5274 5284
5275 return table; 5285 return table;
5276 } 5286 }
5277 5287
5278 static ctl_table *sd_alloc_ctl_cpu_table(int cpu) 5288 static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5279 { 5289 {
5280 struct ctl_table *entry, *table; 5290 struct ctl_table *entry, *table;
5281 struct sched_domain *sd; 5291 struct sched_domain *sd;
5282 int domain_num = 0, i; 5292 int domain_num = 0, i;
5283 char buf[32]; 5293 char buf[32];
5284 5294
5285 for_each_domain(cpu, sd) 5295 for_each_domain(cpu, sd)
5286 domain_num++; 5296 domain_num++;
5287 entry = table = sd_alloc_ctl_entry(domain_num + 1); 5297 entry = table = sd_alloc_ctl_entry(domain_num + 1);
5288 5298
5289 i = 0; 5299 i = 0;
5290 for_each_domain(cpu, sd) { 5300 for_each_domain(cpu, sd) {
5291 snprintf(buf, 32, "domain%d", i); 5301 snprintf(buf, 32, "domain%d", i);
5292 entry->ctl_name = i + 1; 5302 entry->ctl_name = i + 1;
5293 entry->procname = kstrdup(buf, GFP_KERNEL); 5303 entry->procname = kstrdup(buf, GFP_KERNEL);
5294 entry->mode = 0755; 5304 entry->mode = 0755;
5295 entry->child = sd_alloc_ctl_domain_table(sd); 5305 entry->child = sd_alloc_ctl_domain_table(sd);
5296 entry++; 5306 entry++;
5297 i++; 5307 i++;
5298 } 5308 }
5299 return table; 5309 return table;
5300 } 5310 }
5301 5311
5302 static struct ctl_table_header *sd_sysctl_header; 5312 static struct ctl_table_header *sd_sysctl_header;
5303 static void init_sched_domain_sysctl(void) 5313 static void init_sched_domain_sysctl(void)
5304 { 5314 {
5305 int i, cpu_num = num_online_cpus(); 5315 int i, cpu_num = num_online_cpus();
5306 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 5316 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
5307 char buf[32]; 5317 char buf[32];
5308 5318
5309 sd_ctl_dir[0].child = entry; 5319 sd_ctl_dir[0].child = entry;
5310 5320
5311 for (i = 0; i < cpu_num; i++, entry++) { 5321 for (i = 0; i < cpu_num; i++, entry++) {
5312 snprintf(buf, 32, "cpu%d", i); 5322 snprintf(buf, 32, "cpu%d", i);
5313 entry->ctl_name = i + 1; 5323 entry->ctl_name = i + 1;
5314 entry->procname = kstrdup(buf, GFP_KERNEL); 5324 entry->procname = kstrdup(buf, GFP_KERNEL);
5315 entry->mode = 0755; 5325 entry->mode = 0755;
5316 entry->child = sd_alloc_ctl_cpu_table(i); 5326 entry->child = sd_alloc_ctl_cpu_table(i);
5317 } 5327 }
5318 sd_sysctl_header = register_sysctl_table(sd_ctl_root); 5328 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
5319 } 5329 }
5320 #else 5330 #else
5321 static void init_sched_domain_sysctl(void) 5331 static void init_sched_domain_sysctl(void)
5322 { 5332 {
5323 } 5333 }
5324 #endif 5334 #endif
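The net effect of the table construction above is a per-CPU, per-domain sysctl tree; on a running system the entries appear roughly as below (the exact set of domains is machine-dependent, and the file names are those wired up in sd_alloc_ctl_domain_table()):

        /proc/sys/kernel/sched_domain/cpu0/domain0/min_interval
        /proc/sys/kernel/sched_domain/cpu0/domain0/max_interval
        /proc/sys/kernel/sched_domain/cpu0/domain0/busy_idx
        /proc/sys/kernel/sched_domain/cpu0/domain0/cache_nice_tries
        /proc/sys/kernel/sched_domain/cpu0/domain0/flags
        /proc/sys/kernel/sched_domain/cpu1/domain0/min_interval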
5325 5335
5326 /* 5336 /*
5327 * migration_call - callback that gets triggered when a CPU is added. 5337 * migration_call - callback that gets triggered when a CPU is added.
5328 * Here we can start up the necessary migration thread for the new CPU. 5338 * Here we can start up the necessary migration thread for the new CPU.
5329 */ 5339 */
5330 static int __cpuinit 5340 static int __cpuinit
5331 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5341 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5332 { 5342 {
5333 struct task_struct *p; 5343 struct task_struct *p;
5334 int cpu = (long)hcpu; 5344 int cpu = (long)hcpu;
5335 unsigned long flags; 5345 unsigned long flags;
5336 struct rq *rq; 5346 struct rq *rq;
5337 5347
5338 switch (action) { 5348 switch (action) {
5339 case CPU_LOCK_ACQUIRE: 5349 case CPU_LOCK_ACQUIRE:
5340 mutex_lock(&sched_hotcpu_mutex); 5350 mutex_lock(&sched_hotcpu_mutex);
5341 break; 5351 break;
5342 5352
5343 case CPU_UP_PREPARE: 5353 case CPU_UP_PREPARE:
5344 case CPU_UP_PREPARE_FROZEN: 5354 case CPU_UP_PREPARE_FROZEN:
5345 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); 5355 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5346 if (IS_ERR(p)) 5356 if (IS_ERR(p))
5347 return NOTIFY_BAD; 5357 return NOTIFY_BAD;
5348 kthread_bind(p, cpu); 5358 kthread_bind(p, cpu);
5349 /* Must be high prio: stop_machine expects to yield to it. */ 5359 /* Must be high prio: stop_machine expects to yield to it. */
5350 rq = task_rq_lock(p, &flags); 5360 rq = task_rq_lock(p, &flags);
5351 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5361 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5352 task_rq_unlock(rq, &flags); 5362 task_rq_unlock(rq, &flags);
5353 cpu_rq(cpu)->migration_thread = p; 5363 cpu_rq(cpu)->migration_thread = p;
5354 break; 5364 break;
5355 5365
5356 case CPU_ONLINE: 5366 case CPU_ONLINE:
5357 case CPU_ONLINE_FROZEN: 5367 case CPU_ONLINE_FROZEN:
5358 /* Strictly unnecessary, as first user will wake it. */ 5368 /* Strictly unnecessary, as first user will wake it. */
5359 wake_up_process(cpu_rq(cpu)->migration_thread); 5369 wake_up_process(cpu_rq(cpu)->migration_thread);
5360 break; 5370 break;
5361 5371
5362 #ifdef CONFIG_HOTPLUG_CPU 5372 #ifdef CONFIG_HOTPLUG_CPU
5363 case CPU_UP_CANCELED: 5373 case CPU_UP_CANCELED:
5364 case CPU_UP_CANCELED_FROZEN: 5374 case CPU_UP_CANCELED_FROZEN:
5365 if (!cpu_rq(cpu)->migration_thread) 5375 if (!cpu_rq(cpu)->migration_thread)
5366 break; 5376 break;
5367 /* Unbind it from offline cpu so it can run. Fall thru. */ 5377 /* Unbind it from offline cpu so it can run. Fall thru. */
5368 kthread_bind(cpu_rq(cpu)->migration_thread, 5378 kthread_bind(cpu_rq(cpu)->migration_thread,
5369 any_online_cpu(cpu_online_map)); 5379 any_online_cpu(cpu_online_map));
5370 kthread_stop(cpu_rq(cpu)->migration_thread); 5380 kthread_stop(cpu_rq(cpu)->migration_thread);
5371 cpu_rq(cpu)->migration_thread = NULL; 5381 cpu_rq(cpu)->migration_thread = NULL;
5372 break; 5382 break;
5373 5383
5374 case CPU_DEAD: 5384 case CPU_DEAD:
5375 case CPU_DEAD_FROZEN: 5385 case CPU_DEAD_FROZEN:
5376 migrate_live_tasks(cpu); 5386 migrate_live_tasks(cpu);
5377 rq = cpu_rq(cpu); 5387 rq = cpu_rq(cpu);
5378 kthread_stop(rq->migration_thread); 5388 kthread_stop(rq->migration_thread);
5379 rq->migration_thread = NULL; 5389 rq->migration_thread = NULL;
5380 /* Idle task back to normal (off runqueue, low prio) */ 5390 /* Idle task back to normal (off runqueue, low prio) */
5381 rq = task_rq_lock(rq->idle, &flags); 5391 rq = task_rq_lock(rq->idle, &flags);
5382 deactivate_task(rq, rq->idle, 0); 5392 deactivate_task(rq, rq->idle, 0);
5383 rq->idle->static_prio = MAX_PRIO; 5393 rq->idle->static_prio = MAX_PRIO;
5384 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 5394 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5385 rq->idle->sched_class = &idle_sched_class; 5395 rq->idle->sched_class = &idle_sched_class;
5386 migrate_dead_tasks(cpu); 5396 migrate_dead_tasks(cpu);
5387 task_rq_unlock(rq, &flags); 5397 task_rq_unlock(rq, &flags);
5388 migrate_nr_uninterruptible(rq); 5398 migrate_nr_uninterruptible(rq);
5389 BUG_ON(rq->nr_running != 0); 5399 BUG_ON(rq->nr_running != 0);
5390 5400
5391 /* No need to migrate the tasks: it was best-effort if 5401 /* No need to migrate the tasks: it was best-effort if
5392 * they didn't take sched_hotcpu_mutex. Just wake up 5402 * they didn't take sched_hotcpu_mutex. Just wake up
5393 * the requestors. */ 5403 * the requestors. */
5394 spin_lock_irq(&rq->lock); 5404 spin_lock_irq(&rq->lock);
5395 while (!list_empty(&rq->migration_queue)) { 5405 while (!list_empty(&rq->migration_queue)) {
5396 struct migration_req *req; 5406 struct migration_req *req;
5397 5407
5398 req = list_entry(rq->migration_queue.next, 5408 req = list_entry(rq->migration_queue.next,
5399 struct migration_req, list); 5409 struct migration_req, list);
5400 list_del_init(&req->list); 5410 list_del_init(&req->list);
5401 complete(&req->done); 5411 complete(&req->done);
5402 } 5412 }
5403 spin_unlock_irq(&rq->lock); 5413 spin_unlock_irq(&rq->lock);
5404 break; 5414 break;
5405 #endif 5415 #endif
5406 case CPU_LOCK_RELEASE: 5416 case CPU_LOCK_RELEASE:
5407 mutex_unlock(&sched_hotcpu_mutex); 5417 mutex_unlock(&sched_hotcpu_mutex);
5408 break; 5418 break;
5409 } 5419 }
5410 return NOTIFY_OK; 5420 return NOTIFY_OK;
5411 } 5421 }
5412 5422
5413 /* Register at highest priority so that task migration (migrate_all_tasks) 5423 /* Register at highest priority so that task migration (migrate_all_tasks)
5414 * happens before everything else. 5424 * happens before everything else.
5415 */ 5425 */
5416 static struct notifier_block __cpuinitdata migration_notifier = { 5426 static struct notifier_block __cpuinitdata migration_notifier = {
5417 .notifier_call = migration_call, 5427 .notifier_call = migration_call,
5418 .priority = 10 5428 .priority = 10
5419 }; 5429 };
5420 5430
5421 int __init migration_init(void) 5431 int __init migration_init(void)
5422 { 5432 {
5423 void *cpu = (void *)(long)smp_processor_id(); 5433 void *cpu = (void *)(long)smp_processor_id();
5424 int err; 5434 int err;
5425 5435
5426 /* Start one for the boot CPU: */ 5436 /* Start one for the boot CPU: */
5427 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 5437 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5428 BUG_ON(err == NOTIFY_BAD); 5438 BUG_ON(err == NOTIFY_BAD);
5429 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5439 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5430 register_cpu_notifier(&migration_notifier); 5440 register_cpu_notifier(&migration_notifier);
5431 5441
5432 return 0; 5442 return 0;
5433 } 5443 }
5434 #endif 5444 #endif
5435 5445
5436 #ifdef CONFIG_SMP 5446 #ifdef CONFIG_SMP
5437 5447
5438 /* Number of possible processor ids */ 5448 /* Number of possible processor ids */
5439 int nr_cpu_ids __read_mostly = NR_CPUS; 5449 int nr_cpu_ids __read_mostly = NR_CPUS;
5440 EXPORT_SYMBOL(nr_cpu_ids); 5450 EXPORT_SYMBOL(nr_cpu_ids);
5441 5451
5442 #undef SCHED_DOMAIN_DEBUG 5452 #undef SCHED_DOMAIN_DEBUG
5443 #ifdef SCHED_DOMAIN_DEBUG 5453 #ifdef SCHED_DOMAIN_DEBUG
5444 static void sched_domain_debug(struct sched_domain *sd, int cpu) 5454 static void sched_domain_debug(struct sched_domain *sd, int cpu)
5445 { 5455 {
5446 int level = 0; 5456 int level = 0;
5447 5457
5448 if (!sd) { 5458 if (!sd) {
5449 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 5459 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5450 return; 5460 return;
5451 } 5461 }
5452 5462
5453 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 5463 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5454 5464
5455 do { 5465 do {
5456 int i; 5466 int i;
5457 char str[NR_CPUS]; 5467 char str[NR_CPUS];
5458 struct sched_group *group = sd->groups; 5468 struct sched_group *group = sd->groups;
5459 cpumask_t groupmask; 5469 cpumask_t groupmask;
5460 5470
5461 cpumask_scnprintf(str, NR_CPUS, sd->span); 5471 cpumask_scnprintf(str, NR_CPUS, sd->span);
5462 cpus_clear(groupmask); 5472 cpus_clear(groupmask);
5463 5473
5464 printk(KERN_DEBUG); 5474 printk(KERN_DEBUG);
5465 for (i = 0; i < level + 1; i++) 5475 for (i = 0; i < level + 1; i++)
5466 printk(" "); 5476 printk(" ");
5467 printk("domain %d: ", level); 5477 printk("domain %d: ", level);
5468 5478
5469 if (!(sd->flags & SD_LOAD_BALANCE)) { 5479 if (!(sd->flags & SD_LOAD_BALANCE)) {
5470 printk("does not load-balance\n"); 5480 printk("does not load-balance\n");
5471 if (sd->parent) 5481 if (sd->parent)
5472 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 5482 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5473 " has parent"); 5483 " has parent");
5474 break; 5484 break;
5475 } 5485 }
5476 5486
5477 printk("span %s\n", str); 5487 printk("span %s\n", str);
5478 5488
5479 if (!cpu_isset(cpu, sd->span)) 5489 if (!cpu_isset(cpu, sd->span))
5480 printk(KERN_ERR "ERROR: domain->span does not contain " 5490 printk(KERN_ERR "ERROR: domain->span does not contain "
5481 "CPU%d\n", cpu); 5491 "CPU%d\n", cpu);
5482 if (!cpu_isset(cpu, group->cpumask)) 5492 if (!cpu_isset(cpu, group->cpumask))
5483 printk(KERN_ERR "ERROR: domain->groups does not contain" 5493 printk(KERN_ERR "ERROR: domain->groups does not contain"
5484 " CPU%d\n", cpu); 5494 " CPU%d\n", cpu);
5485 5495
5486 printk(KERN_DEBUG); 5496 printk(KERN_DEBUG);
5487 for (i = 0; i < level + 2; i++) 5497 for (i = 0; i < level + 2; i++)
5488 printk(" "); 5498 printk(" ");
5489 printk("groups:"); 5499 printk("groups:");
5490 do { 5500 do {
5491 if (!group) { 5501 if (!group) {
5492 printk("\n"); 5502 printk("\n");
5493 printk(KERN_ERR "ERROR: group is NULL\n"); 5503 printk(KERN_ERR "ERROR: group is NULL\n");
5494 break; 5504 break;
5495 } 5505 }
5496 5506
5497 if (!group->__cpu_power) { 5507 if (!group->__cpu_power) {
5498 printk("\n"); 5508 printk("\n");
5499 printk(KERN_ERR "ERROR: domain->cpu_power not " 5509 printk(KERN_ERR "ERROR: domain->cpu_power not "
5500 "set\n"); 5510 "set\n");
5501 } 5511 }
5502 5512
5503 if (!cpus_weight(group->cpumask)) { 5513 if (!cpus_weight(group->cpumask)) {
5504 printk("\n"); 5514 printk("\n");
5505 printk(KERN_ERR "ERROR: empty group\n"); 5515 printk(KERN_ERR "ERROR: empty group\n");
5506 } 5516 }
5507 5517
5508 if (cpus_intersects(groupmask, group->cpumask)) { 5518 if (cpus_intersects(groupmask, group->cpumask)) {
5509 printk("\n"); 5519 printk("\n");
5510 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5520 printk(KERN_ERR "ERROR: repeated CPUs\n");
5511 } 5521 }
5512 5522
5513 cpus_or(groupmask, groupmask, group->cpumask); 5523 cpus_or(groupmask, groupmask, group->cpumask);
5514 5524
5515 cpumask_scnprintf(str, NR_CPUS, group->cpumask); 5525 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5516 printk(" %s", str); 5526 printk(" %s", str);
5517 5527
5518 group = group->next; 5528 group = group->next;
5519 } while (group != sd->groups); 5529 } while (group != sd->groups);
5520 printk("\n"); 5530 printk("\n");
5521 5531
5522 if (!cpus_equal(sd->span, groupmask)) 5532 if (!cpus_equal(sd->span, groupmask))
5523 printk(KERN_ERR "ERROR: groups don't span " 5533 printk(KERN_ERR "ERROR: groups don't span "
5524 "domain->span\n"); 5534 "domain->span\n");
5525 5535
5526 level++; 5536 level++;
5527 sd = sd->parent; 5537 sd = sd->parent;
5528 if (!sd) 5538 if (!sd)
5529 continue; 5539 continue;
5530 5540
5531 if (!cpus_subset(groupmask, sd->span)) 5541 if (!cpus_subset(groupmask, sd->span))
5532 printk(KERN_ERR "ERROR: parent span is not a superset " 5542 printk(KERN_ERR "ERROR: parent span is not a superset "
5533 "of domain->span\n"); 5543 "of domain->span\n");
5534 5544
5535 } while (sd); 5545 } while (sd);
5536 } 5546 }
5537 #else 5547 #else
5538 # define sched_domain_debug(sd, cpu) do { } while (0) 5548 # define sched_domain_debug(sd, cpu) do { } while (0)
5539 #endif 5549 #endif
5540 5550
5541 static int sd_degenerate(struct sched_domain *sd) 5551 static int sd_degenerate(struct sched_domain *sd)
5542 { 5552 {
5543 if (cpus_weight(sd->span) == 1) 5553 if (cpus_weight(sd->span) == 1)
5544 return 1; 5554 return 1;
5545 5555
5546 /* Following flags need at least 2 groups */ 5556 /* Following flags need at least 2 groups */
5547 if (sd->flags & (SD_LOAD_BALANCE | 5557 if (sd->flags & (SD_LOAD_BALANCE |
5548 SD_BALANCE_NEWIDLE | 5558 SD_BALANCE_NEWIDLE |
5549 SD_BALANCE_FORK | 5559 SD_BALANCE_FORK |
5550 SD_BALANCE_EXEC | 5560 SD_BALANCE_EXEC |
5551 SD_SHARE_CPUPOWER | 5561 SD_SHARE_CPUPOWER |
5552 SD_SHARE_PKG_RESOURCES)) { 5562 SD_SHARE_PKG_RESOURCES)) {
5553 if (sd->groups != sd->groups->next) 5563 if (sd->groups != sd->groups->next)
5554 return 0; 5564 return 0;
5555 } 5565 }
5556 5566
5557 /* Following flags don't use groups */ 5567 /* Following flags don't use groups */
5558 if (sd->flags & (SD_WAKE_IDLE | 5568 if (sd->flags & (SD_WAKE_IDLE |
5559 SD_WAKE_AFFINE | 5569 SD_WAKE_AFFINE |
5560 SD_WAKE_BALANCE)) 5570 SD_WAKE_BALANCE))
5561 return 0; 5571 return 0;
5562 5572
5563 return 1; 5573 return 1;
5564 } 5574 }
5565 5575
5566 static int 5576 static int
5567 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 5577 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5568 { 5578 {
5569 unsigned long cflags = sd->flags, pflags = parent->flags; 5579 unsigned long cflags = sd->flags, pflags = parent->flags;
5570 5580
5571 if (sd_degenerate(parent)) 5581 if (sd_degenerate(parent))
5572 return 1; 5582 return 1;
5573 5583
5574 if (!cpus_equal(sd->span, parent->span)) 5584 if (!cpus_equal(sd->span, parent->span))
5575 return 0; 5585 return 0;
5576 5586
5577 /* Does parent contain flags not in child? */ 5587 /* Does parent contain flags not in child? */
5578 /* WAKE_BALANCE is a subset of WAKE_AFFINE */ 5588 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5579 if (cflags & SD_WAKE_AFFINE) 5589 if (cflags & SD_WAKE_AFFINE)
5580 pflags &= ~SD_WAKE_BALANCE; 5590 pflags &= ~SD_WAKE_BALANCE;
5581 /* Flags needing groups don't count if only 1 group in parent */ 5591 /* Flags needing groups don't count if only 1 group in parent */
5582 if (parent->groups == parent->groups->next) { 5592 if (parent->groups == parent->groups->next) {
5583 pflags &= ~(SD_LOAD_BALANCE | 5593 pflags &= ~(SD_LOAD_BALANCE |
5584 SD_BALANCE_NEWIDLE | 5594 SD_BALANCE_NEWIDLE |
5585 SD_BALANCE_FORK | 5595 SD_BALANCE_FORK |
5586 SD_BALANCE_EXEC | 5596 SD_BALANCE_EXEC |
5587 SD_SHARE_CPUPOWER | 5597 SD_SHARE_CPUPOWER |
5588 SD_SHARE_PKG_RESOURCES); 5598 SD_SHARE_PKG_RESOURCES);
5589 } 5599 }
5590 if (~cflags & pflags) 5600 if (~cflags & pflags)
5591 return 0; 5601 return 0;
5592 5602
5593 return 1; 5603 return 1;
5594 } 5604 }
5595 5605
5596 /* 5606 /*
5597 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5607 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5598 * hold the hotplug lock. 5608 * hold the hotplug lock.
5599 */ 5609 */
5600 static void cpu_attach_domain(struct sched_domain *sd, int cpu) 5610 static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5601 { 5611 {
5602 struct rq *rq = cpu_rq(cpu); 5612 struct rq *rq = cpu_rq(cpu);
5603 struct sched_domain *tmp; 5613 struct sched_domain *tmp;
5604 5614
5605 /* Remove the sched domains which do not contribute to scheduling. */ 5615 /* Remove the sched domains which do not contribute to scheduling. */
5606 for (tmp = sd; tmp; tmp = tmp->parent) { 5616 for (tmp = sd; tmp; tmp = tmp->parent) {
5607 struct sched_domain *parent = tmp->parent; 5617 struct sched_domain *parent = tmp->parent;
5608 if (!parent) 5618 if (!parent)
5609 break; 5619 break;
5610 if (sd_parent_degenerate(tmp, parent)) { 5620 if (sd_parent_degenerate(tmp, parent)) {
5611 tmp->parent = parent->parent; 5621 tmp->parent = parent->parent;
5612 if (parent->parent) 5622 if (parent->parent)
5613 parent->parent->child = tmp; 5623 parent->parent->child = tmp;
5614 } 5624 }
5615 } 5625 }
5616 5626
5617 if (sd && sd_degenerate(sd)) { 5627 if (sd && sd_degenerate(sd)) {
5618 sd = sd->parent; 5628 sd = sd->parent;
5619 if (sd) 5629 if (sd)
5620 sd->child = NULL; 5630 sd->child = NULL;
5621 } 5631 }
5622 5632
5623 sched_domain_debug(sd, cpu); 5633 sched_domain_debug(sd, cpu);
5624 5634
5625 rcu_assign_pointer(rq->sd, sd); 5635 rcu_assign_pointer(rq->sd, sd);
5626 } 5636 }
5627 5637
5628 /* cpus with isolated domains */ 5638 /* cpus with isolated domains */
5629 static cpumask_t cpu_isolated_map = CPU_MASK_NONE; 5639 static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
5630 5640
5631 /* Setup the mask of cpus configured for isolated domains */ 5641 /* Setup the mask of cpus configured for isolated domains */
5632 static int __init isolated_cpu_setup(char *str) 5642 static int __init isolated_cpu_setup(char *str)
5633 { 5643 {
5634 int ints[NR_CPUS], i; 5644 int ints[NR_CPUS], i;
5635 5645
5636 str = get_options(str, ARRAY_SIZE(ints), ints); 5646 str = get_options(str, ARRAY_SIZE(ints), ints);
5637 cpus_clear(cpu_isolated_map); 5647 cpus_clear(cpu_isolated_map);
5638 for (i = 1; i <= ints[0]; i++) 5648 for (i = 1; i <= ints[0]; i++)
5639 if (ints[i] < NR_CPUS) 5649 if (ints[i] < NR_CPUS)
5640 cpu_set(ints[i], cpu_isolated_map); 5650 cpu_set(ints[i], cpu_isolated_map);
5641 return 1; 5651 return 1;
5642 } 5652 }
5643 5653
5644 __setup ("isolcpus=", isolated_cpu_setup); 5654 __setup ("isolcpus=", isolated_cpu_setup);
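
For illustration only (not part of this patch): the sketch below models, in plain user-space C, what booting with e.g. isolcpus=2,3 ends up doing above — isolated_cpu_setup() parses the comma-separated list and sets the corresponding bits in cpu_isolated_map, which arch_init_sched_domains() later subtracts from the CPUs it builds domains for. The helper name parse_isolcpus() and MAX_CPUS are invented for the example.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_CPUS 8	/* assumption for the example */

/* Rough user-space model of isolated_cpu_setup(): mark each listed CPU. */
static void parse_isolcpus(const char *arg, int isolated[MAX_CPUS])
{
	char buf[64], *tok, *save;

	memset(isolated, 0, MAX_CPUS * sizeof(int));
	strncpy(buf, arg, sizeof(buf) - 1);
	buf[sizeof(buf) - 1] = '\0';

	for (tok = strtok_r(buf, ",", &save); tok; tok = strtok_r(NULL, ",", &save)) {
		int cpu = atoi(tok);

		if (cpu >= 0 && cpu < MAX_CPUS)
			isolated[cpu] = 1;	/* ~ cpu_set(cpu, cpu_isolated_map) */
	}
}

int main(void)
{
	int isolated[MAX_CPUS], i;

	parse_isolcpus("2,3", isolated);	/* as if booted with isolcpus=2,3 */
	for (i = 0; i < MAX_CPUS; i++)
		if (isolated[i])
			printf("cpu %d kept out of the sched domains\n", i);
	return 0;
}
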
5645 5655
5646 /* 5656 /*
5647 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 5657 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5648 * to a function which identifies what group (along with sched group) a CPU 5658 * to a function which identifies what group (along with sched group) a CPU
5649 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS 5659 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
5650 * (due to the fact that we keep track of groups covered with a cpumask_t). 5660 * (due to the fact that we keep track of groups covered with a cpumask_t).
5651 * 5661 *
5652 * init_sched_build_groups will build a circular linked list of the groups 5662 * init_sched_build_groups will build a circular linked list of the groups
5653 * covered by the given span, and will set each group's ->cpumask correctly, 5663 * covered by the given span, and will set each group's ->cpumask correctly,
5654 * and ->cpu_power to 0. 5664 * and ->cpu_power to 0.
5655 */ 5665 */
5656 static void 5666 static void
5657 init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, 5667 init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5658 int (*group_fn)(int cpu, const cpumask_t *cpu_map, 5668 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5659 struct sched_group **sg)) 5669 struct sched_group **sg))
5660 { 5670 {
5661 struct sched_group *first = NULL, *last = NULL; 5671 struct sched_group *first = NULL, *last = NULL;
5662 cpumask_t covered = CPU_MASK_NONE; 5672 cpumask_t covered = CPU_MASK_NONE;
5663 int i; 5673 int i;
5664 5674
5665 for_each_cpu_mask(i, span) { 5675 for_each_cpu_mask(i, span) {
5666 struct sched_group *sg; 5676 struct sched_group *sg;
5667 int group = group_fn(i, cpu_map, &sg); 5677 int group = group_fn(i, cpu_map, &sg);
5668 int j; 5678 int j;
5669 5679
5670 if (cpu_isset(i, covered)) 5680 if (cpu_isset(i, covered))
5671 continue; 5681 continue;
5672 5682
5673 sg->cpumask = CPU_MASK_NONE; 5683 sg->cpumask = CPU_MASK_NONE;
5674 sg->__cpu_power = 0; 5684 sg->__cpu_power = 0;
5675 5685
5676 for_each_cpu_mask(j, span) { 5686 for_each_cpu_mask(j, span) {
5677 if (group_fn(j, cpu_map, NULL) != group) 5687 if (group_fn(j, cpu_map, NULL) != group)
5678 continue; 5688 continue;
5679 5689
5680 cpu_set(j, covered); 5690 cpu_set(j, covered);
5681 cpu_set(j, sg->cpumask); 5691 cpu_set(j, sg->cpumask);
5682 } 5692 }
5683 if (!first) 5693 if (!first)
5684 first = sg; 5694 first = sg;
5685 if (last) 5695 if (last)
5686 last->next = sg; 5696 last->next = sg;
5687 last = sg; 5697 last = sg;
5688 } 5698 }
5689 last->next = first; 5699 last->next = first;
5690 } 5700 }
5691 5701
5692 #define SD_NODES_PER_DOMAIN 16 5702 #define SD_NODES_PER_DOMAIN 16
5693 5703
5694 #ifdef CONFIG_NUMA 5704 #ifdef CONFIG_NUMA
5695 5705
5696 /** 5706 /**
5697 * find_next_best_node - find the next node to include in a sched_domain 5707 * find_next_best_node - find the next node to include in a sched_domain
5698 * @node: node whose sched_domain we're building 5708 * @node: node whose sched_domain we're building
5699 * @used_nodes: nodes already in the sched_domain 5709 * @used_nodes: nodes already in the sched_domain
5700 * 5710 *
5701 * Find the next node to include in a given scheduling domain. Simply 5711 * Find the next node to include in a given scheduling domain. Simply
5702 * finds the closest node not already in the @used_nodes map. 5712 * finds the closest node not already in the @used_nodes map.
5703 * 5713 *
5704 * Should use nodemask_t. 5714 * Should use nodemask_t.
5705 */ 5715 */
5706 static int find_next_best_node(int node, unsigned long *used_nodes) 5716 static int find_next_best_node(int node, unsigned long *used_nodes)
5707 { 5717 {
5708 int i, n, val, min_val, best_node = 0; 5718 int i, n, val, min_val, best_node = 0;
5709 5719
5710 min_val = INT_MAX; 5720 min_val = INT_MAX;
5711 5721
5712 for (i = 0; i < MAX_NUMNODES; i++) { 5722 for (i = 0; i < MAX_NUMNODES; i++) {
5713 /* Start at @node */ 5723 /* Start at @node */
5714 n = (node + i) % MAX_NUMNODES; 5724 n = (node + i) % MAX_NUMNODES;
5715 5725
5716 if (!nr_cpus_node(n)) 5726 if (!nr_cpus_node(n))
5717 continue; 5727 continue;
5718 5728
5719 /* Skip already used nodes */ 5729 /* Skip already used nodes */
5720 if (test_bit(n, used_nodes)) 5730 if (test_bit(n, used_nodes))
5721 continue; 5731 continue;
5722 5732
5723 /* Simple min distance search */ 5733 /* Simple min distance search */
5724 val = node_distance(node, n); 5734 val = node_distance(node, n);
5725 5735
5726 if (val < min_val) { 5736 if (val < min_val) {
5727 min_val = val; 5737 min_val = val;
5728 best_node = n; 5738 best_node = n;
5729 } 5739 }
5730 } 5740 }
5731 5741
5732 set_bit(best_node, used_nodes); 5742 set_bit(best_node, used_nodes);
5733 return best_node; 5743 return best_node;
5734 } 5744 }
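
A toy, user-space illustration of the closest-unused-node selection performed by find_next_best_node() above. Assumptions: a made-up 4-node distance table; the real code also skips CPU-less nodes and scans starting at @node, which this sketch omits.

#include <stdio.h>
#include <limits.h>

#define NODES 4	/* made-up topology for the example */

static const int distance[NODES][NODES] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};

/* Simplified analogue of find_next_best_node(): pick the closest node to
 * @node that is not yet in @used, then mark it used. */
static int next_best_node(int node, int used[NODES])
{
	int n, best = -1, min_val = INT_MAX;

	for (n = 0; n < NODES; n++) {
		if (used[n])
			continue;
		if (distance[node][n] < min_val) {
			min_val = distance[node][n];
			best = n;
		}
	}
	if (best >= 0)
		used[best] = 1;
	return best;
}

int main(void)
{
	int used[NODES] = { 1, 0, 0, 0 };	/* node 0 already in the span */

	/* prints 1 (distance 20), then 2 (distance 40, lowest index wins) */
	printf("next node: %d\n", next_best_node(0, used));
	printf("next node: %d\n", next_best_node(0, used));
	return 0;
}
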
5735 5745
5736 /** 5746 /**
5737 * sched_domain_node_span - get a cpumask for a node's sched_domain 5747 * sched_domain_node_span - get a cpumask for a node's sched_domain
5738 * @node: node whose cpumask we're constructing 5748 * @node: node whose cpumask we're constructing
5739 * @size: number of nodes to include in this span 5749 * @size: number of nodes to include in this span
5740 * 5750 *
5741 * Given a node, construct a good cpumask for its sched_domain to span. It 5751 * Given a node, construct a good cpumask for its sched_domain to span. It
5742 * should be one that prevents unnecessary balancing, but also spreads tasks 5752 * should be one that prevents unnecessary balancing, but also spreads tasks
5743 * out optimally. 5753 * out optimally.
5744 */ 5754 */
5745 static cpumask_t sched_domain_node_span(int node) 5755 static cpumask_t sched_domain_node_span(int node)
5746 { 5756 {
5747 DECLARE_BITMAP(used_nodes, MAX_NUMNODES); 5757 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
5748 cpumask_t span, nodemask; 5758 cpumask_t span, nodemask;
5749 int i; 5759 int i;
5750 5760
5751 cpus_clear(span); 5761 cpus_clear(span);
5752 bitmap_zero(used_nodes, MAX_NUMNODES); 5762 bitmap_zero(used_nodes, MAX_NUMNODES);
5753 5763
5754 nodemask = node_to_cpumask(node); 5764 nodemask = node_to_cpumask(node);
5755 cpus_or(span, span, nodemask); 5765 cpus_or(span, span, nodemask);
5756 set_bit(node, used_nodes); 5766 set_bit(node, used_nodes);
5757 5767
5758 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 5768 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5759 int next_node = find_next_best_node(node, used_nodes); 5769 int next_node = find_next_best_node(node, used_nodes);
5760 5770
5761 nodemask = node_to_cpumask(next_node); 5771 nodemask = node_to_cpumask(next_node);
5762 cpus_or(span, span, nodemask); 5772 cpus_or(span, span, nodemask);
5763 } 5773 }
5764 5774
5765 return span; 5775 return span;
5766 } 5776 }
5767 #endif 5777 #endif
5768 5778
5769 int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 5779 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5770 5780
5771 /* 5781 /*
5772 * SMT sched-domains: 5782 * SMT sched-domains:
5773 */ 5783 */
5774 #ifdef CONFIG_SCHED_SMT 5784 #ifdef CONFIG_SCHED_SMT
5775 static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 5785 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
5776 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); 5786 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
5777 5787
5778 static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, 5788 static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
5779 struct sched_group **sg) 5789 struct sched_group **sg)
5780 { 5790 {
5781 if (sg) 5791 if (sg)
5782 *sg = &per_cpu(sched_group_cpus, cpu); 5792 *sg = &per_cpu(sched_group_cpus, cpu);
5783 return cpu; 5793 return cpu;
5784 } 5794 }
5785 #endif 5795 #endif
5786 5796
5787 /* 5797 /*
5788 * multi-core sched-domains: 5798 * multi-core sched-domains:
5789 */ 5799 */
5790 #ifdef CONFIG_SCHED_MC 5800 #ifdef CONFIG_SCHED_MC
5791 static DEFINE_PER_CPU(struct sched_domain, core_domains); 5801 static DEFINE_PER_CPU(struct sched_domain, core_domains);
5792 static DEFINE_PER_CPU(struct sched_group, sched_group_core); 5802 static DEFINE_PER_CPU(struct sched_group, sched_group_core);
5793 #endif 5803 #endif
5794 5804
5795 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 5805 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
5796 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, 5806 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5797 struct sched_group **sg) 5807 struct sched_group **sg)
5798 { 5808 {
5799 int group; 5809 int group;
5800 cpumask_t mask = cpu_sibling_map[cpu]; 5810 cpumask_t mask = cpu_sibling_map[cpu];
5801 cpus_and(mask, mask, *cpu_map); 5811 cpus_and(mask, mask, *cpu_map);
5802 group = first_cpu(mask); 5812 group = first_cpu(mask);
5803 if (sg) 5813 if (sg)
5804 *sg = &per_cpu(sched_group_core, group); 5814 *sg = &per_cpu(sched_group_core, group);
5805 return group; 5815 return group;
5806 } 5816 }
5807 #elif defined(CONFIG_SCHED_MC) 5817 #elif defined(CONFIG_SCHED_MC)
5808 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, 5818 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5809 struct sched_group **sg) 5819 struct sched_group **sg)
5810 { 5820 {
5811 if (sg) 5821 if (sg)
5812 *sg = &per_cpu(sched_group_core, cpu); 5822 *sg = &per_cpu(sched_group_core, cpu);
5813 return cpu; 5823 return cpu;
5814 } 5824 }
5815 #endif 5825 #endif
5816 5826
5817 static DEFINE_PER_CPU(struct sched_domain, phys_domains); 5827 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5818 static DEFINE_PER_CPU(struct sched_group, sched_group_phys); 5828 static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
5819 5829
5820 static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, 5830 static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5821 struct sched_group **sg) 5831 struct sched_group **sg)
5822 { 5832 {
5823 int group; 5833 int group;
5824 #ifdef CONFIG_SCHED_MC 5834 #ifdef CONFIG_SCHED_MC
5825 cpumask_t mask = cpu_coregroup_map(cpu); 5835 cpumask_t mask = cpu_coregroup_map(cpu);
5826 cpus_and(mask, mask, *cpu_map); 5836 cpus_and(mask, mask, *cpu_map);
5827 group = first_cpu(mask); 5837 group = first_cpu(mask);
5828 #elif defined(CONFIG_SCHED_SMT) 5838 #elif defined(CONFIG_SCHED_SMT)
5829 cpumask_t mask = cpu_sibling_map[cpu]; 5839 cpumask_t mask = cpu_sibling_map[cpu];
5830 cpus_and(mask, mask, *cpu_map); 5840 cpus_and(mask, mask, *cpu_map);
5831 group = first_cpu(mask); 5841 group = first_cpu(mask);
5832 #else 5842 #else
5833 group = cpu; 5843 group = cpu;
5834 #endif 5844 #endif
5835 if (sg) 5845 if (sg)
5836 *sg = &per_cpu(sched_group_phys, group); 5846 *sg = &per_cpu(sched_group_phys, group);
5837 return group; 5847 return group;
5838 } 5848 }
5839 5849
5840 #ifdef CONFIG_NUMA 5850 #ifdef CONFIG_NUMA
5841 /* 5851 /*
5842 * The init_sched_build_groups can't handle what we want to do with node 5852 * The init_sched_build_groups can't handle what we want to do with node
5843 * groups, so roll our own. Now each node has its own list of groups which 5853 * groups, so roll our own. Now each node has its own list of groups which
5844 * gets dynamically allocated. 5854 * gets dynamically allocated.
5845 */ 5855 */
5846 static DEFINE_PER_CPU(struct sched_domain, node_domains); 5856 static DEFINE_PER_CPU(struct sched_domain, node_domains);
5847 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; 5857 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
5848 5858
5849 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 5859 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
5850 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); 5860 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
5851 5861
5852 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, 5862 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
5853 struct sched_group **sg) 5863 struct sched_group **sg)
5854 { 5864 {
5855 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); 5865 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
5856 int group; 5866 int group;
5857 5867
5858 cpus_and(nodemask, nodemask, *cpu_map); 5868 cpus_and(nodemask, nodemask, *cpu_map);
5859 group = first_cpu(nodemask); 5869 group = first_cpu(nodemask);
5860 5870
5861 if (sg) 5871 if (sg)
5862 *sg = &per_cpu(sched_group_allnodes, group); 5872 *sg = &per_cpu(sched_group_allnodes, group);
5863 return group; 5873 return group;
5864 } 5874 }
5865 5875
5866 static void init_numa_sched_groups_power(struct sched_group *group_head) 5876 static void init_numa_sched_groups_power(struct sched_group *group_head)
5867 { 5877 {
5868 struct sched_group *sg = group_head; 5878 struct sched_group *sg = group_head;
5869 int j; 5879 int j;
5870 5880
5871 if (!sg) 5881 if (!sg)
5872 return; 5882 return;
5873 next_sg: 5883 next_sg:
5874 for_each_cpu_mask(j, sg->cpumask) { 5884 for_each_cpu_mask(j, sg->cpumask) {
5875 struct sched_domain *sd; 5885 struct sched_domain *sd;
5876 5886
5877 sd = &per_cpu(phys_domains, j); 5887 sd = &per_cpu(phys_domains, j);
5878 if (j != first_cpu(sd->groups->cpumask)) { 5888 if (j != first_cpu(sd->groups->cpumask)) {
5879 /* 5889 /*
5880 * Only add "power" once for each 5890 * Only add "power" once for each
5881 * physical package. 5891 * physical package.
5882 */ 5892 */
5883 continue; 5893 continue;
5884 } 5894 }
5885 5895
5886 sg_inc_cpu_power(sg, sd->groups->__cpu_power); 5896 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
5887 } 5897 }
5888 sg = sg->next; 5898 sg = sg->next;
5889 if (sg != group_head) 5899 if (sg != group_head)
5890 goto next_sg; 5900 goto next_sg;
5891 } 5901 }
5892 #endif 5902 #endif
5893 5903
5894 #ifdef CONFIG_NUMA 5904 #ifdef CONFIG_NUMA
5895 /* Free memory allocated for various sched_group structures */ 5905 /* Free memory allocated for various sched_group structures */
5896 static void free_sched_groups(const cpumask_t *cpu_map) 5906 static void free_sched_groups(const cpumask_t *cpu_map)
5897 { 5907 {
5898 int cpu, i; 5908 int cpu, i;
5899 5909
5900 for_each_cpu_mask(cpu, *cpu_map) { 5910 for_each_cpu_mask(cpu, *cpu_map) {
5901 struct sched_group **sched_group_nodes 5911 struct sched_group **sched_group_nodes
5902 = sched_group_nodes_bycpu[cpu]; 5912 = sched_group_nodes_bycpu[cpu];
5903 5913
5904 if (!sched_group_nodes) 5914 if (!sched_group_nodes)
5905 continue; 5915 continue;
5906 5916
5907 for (i = 0; i < MAX_NUMNODES; i++) { 5917 for (i = 0; i < MAX_NUMNODES; i++) {
5908 cpumask_t nodemask = node_to_cpumask(i); 5918 cpumask_t nodemask = node_to_cpumask(i);
5909 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 5919 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5910 5920
5911 cpus_and(nodemask, nodemask, *cpu_map); 5921 cpus_and(nodemask, nodemask, *cpu_map);
5912 if (cpus_empty(nodemask)) 5922 if (cpus_empty(nodemask))
5913 continue; 5923 continue;
5914 5924
5915 if (sg == NULL) 5925 if (sg == NULL)
5916 continue; 5926 continue;
5917 sg = sg->next; 5927 sg = sg->next;
5918 next_sg: 5928 next_sg:
5919 oldsg = sg; 5929 oldsg = sg;
5920 sg = sg->next; 5930 sg = sg->next;
5921 kfree(oldsg); 5931 kfree(oldsg);
5922 if (oldsg != sched_group_nodes[i]) 5932 if (oldsg != sched_group_nodes[i])
5923 goto next_sg; 5933 goto next_sg;
5924 } 5934 }
5925 kfree(sched_group_nodes); 5935 kfree(sched_group_nodes);
5926 sched_group_nodes_bycpu[cpu] = NULL; 5936 sched_group_nodes_bycpu[cpu] = NULL;
5927 } 5937 }
5928 } 5938 }
5929 #else 5939 #else
5930 static void free_sched_groups(const cpumask_t *cpu_map) 5940 static void free_sched_groups(const cpumask_t *cpu_map)
5931 { 5941 {
5932 } 5942 }
5933 #endif 5943 #endif
5934 5944
5935 /* 5945 /*
5936 * Initialize sched groups cpu_power. 5946 * Initialize sched groups cpu_power.
5937 * 5947 *
5938 * cpu_power indicates the capacity of a sched group, which is used while 5948 * cpu_power indicates the capacity of a sched group, which is used while
5939 * distributing the load between different sched groups in a sched domain. 5949 * distributing the load between different sched groups in a sched domain.
5940 * Typically cpu_power for all the groups in a sched domain will be the same 5950 * Typically cpu_power for all the groups in a sched domain will be the same
5941 * unless there are asymmetries in the topology. If there are asymmetries, the 5951 * unless there are asymmetries in the topology. If there are asymmetries, the
5942 * group having more cpu_power will pick up more load than the group having 5952 * group having more cpu_power will pick up more load than the group having
5943 * less cpu_power. 5953 * less cpu_power.
5944 * 5954 *
5945 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents 5955 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
5946 * the maximum number of tasks a group can handle in the presence of other idle 5956 * the maximum number of tasks a group can handle in the presence of other idle
5947 * or lightly loaded groups in the same sched domain. 5957 * or lightly loaded groups in the same sched domain.
5948 */ 5958 */
5949 static void init_sched_groups_power(int cpu, struct sched_domain *sd) 5959 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5950 { 5960 {
5951 struct sched_domain *child; 5961 struct sched_domain *child;
5952 struct sched_group *group; 5962 struct sched_group *group;
5953 5963
5954 WARN_ON(!sd || !sd->groups); 5964 WARN_ON(!sd || !sd->groups);
5955 5965
5956 if (cpu != first_cpu(sd->groups->cpumask)) 5966 if (cpu != first_cpu(sd->groups->cpumask))
5957 return; 5967 return;
5958 5968
5959 child = sd->child; 5969 child = sd->child;
5960 5970
5961 sd->groups->__cpu_power = 0; 5971 sd->groups->__cpu_power = 0;
5962 5972
5963 /* 5973 /*
5964 * For perf policy, if the groups in the child domain share resources 5974 * For perf policy, if the groups in the child domain share resources
5965 * (for example cores sharing some portions of the cache hierarchy 5975 * (for example cores sharing some portions of the cache hierarchy
5966 * or SMT), then set this domain's groups' cpu_power such that each group 5976 * or SMT), then set this domain's groups' cpu_power such that each group
5967 * can handle only one task, when there are other idle groups in the 5977 * can handle only one task, when there are other idle groups in the
5968 * same sched domain. 5978 * same sched domain.
5969 */ 5979 */
5970 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && 5980 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
5971 (child->flags & 5981 (child->flags &
5972 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { 5982 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
5973 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); 5983 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
5974 return; 5984 return;
5975 } 5985 }
5976 5986
5977 /* 5987 /*
5978 * add the cpu_power of each child group to this group's cpu_power 5988 * add the cpu_power of each child group to this group's cpu_power
5979 */ 5989 */
5980 group = child->groups; 5990 group = child->groups;
5981 do { 5991 do {
5982 sg_inc_cpu_power(sd->groups, group->__cpu_power); 5992 sg_inc_cpu_power(sd->groups, group->__cpu_power);
5983 group = group->next; 5993 group = group->next;
5984 } while (group != child->groups); 5994 } while (group != child->groups);
5985 } 5995 }
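
As a rough worked example of the accumulation branch above (groups that do not share resources): assuming SCHED_LOAD_SCALE is 1024, a parent domain whose child has two groups of power 1024 each ends up with __cpu_power = 2048, i.e. roughly two tasks' worth of capacity. A minimal stand-alone sketch; the SCHED_LOAD_SCALE value and two-group topology are assumptions for the example only.

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL	/* assumed value; see sched.h for the real one */

int main(void)
{
	unsigned long child_power[] = { SCHED_LOAD_SCALE, SCHED_LOAD_SCALE };
	unsigned long parent_power = 0;
	unsigned int i;

	/* mirrors the do { sg_inc_cpu_power(...) } while () loop above */
	for (i = 0; i < sizeof(child_power) / sizeof(child_power[0]); i++)
		parent_power += child_power[i];

	printf("parent __cpu_power = %lu (~%lu tasks of capacity)\n",
	       parent_power, parent_power / SCHED_LOAD_SCALE);
	return 0;
}
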
5986 5996
5987 /* 5997 /*
5988 * Build sched domains for a given set of cpus and attach the sched domains 5998 * Build sched domains for a given set of cpus and attach the sched domains
5989 * to the individual cpus 5999 * to the individual cpus
5990 */ 6000 */
5991 static int build_sched_domains(const cpumask_t *cpu_map) 6001 static int build_sched_domains(const cpumask_t *cpu_map)
5992 { 6002 {
5993 int i; 6003 int i;
5994 #ifdef CONFIG_NUMA 6004 #ifdef CONFIG_NUMA
5995 struct sched_group **sched_group_nodes = NULL; 6005 struct sched_group **sched_group_nodes = NULL;
5996 int sd_allnodes = 0; 6006 int sd_allnodes = 0;
5997 6007
5998 /* 6008 /*
5999 * Allocate the per-node list of sched groups 6009 * Allocate the per-node list of sched groups
6000 */ 6010 */
6001 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES, 6011 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
6002 GFP_KERNEL); 6012 GFP_KERNEL);
6003 if (!sched_group_nodes) { 6013 if (!sched_group_nodes) {
6004 printk(KERN_WARNING "Can not alloc sched group node list\n"); 6014 printk(KERN_WARNING "Can not alloc sched group node list\n");
6005 return -ENOMEM; 6015 return -ENOMEM;
6006 } 6016 }
6007 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 6017 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
6008 #endif 6018 #endif
6009 6019
6010 /* 6020 /*
6011 * Set up domains for cpus specified by the cpu_map. 6021 * Set up domains for cpus specified by the cpu_map.
6012 */ 6022 */
6013 for_each_cpu_mask(i, *cpu_map) { 6023 for_each_cpu_mask(i, *cpu_map) {
6014 struct sched_domain *sd = NULL, *p; 6024 struct sched_domain *sd = NULL, *p;
6015 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); 6025 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
6016 6026
6017 cpus_and(nodemask, nodemask, *cpu_map); 6027 cpus_and(nodemask, nodemask, *cpu_map);
6018 6028
6019 #ifdef CONFIG_NUMA 6029 #ifdef CONFIG_NUMA
6020 if (cpus_weight(*cpu_map) > 6030 if (cpus_weight(*cpu_map) >
6021 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 6031 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
6022 sd = &per_cpu(allnodes_domains, i); 6032 sd = &per_cpu(allnodes_domains, i);
6023 *sd = SD_ALLNODES_INIT; 6033 *sd = SD_ALLNODES_INIT;
6024 sd->span = *cpu_map; 6034 sd->span = *cpu_map;
6025 cpu_to_allnodes_group(i, cpu_map, &sd->groups); 6035 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
6026 p = sd; 6036 p = sd;
6027 sd_allnodes = 1; 6037 sd_allnodes = 1;
6028 } else 6038 } else
6029 p = NULL; 6039 p = NULL;
6030 6040
6031 sd = &per_cpu(node_domains, i); 6041 sd = &per_cpu(node_domains, i);
6032 *sd = SD_NODE_INIT; 6042 *sd = SD_NODE_INIT;
6033 sd->span = sched_domain_node_span(cpu_to_node(i)); 6043 sd->span = sched_domain_node_span(cpu_to_node(i));
6034 sd->parent = p; 6044 sd->parent = p;
6035 if (p) 6045 if (p)
6036 p->child = sd; 6046 p->child = sd;
6037 cpus_and(sd->span, sd->span, *cpu_map); 6047 cpus_and(sd->span, sd->span, *cpu_map);
6038 #endif 6048 #endif
6039 6049
6040 p = sd; 6050 p = sd;
6041 sd = &per_cpu(phys_domains, i); 6051 sd = &per_cpu(phys_domains, i);
6042 *sd = SD_CPU_INIT; 6052 *sd = SD_CPU_INIT;
6043 sd->span = nodemask; 6053 sd->span = nodemask;
6044 sd->parent = p; 6054 sd->parent = p;
6045 if (p) 6055 if (p)
6046 p->child = sd; 6056 p->child = sd;
6047 cpu_to_phys_group(i, cpu_map, &sd->groups); 6057 cpu_to_phys_group(i, cpu_map, &sd->groups);
6048 6058
6049 #ifdef CONFIG_SCHED_MC 6059 #ifdef CONFIG_SCHED_MC
6050 p = sd; 6060 p = sd;
6051 sd = &per_cpu(core_domains, i); 6061 sd = &per_cpu(core_domains, i);
6052 *sd = SD_MC_INIT; 6062 *sd = SD_MC_INIT;
6053 sd->span = cpu_coregroup_map(i); 6063 sd->span = cpu_coregroup_map(i);
6054 cpus_and(sd->span, sd->span, *cpu_map); 6064 cpus_and(sd->span, sd->span, *cpu_map);
6055 sd->parent = p; 6065 sd->parent = p;
6056 p->child = sd; 6066 p->child = sd;
6057 cpu_to_core_group(i, cpu_map, &sd->groups); 6067 cpu_to_core_group(i, cpu_map, &sd->groups);
6058 #endif 6068 #endif
6059 6069
6060 #ifdef CONFIG_SCHED_SMT 6070 #ifdef CONFIG_SCHED_SMT
6061 p = sd; 6071 p = sd;
6062 sd = &per_cpu(cpu_domains, i); 6072 sd = &per_cpu(cpu_domains, i);
6063 *sd = SD_SIBLING_INIT; 6073 *sd = SD_SIBLING_INIT;
6064 sd->span = cpu_sibling_map[i]; 6074 sd->span = cpu_sibling_map[i];
6065 cpus_and(sd->span, sd->span, *cpu_map); 6075 cpus_and(sd->span, sd->span, *cpu_map);
6066 sd->parent = p; 6076 sd->parent = p;
6067 p->child = sd; 6077 p->child = sd;
6068 cpu_to_cpu_group(i, cpu_map, &sd->groups); 6078 cpu_to_cpu_group(i, cpu_map, &sd->groups);
6069 #endif 6079 #endif
6070 } 6080 }
6071 6081
6072 #ifdef CONFIG_SCHED_SMT 6082 #ifdef CONFIG_SCHED_SMT
6073 /* Set up CPU (sibling) groups */ 6083 /* Set up CPU (sibling) groups */
6074 for_each_cpu_mask(i, *cpu_map) { 6084 for_each_cpu_mask(i, *cpu_map) {
6075 cpumask_t this_sibling_map = cpu_sibling_map[i]; 6085 cpumask_t this_sibling_map = cpu_sibling_map[i];
6076 cpus_and(this_sibling_map, this_sibling_map, *cpu_map); 6086 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
6077 if (i != first_cpu(this_sibling_map)) 6087 if (i != first_cpu(this_sibling_map))
6078 continue; 6088 continue;
6079 6089
6080 init_sched_build_groups(this_sibling_map, cpu_map, 6090 init_sched_build_groups(this_sibling_map, cpu_map,
6081 &cpu_to_cpu_group); 6091 &cpu_to_cpu_group);
6082 } 6092 }
6083 #endif 6093 #endif
6084 6094
6085 #ifdef CONFIG_SCHED_MC 6095 #ifdef CONFIG_SCHED_MC
6086 /* Set up multi-core groups */ 6096 /* Set up multi-core groups */
6087 for_each_cpu_mask(i, *cpu_map) { 6097 for_each_cpu_mask(i, *cpu_map) {
6088 cpumask_t this_core_map = cpu_coregroup_map(i); 6098 cpumask_t this_core_map = cpu_coregroup_map(i);
6089 cpus_and(this_core_map, this_core_map, *cpu_map); 6099 cpus_and(this_core_map, this_core_map, *cpu_map);
6090 if (i != first_cpu(this_core_map)) 6100 if (i != first_cpu(this_core_map))
6091 continue; 6101 continue;
6092 init_sched_build_groups(this_core_map, cpu_map, 6102 init_sched_build_groups(this_core_map, cpu_map,
6093 &cpu_to_core_group); 6103 &cpu_to_core_group);
6094 } 6104 }
6095 #endif 6105 #endif
6096 6106
6097 /* Set up physical groups */ 6107 /* Set up physical groups */
6098 for (i = 0; i < MAX_NUMNODES; i++) { 6108 for (i = 0; i < MAX_NUMNODES; i++) {
6099 cpumask_t nodemask = node_to_cpumask(i); 6109 cpumask_t nodemask = node_to_cpumask(i);
6100 6110
6101 cpus_and(nodemask, nodemask, *cpu_map); 6111 cpus_and(nodemask, nodemask, *cpu_map);
6102 if (cpus_empty(nodemask)) 6112 if (cpus_empty(nodemask))
6103 continue; 6113 continue;
6104 6114
6105 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); 6115 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
6106 } 6116 }
6107 6117
6108 #ifdef CONFIG_NUMA 6118 #ifdef CONFIG_NUMA
6109 /* Set up node groups */ 6119 /* Set up node groups */
6110 if (sd_allnodes) 6120 if (sd_allnodes)
6111 init_sched_build_groups(*cpu_map, cpu_map, 6121 init_sched_build_groups(*cpu_map, cpu_map,
6112 &cpu_to_allnodes_group); 6122 &cpu_to_allnodes_group);
6113 6123
6114 for (i = 0; i < MAX_NUMNODES; i++) { 6124 for (i = 0; i < MAX_NUMNODES; i++) {
6115 /* Set up node groups */ 6125 /* Set up node groups */
6116 struct sched_group *sg, *prev; 6126 struct sched_group *sg, *prev;
6117 cpumask_t nodemask = node_to_cpumask(i); 6127 cpumask_t nodemask = node_to_cpumask(i);
6118 cpumask_t domainspan; 6128 cpumask_t domainspan;
6119 cpumask_t covered = CPU_MASK_NONE; 6129 cpumask_t covered = CPU_MASK_NONE;
6120 int j; 6130 int j;
6121 6131
6122 cpus_and(nodemask, nodemask, *cpu_map); 6132 cpus_and(nodemask, nodemask, *cpu_map);
6123 if (cpus_empty(nodemask)) { 6133 if (cpus_empty(nodemask)) {
6124 sched_group_nodes[i] = NULL; 6134 sched_group_nodes[i] = NULL;
6125 continue; 6135 continue;
6126 } 6136 }
6127 6137
6128 domainspan = sched_domain_node_span(i); 6138 domainspan = sched_domain_node_span(i);
6129 cpus_and(domainspan, domainspan, *cpu_map); 6139 cpus_and(domainspan, domainspan, *cpu_map);
6130 6140
6131 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); 6141 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
6132 if (!sg) { 6142 if (!sg) {
6133 printk(KERN_WARNING "Can not alloc domain group for " 6143 printk(KERN_WARNING "Can not alloc domain group for "
6134 "node %d\n", i); 6144 "node %d\n", i);
6135 goto error; 6145 goto error;
6136 } 6146 }
6137 sched_group_nodes[i] = sg; 6147 sched_group_nodes[i] = sg;
6138 for_each_cpu_mask(j, nodemask) { 6148 for_each_cpu_mask(j, nodemask) {
6139 struct sched_domain *sd; 6149 struct sched_domain *sd;
6140 6150
6141 sd = &per_cpu(node_domains, j); 6151 sd = &per_cpu(node_domains, j);
6142 sd->groups = sg; 6152 sd->groups = sg;
6143 } 6153 }
6144 sg->__cpu_power = 0; 6154 sg->__cpu_power = 0;
6145 sg->cpumask = nodemask; 6155 sg->cpumask = nodemask;
6146 sg->next = sg; 6156 sg->next = sg;
6147 cpus_or(covered, covered, nodemask); 6157 cpus_or(covered, covered, nodemask);
6148 prev = sg; 6158 prev = sg;
6149 6159
6150 for (j = 0; j < MAX_NUMNODES; j++) { 6160 for (j = 0; j < MAX_NUMNODES; j++) {
6151 cpumask_t tmp, notcovered; 6161 cpumask_t tmp, notcovered;
6152 int n = (i + j) % MAX_NUMNODES; 6162 int n = (i + j) % MAX_NUMNODES;
6153 6163
6154 cpus_complement(notcovered, covered); 6164 cpus_complement(notcovered, covered);
6155 cpus_and(tmp, notcovered, *cpu_map); 6165 cpus_and(tmp, notcovered, *cpu_map);
6156 cpus_and(tmp, tmp, domainspan); 6166 cpus_and(tmp, tmp, domainspan);
6157 if (cpus_empty(tmp)) 6167 if (cpus_empty(tmp))
6158 break; 6168 break;
6159 6169
6160 nodemask = node_to_cpumask(n); 6170 nodemask = node_to_cpumask(n);
6161 cpus_and(tmp, tmp, nodemask); 6171 cpus_and(tmp, tmp, nodemask);
6162 if (cpus_empty(tmp)) 6172 if (cpus_empty(tmp))
6163 continue; 6173 continue;
6164 6174
6165 sg = kmalloc_node(sizeof(struct sched_group), 6175 sg = kmalloc_node(sizeof(struct sched_group),
6166 GFP_KERNEL, i); 6176 GFP_KERNEL, i);
6167 if (!sg) { 6177 if (!sg) {
6168 printk(KERN_WARNING 6178 printk(KERN_WARNING
6169 "Can not alloc domain group for node %d\n", j); 6179 "Can not alloc domain group for node %d\n", j);
6170 goto error; 6180 goto error;
6171 } 6181 }
6172 sg->__cpu_power = 0; 6182 sg->__cpu_power = 0;
6173 sg->cpumask = tmp; 6183 sg->cpumask = tmp;
6174 sg->next = prev->next; 6184 sg->next = prev->next;
6175 cpus_or(covered, covered, tmp); 6185 cpus_or(covered, covered, tmp);
6176 prev->next = sg; 6186 prev->next = sg;
6177 prev = sg; 6187 prev = sg;
6178 } 6188 }
6179 } 6189 }
6180 #endif 6190 #endif
6181 6191
6182 /* Calculate CPU power for physical packages and nodes */ 6192 /* Calculate CPU power for physical packages and nodes */
6183 #ifdef CONFIG_SCHED_SMT 6193 #ifdef CONFIG_SCHED_SMT
6184 for_each_cpu_mask(i, *cpu_map) { 6194 for_each_cpu_mask(i, *cpu_map) {
6185 struct sched_domain *sd = &per_cpu(cpu_domains, i); 6195 struct sched_domain *sd = &per_cpu(cpu_domains, i);
6186 6196
6187 init_sched_groups_power(i, sd); 6197 init_sched_groups_power(i, sd);
6188 } 6198 }
6189 #endif 6199 #endif
6190 #ifdef CONFIG_SCHED_MC 6200 #ifdef CONFIG_SCHED_MC
6191 for_each_cpu_mask(i, *cpu_map) { 6201 for_each_cpu_mask(i, *cpu_map) {
6192 struct sched_domain *sd = &per_cpu(core_domains, i); 6202 struct sched_domain *sd = &per_cpu(core_domains, i);
6193 6203
6194 init_sched_groups_power(i, sd); 6204 init_sched_groups_power(i, sd);
6195 } 6205 }
6196 #endif 6206 #endif
6197 6207
6198 for_each_cpu_mask(i, *cpu_map) { 6208 for_each_cpu_mask(i, *cpu_map) {
6199 struct sched_domain *sd = &per_cpu(phys_domains, i); 6209 struct sched_domain *sd = &per_cpu(phys_domains, i);
6200 6210
6201 init_sched_groups_power(i, sd); 6211 init_sched_groups_power(i, sd);
6202 } 6212 }
6203 6213
6204 #ifdef CONFIG_NUMA 6214 #ifdef CONFIG_NUMA
6205 for (i = 0; i < MAX_NUMNODES; i++) 6215 for (i = 0; i < MAX_NUMNODES; i++)
6206 init_numa_sched_groups_power(sched_group_nodes[i]); 6216 init_numa_sched_groups_power(sched_group_nodes[i]);
6207 6217
6208 if (sd_allnodes) { 6218 if (sd_allnodes) {
6209 struct sched_group *sg; 6219 struct sched_group *sg;
6210 6220
6211 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); 6221 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6212 init_numa_sched_groups_power(sg); 6222 init_numa_sched_groups_power(sg);
6213 } 6223 }
6214 #endif 6224 #endif
6215 6225
6216 /* Attach the domains */ 6226 /* Attach the domains */
6217 for_each_cpu_mask(i, *cpu_map) { 6227 for_each_cpu_mask(i, *cpu_map) {
6218 struct sched_domain *sd; 6228 struct sched_domain *sd;
6219 #ifdef CONFIG_SCHED_SMT 6229 #ifdef CONFIG_SCHED_SMT
6220 sd = &per_cpu(cpu_domains, i); 6230 sd = &per_cpu(cpu_domains, i);
6221 #elif defined(CONFIG_SCHED_MC) 6231 #elif defined(CONFIG_SCHED_MC)
6222 sd = &per_cpu(core_domains, i); 6232 sd = &per_cpu(core_domains, i);
6223 #else 6233 #else
6224 sd = &per_cpu(phys_domains, i); 6234 sd = &per_cpu(phys_domains, i);
6225 #endif 6235 #endif
6226 cpu_attach_domain(sd, i); 6236 cpu_attach_domain(sd, i);
6227 } 6237 }
6228 6238
6229 return 0; 6239 return 0;
6230 6240
6231 #ifdef CONFIG_NUMA 6241 #ifdef CONFIG_NUMA
6232 error: 6242 error:
6233 free_sched_groups(cpu_map); 6243 free_sched_groups(cpu_map);
6234 return -ENOMEM; 6244 return -ENOMEM;
6235 #endif 6245 #endif
6236 } 6246 }
6237 /* 6247 /*
6238 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6248 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6239 */ 6249 */
6240 static int arch_init_sched_domains(const cpumask_t *cpu_map) 6250 static int arch_init_sched_domains(const cpumask_t *cpu_map)
6241 { 6251 {
6242 cpumask_t cpu_default_map; 6252 cpumask_t cpu_default_map;
6243 int err; 6253 int err;
6244 6254
6245 /* 6255 /*
6246 * Setup mask for cpus without special case scheduling requirements. 6256 * Setup mask for cpus without special case scheduling requirements.
6247 * For now this just excludes isolated cpus, but could be used to 6257 * For now this just excludes isolated cpus, but could be used to
6248 * exclude other special cases in the future. 6258 * exclude other special cases in the future.
6249 */ 6259 */
6250 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); 6260 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6251 6261
6252 err = build_sched_domains(&cpu_default_map); 6262 err = build_sched_domains(&cpu_default_map);
6253 6263
6254 return err; 6264 return err;
6255 } 6265 }
6256 6266
6257 static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 6267 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
6258 { 6268 {
6259 free_sched_groups(cpu_map); 6269 free_sched_groups(cpu_map);
6260 } 6270 }
6261 6271
6262 /* 6272 /*
6263 * Detach sched domains from a group of cpus specified in cpu_map. 6273 * Detach sched domains from a group of cpus specified in cpu_map.
6264 * These cpus will now be attached to the NULL domain. 6274 * These cpus will now be attached to the NULL domain.
6265 */ 6275 */
6266 static void detach_destroy_domains(const cpumask_t *cpu_map) 6276 static void detach_destroy_domains(const cpumask_t *cpu_map)
6267 { 6277 {
6268 int i; 6278 int i;
6269 6279
6270 for_each_cpu_mask(i, *cpu_map) 6280 for_each_cpu_mask(i, *cpu_map)
6271 cpu_attach_domain(NULL, i); 6281 cpu_attach_domain(NULL, i);
6272 synchronize_sched(); 6282 synchronize_sched();
6273 arch_destroy_sched_domains(cpu_map); 6283 arch_destroy_sched_domains(cpu_map);
6274 } 6284 }
6275 6285
6276 /* 6286 /*
6277 * Partition sched domains as specified by the cpumasks below. 6287 * Partition sched domains as specified by the cpumasks below.
6278 * This attaches all cpus from the cpumasks to the NULL domain, 6288 * This attaches all cpus from the cpumasks to the NULL domain,
6279 * waits for an RCU quiescent period, recalculates sched 6289 * waits for an RCU quiescent period, recalculates sched
6280 * domain information and then attaches them back to the 6290 * domain information and then attaches them back to the
6281 * correct sched domains. 6291 * correct sched domains.
6282 * Call with the hotplug lock held. 6292 * Call with the hotplug lock held.
6283 */ 6293 */
6284 int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) 6294 int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6285 { 6295 {
6286 cpumask_t change_map; 6296 cpumask_t change_map;
6287 int err = 0; 6297 int err = 0;
6288 6298
6289 cpus_and(*partition1, *partition1, cpu_online_map); 6299 cpus_and(*partition1, *partition1, cpu_online_map);
6290 cpus_and(*partition2, *partition2, cpu_online_map); 6300 cpus_and(*partition2, *partition2, cpu_online_map);
6291 cpus_or(change_map, *partition1, *partition2); 6301 cpus_or(change_map, *partition1, *partition2);
6292 6302
6293 /* Detach sched domains from all of the affected cpus */ 6303 /* Detach sched domains from all of the affected cpus */
6294 detach_destroy_domains(&change_map); 6304 detach_destroy_domains(&change_map);
6295 if (!cpus_empty(*partition1)) 6305 if (!cpus_empty(*partition1))
6296 err = build_sched_domains(partition1); 6306 err = build_sched_domains(partition1);
6297 if (!err && !cpus_empty(*partition2)) 6307 if (!err && !cpus_empty(*partition2))
6298 err = build_sched_domains(partition2); 6308 err = build_sched_domains(partition2);
6299 6309
6300 return err; 6310 return err;
6301 } 6311 }
6302 6312
6303 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 6313 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6304 int arch_reinit_sched_domains(void) 6314 int arch_reinit_sched_domains(void)
6305 { 6315 {
6306 int err; 6316 int err;
6307 6317
6308 mutex_lock(&sched_hotcpu_mutex); 6318 mutex_lock(&sched_hotcpu_mutex);
6309 detach_destroy_domains(&cpu_online_map); 6319 detach_destroy_domains(&cpu_online_map);
6310 err = arch_init_sched_domains(&cpu_online_map); 6320 err = arch_init_sched_domains(&cpu_online_map);
6311 mutex_unlock(&sched_hotcpu_mutex); 6321 mutex_unlock(&sched_hotcpu_mutex);
6312 6322
6313 return err; 6323 return err;
6314 } 6324 }
6315 6325
6316 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 6326 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6317 { 6327 {
6318 int ret; 6328 int ret;
6319 6329
6320 if (buf[0] != '0' && buf[0] != '1') 6330 if (buf[0] != '0' && buf[0] != '1')
6321 return -EINVAL; 6331 return -EINVAL;
6322 6332
6323 if (smt) 6333 if (smt)
6324 sched_smt_power_savings = (buf[0] == '1'); 6334 sched_smt_power_savings = (buf[0] == '1');
6325 else 6335 else
6326 sched_mc_power_savings = (buf[0] == '1'); 6336 sched_mc_power_savings = (buf[0] == '1');
6327 6337
6328 ret = arch_reinit_sched_domains(); 6338 ret = arch_reinit_sched_domains();
6329 6339
6330 return ret ? ret : count; 6340 return ret ? ret : count;
6331 } 6341 }
6332 6342
6333 int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 6343 int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6334 { 6344 {
6335 int err = 0; 6345 int err = 0;
6336 6346
6337 #ifdef CONFIG_SCHED_SMT 6347 #ifdef CONFIG_SCHED_SMT
6338 if (smt_capable()) 6348 if (smt_capable())
6339 err = sysfs_create_file(&cls->kset.kobj, 6349 err = sysfs_create_file(&cls->kset.kobj,
6340 &attr_sched_smt_power_savings.attr); 6350 &attr_sched_smt_power_savings.attr);
6341 #endif 6351 #endif
6342 #ifdef CONFIG_SCHED_MC 6352 #ifdef CONFIG_SCHED_MC
6343 if (!err && mc_capable()) 6353 if (!err && mc_capable())
6344 err = sysfs_create_file(&cls->kset.kobj, 6354 err = sysfs_create_file(&cls->kset.kobj,
6345 &attr_sched_mc_power_savings.attr); 6355 &attr_sched_mc_power_savings.attr);
6346 #endif 6356 #endif
6347 return err; 6357 return err;
6348 } 6358 }
6349 #endif 6359 #endif
6350 6360
6351 #ifdef CONFIG_SCHED_MC 6361 #ifdef CONFIG_SCHED_MC
6352 static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) 6362 static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6353 { 6363 {
6354 return sprintf(page, "%u\n", sched_mc_power_savings); 6364 return sprintf(page, "%u\n", sched_mc_power_savings);
6355 } 6365 }
6356 static ssize_t sched_mc_power_savings_store(struct sys_device *dev, 6366 static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6357 const char *buf, size_t count) 6367 const char *buf, size_t count)
6358 { 6368 {
6359 return sched_power_savings_store(buf, count, 0); 6369 return sched_power_savings_store(buf, count, 0);
6360 } 6370 }
6361 SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, 6371 SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6362 sched_mc_power_savings_store); 6372 sched_mc_power_savings_store);
6363 #endif 6373 #endif
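
Usage note (a sketch, not part of the patch): the attribute created above is normally visible as /sys/devices/system/cpu/sched_mc_power_savings and, per sched_power_savings_store(), accepts only '0' or '1'; writing to it triggers arch_reinit_sched_domains(). A minimal user-space toggle, assuming that conventional path and root privileges:

#include <stdio.h>

int main(void)
{
	/* Path assumed from the cpu sysdev class; adjust if your tree differs. */
	FILE *f = fopen("/sys/devices/system/cpu/sched_mc_power_savings", "w");

	if (!f) {
		perror("sched_mc_power_savings");
		return 1;
	}
	fputc('1', f);	/* sched_power_savings_store() accepts only '0' or '1' */
	return fclose(f) ? 1 : 0;
}
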
6364 6374
6365 #ifdef CONFIG_SCHED_SMT 6375 #ifdef CONFIG_SCHED_SMT
6366 static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) 6376 static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6367 { 6377 {
6368 return sprintf(page, "%u\n", sched_smt_power_savings); 6378 return sprintf(page, "%u\n", sched_smt_power_savings);
6369 } 6379 }
6370 static ssize_t sched_smt_power_savings_store(struct sys_device *dev, 6380 static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6371 const char *buf, size_t count) 6381 const char *buf, size_t count)
6372 { 6382 {
6373 return sched_power_savings_store(buf, count, 1); 6383 return sched_power_savings_store(buf, count, 1);
6374 } 6384 }
6375 SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, 6385 SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6376 sched_smt_power_savings_store); 6386 sched_smt_power_savings_store);
6377 #endif 6387 #endif
6378 6388
6379 /* 6389 /*
6380 * Force a reinitialization of the sched domains hierarchy. The domains 6390 * Force a reinitialization of the sched domains hierarchy. The domains
6381 * and groups cannot be updated in place without racing with the balancing 6391 * and groups cannot be updated in place without racing with the balancing
6382 * code, so we temporarily attach all running cpus to the NULL domain 6392 * code, so we temporarily attach all running cpus to the NULL domain
6383 * which will prevent rebalancing while the sched domains are recalculated. 6393 * which will prevent rebalancing while the sched domains are recalculated.
6384 */ 6394 */
6385 static int update_sched_domains(struct notifier_block *nfb, 6395 static int update_sched_domains(struct notifier_block *nfb,
6386 unsigned long action, void *hcpu) 6396 unsigned long action, void *hcpu)
6387 { 6397 {
6388 switch (action) { 6398 switch (action) {
6389 case CPU_UP_PREPARE: 6399 case CPU_UP_PREPARE:
6390 case CPU_UP_PREPARE_FROZEN: 6400 case CPU_UP_PREPARE_FROZEN:
6391 case CPU_DOWN_PREPARE: 6401 case CPU_DOWN_PREPARE:
6392 case CPU_DOWN_PREPARE_FROZEN: 6402 case CPU_DOWN_PREPARE_FROZEN:
6393 detach_destroy_domains(&cpu_online_map); 6403 detach_destroy_domains(&cpu_online_map);
6394 return NOTIFY_OK; 6404 return NOTIFY_OK;
6395 6405
6396 case CPU_UP_CANCELED: 6406 case CPU_UP_CANCELED:
6397 case CPU_UP_CANCELED_FROZEN: 6407 case CPU_UP_CANCELED_FROZEN:
6398 case CPU_DOWN_FAILED: 6408 case CPU_DOWN_FAILED:
6399 case CPU_DOWN_FAILED_FROZEN: 6409 case CPU_DOWN_FAILED_FROZEN:
6400 case CPU_ONLINE: 6410 case CPU_ONLINE:
6401 case CPU_ONLINE_FROZEN: 6411 case CPU_ONLINE_FROZEN:
6402 case CPU_DEAD: 6412 case CPU_DEAD:
6403 case CPU_DEAD_FROZEN: 6413 case CPU_DEAD_FROZEN:
6404 /* 6414 /*
6405 * Fall through and re-initialise the domains. 6415 * Fall through and re-initialise the domains.
6406 */ 6416 */
6407 break; 6417 break;
6408 default: 6418 default:
6409 return NOTIFY_DONE; 6419 return NOTIFY_DONE;
6410 } 6420 }
6411 6421
6412 /* The hotplug lock is already held by cpu_up/cpu_down */ 6422 /* The hotplug lock is already held by cpu_up/cpu_down */
6413 arch_init_sched_domains(&cpu_online_map); 6423 arch_init_sched_domains(&cpu_online_map);
6414 6424
6415 return NOTIFY_OK; 6425 return NOTIFY_OK;
6416 } 6426 }
6417 6427
6418 void __init sched_init_smp(void) 6428 void __init sched_init_smp(void)
6419 { 6429 {
6420 cpumask_t non_isolated_cpus; 6430 cpumask_t non_isolated_cpus;
6421 6431
6422 mutex_lock(&sched_hotcpu_mutex); 6432 mutex_lock(&sched_hotcpu_mutex);
6423 arch_init_sched_domains(&cpu_online_map); 6433 arch_init_sched_domains(&cpu_online_map);
6424 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 6434 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6425 if (cpus_empty(non_isolated_cpus)) 6435 if (cpus_empty(non_isolated_cpus))
6426 cpu_set(smp_processor_id(), non_isolated_cpus); 6436 cpu_set(smp_processor_id(), non_isolated_cpus);
6427 mutex_unlock(&sched_hotcpu_mutex); 6437 mutex_unlock(&sched_hotcpu_mutex);
6428 /* XXX: Theoretical race here - CPU may be hotplugged now */ 6438 /* XXX: Theoretical race here - CPU may be hotplugged now */
6429 hotcpu_notifier(update_sched_domains, 0); 6439 hotcpu_notifier(update_sched_domains, 0);
6430 6440
6431 init_sched_domain_sysctl(); 6441 init_sched_domain_sysctl();
6432 6442
6433 /* Move init over to a non-isolated CPU */ 6443 /* Move init over to a non-isolated CPU */
6434 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 6444 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6435 BUG(); 6445 BUG();
6436 sched_init_granularity(); 6446 sched_init_granularity();
6437 } 6447 }
6438 #else 6448 #else
6439 void __init sched_init_smp(void) 6449 void __init sched_init_smp(void)
6440 { 6450 {
6441 sched_init_granularity(); 6451 sched_init_granularity();
6442 } 6452 }
6443 #endif /* CONFIG_SMP */ 6453 #endif /* CONFIG_SMP */
6444 6454
6445 int in_sched_functions(unsigned long addr) 6455 int in_sched_functions(unsigned long addr)
6446 { 6456 {
6447 /* Linker adds these: start and end of __sched functions */ 6457 /* Linker adds these: start and end of __sched functions */
6448 extern char __sched_text_start[], __sched_text_end[]; 6458 extern char __sched_text_start[], __sched_text_end[];
6449 6459
6450 return in_lock_functions(addr) || 6460 return in_lock_functions(addr) ||
6451 (addr >= (unsigned long)__sched_text_start 6461 (addr >= (unsigned long)__sched_text_start
6452 && addr < (unsigned long)__sched_text_end); 6462 && addr < (unsigned long)__sched_text_end);
6453 } 6463 }
6454 6464
6455 static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) 6465 static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6456 { 6466 {
6457 cfs_rq->tasks_timeline = RB_ROOT; 6467 cfs_rq->tasks_timeline = RB_ROOT;
6458 cfs_rq->fair_clock = 1; 6468 cfs_rq->fair_clock = 1;
6459 #ifdef CONFIG_FAIR_GROUP_SCHED 6469 #ifdef CONFIG_FAIR_GROUP_SCHED
6460 cfs_rq->rq = rq; 6470 cfs_rq->rq = rq;
6461 #endif 6471 #endif
6462 } 6472 }
6463 6473
6464 void __init sched_init(void) 6474 void __init sched_init(void)
6465 { 6475 {
6466 u64 now = sched_clock(); 6476 u64 now = sched_clock();
6467 int highest_cpu = 0; 6477 int highest_cpu = 0;
6468 int i, j; 6478 int i, j;
6469 6479
6470 /* 6480 /*
6471 * Link up the scheduling class hierarchy: 6481 * Link up the scheduling class hierarchy:
6472 */ 6482 */
6473 rt_sched_class.next = &fair_sched_class; 6483 rt_sched_class.next = &fair_sched_class;
6474 fair_sched_class.next = &idle_sched_class; 6484 fair_sched_class.next = &idle_sched_class;
6475 idle_sched_class.next = NULL; 6485 idle_sched_class.next = NULL;
6476 6486
6477 for_each_possible_cpu(i) { 6487 for_each_possible_cpu(i) {
6478 struct rt_prio_array *array; 6488 struct rt_prio_array *array;
6479 struct rq *rq; 6489 struct rq *rq;
6480 6490
6481 rq = cpu_rq(i); 6491 rq = cpu_rq(i);
6482 spin_lock_init(&rq->lock); 6492 spin_lock_init(&rq->lock);
6483 lockdep_set_class(&rq->lock, &rq->rq_lock_key); 6493 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
6484 rq->nr_running = 0; 6494 rq->nr_running = 0;
6485 rq->clock = 1; 6495 rq->clock = 1;
6486 init_cfs_rq(&rq->cfs, rq); 6496 init_cfs_rq(&rq->cfs, rq);
6487 #ifdef CONFIG_FAIR_GROUP_SCHED 6497 #ifdef CONFIG_FAIR_GROUP_SCHED
6488 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6498 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6489 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 6499 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6490 #endif 6500 #endif
6491 rq->ls.load_update_last = now; 6501 rq->ls.load_update_last = now;
6492 rq->ls.load_update_start = now; 6502 rq->ls.load_update_start = now;
6493 6503
6494 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 6504 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6495 rq->cpu_load[j] = 0; 6505 rq->cpu_load[j] = 0;
6496 #ifdef CONFIG_SMP 6506 #ifdef CONFIG_SMP
6497 rq->sd = NULL; 6507 rq->sd = NULL;
6498 rq->active_balance = 0; 6508 rq->active_balance = 0;
6499 rq->next_balance = jiffies; 6509 rq->next_balance = jiffies;
6500 rq->push_cpu = 0; 6510 rq->push_cpu = 0;
6501 rq->cpu = i; 6511 rq->cpu = i;
6502 rq->migration_thread = NULL; 6512 rq->migration_thread = NULL;
6503 INIT_LIST_HEAD(&rq->migration_queue); 6513 INIT_LIST_HEAD(&rq->migration_queue);
6504 #endif 6514 #endif
6505 atomic_set(&rq->nr_iowait, 0); 6515 atomic_set(&rq->nr_iowait, 0);
6506 6516
6507 array = &rq->rt.active; 6517 array = &rq->rt.active;
6508 for (j = 0; j < MAX_RT_PRIO; j++) { 6518 for (j = 0; j < MAX_RT_PRIO; j++) {
6509 INIT_LIST_HEAD(array->queue + j); 6519 INIT_LIST_HEAD(array->queue + j);
6510 __clear_bit(j, array->bitmap); 6520 __clear_bit(j, array->bitmap);
6511 } 6521 }
6512 highest_cpu = i; 6522 highest_cpu = i;
6513 /* delimiter for bitsearch: */ 6523 /* delimiter for bitsearch: */
6514 __set_bit(MAX_RT_PRIO, array->bitmap); 6524 __set_bit(MAX_RT_PRIO, array->bitmap);
6515 } 6525 }
6516 6526
6517 set_load_weight(&init_task); 6527 set_load_weight(&init_task);
6518 6528
6519 #ifdef CONFIG_PREEMPT_NOTIFIERS 6529 #ifdef CONFIG_PREEMPT_NOTIFIERS
6520 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6530 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
6521 #endif 6531 #endif
6522 6532
6523 #ifdef CONFIG_SMP 6533 #ifdef CONFIG_SMP
6524 nr_cpu_ids = highest_cpu + 1; 6534 nr_cpu_ids = highest_cpu + 1;
6525 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); 6535 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6526 #endif 6536 #endif
6527 6537
6528 #ifdef CONFIG_RT_MUTEXES 6538 #ifdef CONFIG_RT_MUTEXES
6529 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); 6539 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6530 #endif 6540 #endif
6531 6541
6532 /* 6542 /*
6533 * The boot idle thread does lazy MMU switching as well: 6543 * The boot idle thread does lazy MMU switching as well:
6534 */ 6544 */
6535 atomic_inc(&init_mm.mm_count); 6545 atomic_inc(&init_mm.mm_count);
6536 enter_lazy_tlb(&init_mm, current); 6546 enter_lazy_tlb(&init_mm, current);
6537 6547
6538 /* 6548 /*
6539 * Make us the idle thread. Technically, schedule() should not be 6549 * Make us the idle thread. Technically, schedule() should not be
6540 * called from this thread, however somewhere below it might be, 6550 * called from this thread, however somewhere below it might be,
6541 * but because we are the idle thread, we just pick up running again 6551 * but because we are the idle thread, we just pick up running again
6542 * when this runqueue becomes "idle". 6552 * when this runqueue becomes "idle".
6543 */ 6553 */
6544 init_idle(current, smp_processor_id()); 6554 init_idle(current, smp_processor_id());
6545 /* 6555 /*
6546 * During early bootup we pretend to be a normal task: 6556 * During early bootup we pretend to be a normal task:
6547 */ 6557 */
6548 current->sched_class = &fair_sched_class; 6558 current->sched_class = &fair_sched_class;
6549 } 6559 }
6550 6560
6551 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6561 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6552 void __might_sleep(char *file, int line) 6562 void __might_sleep(char *file, int line)
6553 { 6563 {
6554 #ifdef in_atomic 6564 #ifdef in_atomic
6555 static unsigned long prev_jiffy; /* ratelimiting */ 6565 static unsigned long prev_jiffy; /* ratelimiting */
6556 6566
6557 if ((in_atomic() || irqs_disabled()) && 6567 if ((in_atomic() || irqs_disabled()) &&
6558 system_state == SYSTEM_RUNNING && !oops_in_progress) { 6568 system_state == SYSTEM_RUNNING && !oops_in_progress) {
6559 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6569 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6560 return; 6570 return;
6561 prev_jiffy = jiffies; 6571 prev_jiffy = jiffies;
6562 printk(KERN_ERR "BUG: sleeping function called from invalid" 6572 printk(KERN_ERR "BUG: sleeping function called from invalid"
6563 " context at %s:%d\n", file, line); 6573 " context at %s:%d\n", file, line);
6564 printk("in_atomic():%d, irqs_disabled():%d\n", 6574 printk("in_atomic():%d, irqs_disabled():%d\n",
6565 in_atomic(), irqs_disabled()); 6575 in_atomic(), irqs_disabled());
6566 debug_show_held_locks(current); 6576 debug_show_held_locks(current);
6567 if (irqs_disabled()) 6577 if (irqs_disabled())
6568 print_irqtrace_events(current); 6578 print_irqtrace_events(current);
6569 dump_stack(); 6579 dump_stack();
6570 } 6580 }
6571 #endif 6581 #endif
6572 } 6582 }
6573 EXPORT_SYMBOL(__might_sleep); 6583 EXPORT_SYMBOL(__might_sleep);
6574 #endif 6584 #endif
6575 6585
6576 #ifdef CONFIG_MAGIC_SYSRQ 6586 #ifdef CONFIG_MAGIC_SYSRQ
6577 void normalize_rt_tasks(void) 6587 void normalize_rt_tasks(void)
6578 { 6588 {
6579 struct task_struct *g, *p; 6589 struct task_struct *g, *p;
6580 unsigned long flags; 6590 unsigned long flags;
6581 struct rq *rq; 6591 struct rq *rq;
6582 int on_rq; 6592 int on_rq;
6583 6593
6584 read_lock_irq(&tasklist_lock); 6594 read_lock_irq(&tasklist_lock);
6585 do_each_thread(g, p) { 6595 do_each_thread(g, p) {
6586 p->se.fair_key = 0; 6596 p->se.fair_key = 0;
6587 p->se.wait_runtime = 0; 6597 p->se.wait_runtime = 0;
6588 p->se.exec_start = 0; 6598 p->se.exec_start = 0;
6589 p->se.wait_start_fair = 0; 6599 p->se.wait_start_fair = 0;
6590 p->se.sleep_start_fair = 0; 6600 p->se.sleep_start_fair = 0;
6591 #ifdef CONFIG_SCHEDSTATS 6601 #ifdef CONFIG_SCHEDSTATS
6592 p->se.wait_start = 0; 6602 p->se.wait_start = 0;
6593 p->se.sleep_start = 0; 6603 p->se.sleep_start = 0;
6594 p->se.block_start = 0; 6604 p->se.block_start = 0;
6595 #endif 6605 #endif
6596 task_rq(p)->cfs.fair_clock = 0; 6606 task_rq(p)->cfs.fair_clock = 0;
6597 task_rq(p)->clock = 0; 6607 task_rq(p)->clock = 0;
6598 6608
6599 if (!rt_task(p)) { 6609 if (!rt_task(p)) {
6600 /* 6610 /*
6601 * Renice negative nice level userspace 6611 * Renice negative nice level userspace
6602 * tasks back to 0: 6612 * tasks back to 0:
6603 */ 6613 */
6604 if (TASK_NICE(p) < 0 && p->mm) 6614 if (TASK_NICE(p) < 0 && p->mm)
6605 set_user_nice(p, 0); 6615 set_user_nice(p, 0);
6606 continue; 6616 continue;
6607 } 6617 }
6608 6618
6609 spin_lock_irqsave(&p->pi_lock, flags); 6619 spin_lock_irqsave(&p->pi_lock, flags);
6610 rq = __task_rq_lock(p); 6620 rq = __task_rq_lock(p);
6611 #ifdef CONFIG_SMP 6621 #ifdef CONFIG_SMP
6612 /* 6622 /*
6613 * Do not touch the migration thread: 6623 * Do not touch the migration thread:
6614 */ 6624 */
6615 if (p == rq->migration_thread) 6625 if (p == rq->migration_thread)
6616 goto out_unlock; 6626 goto out_unlock;
6617 #endif 6627 #endif
6618 6628
6619 on_rq = p->se.on_rq; 6629 on_rq = p->se.on_rq;
6620 if (on_rq) 6630 if (on_rq)
6621 deactivate_task(task_rq(p), p, 0); 6631 deactivate_task(task_rq(p), p, 0);
6622 __setscheduler(rq, p, SCHED_NORMAL, 0); 6632 __setscheduler(rq, p, SCHED_NORMAL, 0);
6623 if (on_rq) { 6633 if (on_rq) {
6624 activate_task(task_rq(p), p, 0); 6634 activate_task(task_rq(p), p, 0);
6625 resched_task(rq->curr); 6635 resched_task(rq->curr);
6626 } 6636 }
6627 #ifdef CONFIG_SMP 6637 #ifdef CONFIG_SMP
6628 out_unlock: 6638 out_unlock:
6629 #endif 6639 #endif
6630 __task_rq_unlock(rq); 6640 __task_rq_unlock(rq);
6631 spin_unlock_irqrestore(&p->pi_lock, flags); 6641 spin_unlock_irqrestore(&p->pi_lock, flags);
6632 } while_each_thread(g, p); 6642 } while_each_thread(g, p);
6633 6643
6634 read_unlock_irq(&tasklist_lock); 6644 read_unlock_irq(&tasklist_lock);
6635 } 6645 }
6636 6646
6637 #endif /* CONFIG_MAGIC_SYSRQ */ 6647 #endif /* CONFIG_MAGIC_SYSRQ */
6638 6648
6639 #ifdef CONFIG_IA64 6649 #ifdef CONFIG_IA64
6640 /* 6650 /*
6641 * These functions are only useful for the IA64 MCA handling. 6651 * These functions are only useful for the IA64 MCA handling.
6642 * 6652 *
6643 * They can only be called when the whole system has been 6653 * They can only be called when the whole system has been
6644 * stopped - every CPU needs to be quiescent, and no scheduling 6654 * stopped - every CPU needs to be quiescent, and no scheduling
6645 * activity can take place. Using them for anything else would 6655 * activity can take place. Using them for anything else would
6646 * be a serious bug, and as a result, they aren't even visible 6656 * be a serious bug, and as a result, they aren't even visible
6647 * under any other configuration. 6657 * under any other configuration.
6648 */ 6658 */
6649 6659
6650 /** 6660 /**
6651 * curr_task - return the current task for a given cpu. 6661 * curr_task - return the current task for a given cpu.
6652 * @cpu: the processor in question. 6662 * @cpu: the processor in question.
6653 * 6663 *
6654 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6664 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6655 */ 6665 */
6656 struct task_struct *curr_task(int cpu) 6666 struct task_struct *curr_task(int cpu)
6657 { 6667 {
6658 return cpu_curr(cpu); 6668 return cpu_curr(cpu);
6659 } 6669 }
6660 6670
6661 /** 6671 /**
6662 * set_curr_task - set the current task for a given cpu. 6672 * set_curr_task - set the current task for a given cpu.
6663 * @cpu: the processor in question. 6673 * @cpu: the processor in question.
6664 * @p: the task pointer to set. 6674 * @p: the task pointer to set.
6665 * 6675 *
6666 * Description: This function must only be used when non-maskable interrupts 6676 * Description: This function must only be used when non-maskable interrupts
6667 * are serviced on a separate stack. It allows the architecture to switch the 6677 * are serviced on a separate stack. It allows the architecture to switch the
6668 * notion of the current task on a cpu in a non-blocking manner. This function 6678 * notion of the current task on a cpu in a non-blocking manner. This function
669 * must be called with all CPU's synchronized, and interrupts disabled, 6679 * must be called with all CPU's synchronized, and interrupts disabled,
670 * and the caller must save the original value of the current task (see 6680 * and the caller must save the original value of the current task (see
6671 * curr_task() above) and restore that value before reenabling interrupts and 6681 * curr_task() above) and restore that value before reenabling interrupts and
6672 * re-starting the system. 6682 * re-starting the system.
kernel/sched_fair.c
1 /* 1 /*
2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH) 2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3 * 3 *
4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5 * 5 *
6 * Interactivity improvements by Mike Galbraith 6 * Interactivity improvements by Mike Galbraith
7 * (C) 2007 Mike Galbraith <efault@gmx.de> 7 * (C) 2007 Mike Galbraith <efault@gmx.de>
8 * 8 *
9 * Various enhancements by Dmitry Adamushko. 9 * Various enhancements by Dmitry Adamushko.
10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com> 10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11 * 11 *
12 * Group scheduling enhancements by Srivatsa Vaddagiri 12 * Group scheduling enhancements by Srivatsa Vaddagiri
13 * Copyright IBM Corporation, 2007 13 * Copyright IBM Corporation, 2007
14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> 14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15 * 15 *
16 * Scaled math optimizations by Thomas Gleixner 16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> 17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18 */ 18 */
19 19
20 /* 20 /*
21 * Preemption granularity: 21 * Preemption granularity:
22 * (default: 2 msec, units: nanoseconds) 22 * (default: 2 msec, units: nanoseconds)
23 * 23 *
24 * NOTE: this granularity value is not the same as the concept of 24 * NOTE: this granularity value is not the same as the concept of
25 * 'timeslice length' - timeslices in CFS will typically be somewhat 25 * 'timeslice length' - timeslices in CFS will typically be somewhat
26 * larger than this value. (to see the precise effective timeslice 26 * larger than this value. (to see the precise effective timeslice
27 * length of your workload, run vmstat and monitor the context-switches 27 * length of your workload, run vmstat and monitor the context-switches
28 * field) 28 * field)
29 * 29 *
30 * On SMP systems the value of this is multiplied by the log2 of the 30 * On SMP systems the value of this is multiplied by the log2 of the
31 * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way 31 * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way
32 * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) 32 * systems, 4x on 8-way systems, 5x on 16-way systems, etc.)
33 */ 33 */
34 unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ; 34 unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ;
35 35
36 /* 36 /*
37 * SCHED_BATCH wake-up granularity. 37 * SCHED_BATCH wake-up granularity.
38 * (default: 10 msec, units: nanoseconds) 38 * (default: 10 msec, units: nanoseconds)
39 * 39 *
40 * This option delays the preemption effects of decoupled workloads 40 * This option delays the preemption effects of decoupled workloads
41 * and reduces their over-scheduling. Synchronous workloads will still 41 * and reduces their over-scheduling. Synchronous workloads will still
42 * have immediate wakeup/sleep latencies. 42 * have immediate wakeup/sleep latencies.
43 */ 43 */
44 unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = 44 unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly =
45 10000000000ULL/HZ; 45 10000000000ULL/HZ;
46 46
47 /* 47 /*
48 * SCHED_OTHER wake-up granularity. 48 * SCHED_OTHER wake-up granularity.
49 * (default: 1 msec, units: nanoseconds) 49 * (default: 1 msec, units: nanoseconds)
50 * 50 *
51 * This option delays the preemption effects of decoupled workloads 51 * This option delays the preemption effects of decoupled workloads
52 * and reduces their over-scheduling. Synchronous workloads will still 52 * and reduces their over-scheduling. Synchronous workloads will still
53 * have immediate wakeup/sleep latencies. 53 * have immediate wakeup/sleep latencies.
54 */ 54 */
55 unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ; 55 unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ;
56 56
57 unsigned int sysctl_sched_stat_granularity __read_mostly; 57 unsigned int sysctl_sched_stat_granularity __read_mostly;
58 58
59 /* 59 /*
60 * Initialized in sched_init_granularity(): 60 * Initialized in sched_init_granularity():
61 */ 61 */
62 unsigned int sysctl_sched_runtime_limit __read_mostly; 62 unsigned int sysctl_sched_runtime_limit __read_mostly;
63 63
64 /* 64 /*
65 * Debugging: various feature bits 65 * Debugging: various feature bits
66 */ 66 */
67 enum { 67 enum {
68 SCHED_FEAT_FAIR_SLEEPERS = 1, 68 SCHED_FEAT_FAIR_SLEEPERS = 1,
69 SCHED_FEAT_SLEEPER_AVG = 2, 69 SCHED_FEAT_SLEEPER_AVG = 2,
70 SCHED_FEAT_SLEEPER_LOAD_AVG = 4, 70 SCHED_FEAT_SLEEPER_LOAD_AVG = 4,
71 SCHED_FEAT_PRECISE_CPU_LOAD = 8, 71 SCHED_FEAT_PRECISE_CPU_LOAD = 8,
72 SCHED_FEAT_START_DEBIT = 16, 72 SCHED_FEAT_START_DEBIT = 16,
73 SCHED_FEAT_SKIP_INITIAL = 32, 73 SCHED_FEAT_SKIP_INITIAL = 32,
74 }; 74 };
75 75
76 unsigned int sysctl_sched_features __read_mostly = 76 unsigned int sysctl_sched_features __read_mostly =
77 SCHED_FEAT_FAIR_SLEEPERS *1 | 77 SCHED_FEAT_FAIR_SLEEPERS *1 |
78 SCHED_FEAT_SLEEPER_AVG *1 | 78 SCHED_FEAT_SLEEPER_AVG *1 |
79 SCHED_FEAT_SLEEPER_LOAD_AVG *1 | 79 SCHED_FEAT_SLEEPER_LOAD_AVG *1 |
80 SCHED_FEAT_PRECISE_CPU_LOAD *1 | 80 SCHED_FEAT_PRECISE_CPU_LOAD *1 |
81 SCHED_FEAT_START_DEBIT *1 | 81 SCHED_FEAT_START_DEBIT *1 |
82 SCHED_FEAT_SKIP_INITIAL *0; 82 SCHED_FEAT_SKIP_INITIAL *0;
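
The "*1" / "*0" multipliers above are simply a readable way of composing a constant bitmask, one feature per line. A minimal standalone sketch of the idiom (the FEAT_* names below are made up for illustration, not scheduler features):

#include <stdio.h>

enum { FEAT_A = 1, FEAT_B = 2, FEAT_C = 4 };

/* one feature per line; disabled features are multiplied by 0 */
static const unsigned int features =
	FEAT_A * 1 |
	FEAT_B * 0 |
	FEAT_C * 1;

int main(void)
{
	printf("FEAT_B enabled: %d\n", !!(features & FEAT_B));	/* 0 */
	printf("FEAT_C enabled: %d\n", !!(features & FEAT_C));	/* 1 */
	return 0;
}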
83 83
84 extern struct sched_class fair_sched_class; 84 extern struct sched_class fair_sched_class;
85 85
86 /************************************************************** 86 /**************************************************************
87 * CFS operations on generic schedulable entities: 87 * CFS operations on generic schedulable entities:
88 */ 88 */
89 89
90 #ifdef CONFIG_FAIR_GROUP_SCHED 90 #ifdef CONFIG_FAIR_GROUP_SCHED
91 91
92 /* cpu runqueue to which this cfs_rq is attached */ 92 /* cpu runqueue to which this cfs_rq is attached */
93 static inline struct rq *rq_of(struct cfs_rq *cfs_rq) 93 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
94 { 94 {
95 return cfs_rq->rq; 95 return cfs_rq->rq;
96 } 96 }
97 97
98 /* currently running entity (if any) on this cfs_rq */ 98 /* currently running entity (if any) on this cfs_rq */
99 static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) 99 static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
100 { 100 {
101 return cfs_rq->curr; 101 return cfs_rq->curr;
102 } 102 }
103 103
104 /* An entity is a task if it doesn't "own" a runqueue */ 104 /* An entity is a task if it doesn't "own" a runqueue */
105 #define entity_is_task(se) (!se->my_q) 105 #define entity_is_task(se) (!se->my_q)
106 106
107 static inline void 107 static inline void
108 set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) 108 set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se)
109 { 109 {
110 cfs_rq->curr = se; 110 cfs_rq->curr = se;
111 } 111 }
112 112
113 #else /* CONFIG_FAIR_GROUP_SCHED */ 113 #else /* CONFIG_FAIR_GROUP_SCHED */
114 114
115 static inline struct rq *rq_of(struct cfs_rq *cfs_rq) 115 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
116 { 116 {
117 return container_of(cfs_rq, struct rq, cfs); 117 return container_of(cfs_rq, struct rq, cfs);
118 } 118 }
119 119
120 static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) 120 static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
121 { 121 {
122 struct rq *rq = rq_of(cfs_rq); 122 struct rq *rq = rq_of(cfs_rq);
123 123
124 if (unlikely(rq->curr->sched_class != &fair_sched_class)) 124 if (unlikely(rq->curr->sched_class != &fair_sched_class))
125 return NULL; 125 return NULL;
126 126
127 return &rq->curr->se; 127 return &rq->curr->se;
128 } 128 }
129 129
130 #define entity_is_task(se) 1 130 #define entity_is_task(se) 1
131 131
132 static inline void 132 static inline void
133 set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { } 133 set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
134 134
135 #endif /* CONFIG_FAIR_GROUP_SCHED */ 135 #endif /* CONFIG_FAIR_GROUP_SCHED */
136 136
137 static inline struct task_struct *task_of(struct sched_entity *se) 137 static inline struct task_struct *task_of(struct sched_entity *se)
138 { 138 {
139 return container_of(se, struct task_struct, se); 139 return container_of(se, struct task_struct, se);
140 } 140 }
141 141
142 142
143 /************************************************************** 143 /**************************************************************
144 * Scheduling class tree data structure manipulation methods: 144 * Scheduling class tree data structure manipulation methods:
145 */ 145 */
146 146
147 /* 147 /*
148 * Enqueue an entity into the rb-tree: 148 * Enqueue an entity into the rb-tree:
149 */ 149 */
150 static inline void 150 static inline void
151 __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) 151 __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
152 { 152 {
153 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; 153 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
154 struct rb_node *parent = NULL; 154 struct rb_node *parent = NULL;
155 struct sched_entity *entry; 155 struct sched_entity *entry;
156 s64 key = se->fair_key; 156 s64 key = se->fair_key;
157 int leftmost = 1; 157 int leftmost = 1;
158 158
159 /* 159 /*
160 * Find the right place in the rbtree: 160 * Find the right place in the rbtree:
161 */ 161 */
162 while (*link) { 162 while (*link) {
163 parent = *link; 163 parent = *link;
164 entry = rb_entry(parent, struct sched_entity, run_node); 164 entry = rb_entry(parent, struct sched_entity, run_node);
165 /* 165 /*
166 * We dont care about collisions. Nodes with 166 * We dont care about collisions. Nodes with
167 * the same key stay together. 167 * the same key stay together.
168 */ 168 */
169 if (key - entry->fair_key < 0) { 169 if (key - entry->fair_key < 0) {
170 link = &parent->rb_left; 170 link = &parent->rb_left;
171 } else { 171 } else {
172 link = &parent->rb_right; 172 link = &parent->rb_right;
173 leftmost = 0; 173 leftmost = 0;
174 } 174 }
175 } 175 }
176 176
177 /* 177 /*
178 * Maintain a cache of leftmost tree entries (it is frequently 178 * Maintain a cache of leftmost tree entries (it is frequently
179 * used): 179 * used):
180 */ 180 */
181 if (leftmost) 181 if (leftmost)
182 cfs_rq->rb_leftmost = &se->run_node; 182 cfs_rq->rb_leftmost = &se->run_node;
183 183
184 rb_link_node(&se->run_node, parent, link); 184 rb_link_node(&se->run_node, parent, link);
185 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); 185 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
186 update_load_add(&cfs_rq->load, se->load.weight); 186 update_load_add(&cfs_rq->load, se->load.weight);
187 cfs_rq->nr_running++; 187 cfs_rq->nr_running++;
188 se->on_rq = 1; 188 se->on_rq = 1;
189 } 189 }
190 190
191 static inline void 191 static inline void
192 __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) 192 __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
193 { 193 {
194 if (cfs_rq->rb_leftmost == &se->run_node) 194 if (cfs_rq->rb_leftmost == &se->run_node)
195 cfs_rq->rb_leftmost = rb_next(&se->run_node); 195 cfs_rq->rb_leftmost = rb_next(&se->run_node);
196 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 196 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
197 update_load_sub(&cfs_rq->load, se->load.weight); 197 update_load_sub(&cfs_rq->load, se->load.weight);
198 cfs_rq->nr_running--; 198 cfs_rq->nr_running--;
199 se->on_rq = 0; 199 se->on_rq = 0;
200 } 200 }
201 201
202 static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) 202 static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
203 { 203 {
204 return cfs_rq->rb_leftmost; 204 return cfs_rq->rb_leftmost;
205 } 205 }
206 206
207 static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) 207 static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
208 { 208 {
209 return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); 209 return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
210 } 210 }
211 211
212 /************************************************************** 212 /**************************************************************
213 * Scheduling class statistics methods: 213 * Scheduling class statistics methods:
214 */ 214 */
215 215
216 /* 216 /*
217 * We rescale the rescheduling granularity of tasks according to their 217 * We rescale the rescheduling granularity of tasks according to their
218 * nice level, but only linearly, not exponentially: 218 * nice level, but only linearly, not exponentially:
219 */ 219 */
220 static long 220 static long
221 niced_granularity(struct sched_entity *curr, unsigned long granularity) 221 niced_granularity(struct sched_entity *curr, unsigned long granularity)
222 { 222 {
223 u64 tmp; 223 u64 tmp;
224 224
225 /* 225 /*
226 * Negative nice levels get the same granularity as nice-0: 226 * Negative nice levels get the same granularity as nice-0:
227 */ 227 */
228 if (likely(curr->load.weight >= NICE_0_LOAD)) 228 if (likely(curr->load.weight >= NICE_0_LOAD))
229 return granularity; 229 return granularity;
230 /* 230 /*
231 * Positive nice level tasks get linearly finer 231 * Positive nice level tasks get linearly finer
232 * granularity: 232 * granularity:
233 */ 233 */
234 tmp = curr->load.weight * (u64)granularity; 234 tmp = curr->load.weight * (u64)granularity;
235 235
236 /* 236 /*
237 * It will always fit into 'long': 237 * It will always fit into 'long':
238 */ 238 */
239 return (long) (tmp >> NICE_0_SHIFT); 239 return (long) (tmp >> NICE_0_SHIFT);
240 } 240 }
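
As a rough standalone sketch of the linear rescaling above (it assumes the usual NICE_0_SHIFT of 10, so NICE_0_LOAD is 1024; the weights and granularity values are illustrative only):

#include <stdio.h>

#define NICE_0_SHIFT	10
#define NICE_0_LOAD	(1UL << NICE_0_SHIFT)

/* same linear rescaling as niced_granularity(): entities below the
 * nice-0 weight get a proportionally finer granularity */
static long scaled_granularity(unsigned long weight, unsigned long granularity)
{
	unsigned long long tmp;

	if (weight >= NICE_0_LOAD)
		return granularity;
	tmp = (unsigned long long)weight * granularity;
	return (long)(tmp >> NICE_0_SHIFT);
}

int main(void)
{
	printf("%ld\n", scaled_granularity(1024, 2000000));	/* nice 0:      2000000 */
	printf("%ld\n", scaled_granularity(512, 2000000));	/* half weight: 1000000 */
	return 0;
}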
241 241
242 static inline void 242 static inline void
243 limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se) 243 limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se)
244 { 244 {
245 long limit = sysctl_sched_runtime_limit; 245 long limit = sysctl_sched_runtime_limit;
246 246
247 /* 247 /*
248 * Niced tasks have the same history dynamic range as 248 * Niced tasks have the same history dynamic range as
249 * non-niced tasks: 249 * non-niced tasks:
250 */ 250 */
251 if (unlikely(se->wait_runtime > limit)) { 251 if (unlikely(se->wait_runtime > limit)) {
252 se->wait_runtime = limit; 252 se->wait_runtime = limit;
253 schedstat_inc(se, wait_runtime_overruns); 253 schedstat_inc(se, wait_runtime_overruns);
254 schedstat_inc(cfs_rq, wait_runtime_overruns); 254 schedstat_inc(cfs_rq, wait_runtime_overruns);
255 } 255 }
256 if (unlikely(se->wait_runtime < -limit)) { 256 if (unlikely(se->wait_runtime < -limit)) {
257 se->wait_runtime = -limit; 257 se->wait_runtime = -limit;
258 schedstat_inc(se, wait_runtime_underruns); 258 schedstat_inc(se, wait_runtime_underruns);
259 schedstat_inc(cfs_rq, wait_runtime_underruns); 259 schedstat_inc(cfs_rq, wait_runtime_underruns);
260 } 260 }
261 } 261 }
262 262
263 static inline void 263 static inline void
264 __add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) 264 __add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
265 { 265 {
266 se->wait_runtime += delta; 266 se->wait_runtime += delta;
267 schedstat_add(se, sum_wait_runtime, delta); 267 schedstat_add(se, sum_wait_runtime, delta);
268 limit_wait_runtime(cfs_rq, se); 268 limit_wait_runtime(cfs_rq, se);
269 } 269 }
270 270
271 static void 271 static void
272 add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) 272 add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
273 { 273 {
274 schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); 274 schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime);
275 __add_wait_runtime(cfs_rq, se, delta); 275 __add_wait_runtime(cfs_rq, se, delta);
276 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); 276 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
277 } 277 }
278 278
279 /* 279 /*
280 * Update the current task's runtime statistics. Skip current tasks that 280 * Update the current task's runtime statistics. Skip current tasks that
281 * are not in our scheduling class. 281 * are not in our scheduling class.
282 */ 282 */
283 static inline void 283 static inline void
284 __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now) 284 __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now)
285 { 285 {
286 unsigned long delta, delta_exec, delta_fair; 286 unsigned long delta, delta_exec, delta_fair;
287 long delta_mine; 287 long delta_mine;
288 struct load_weight *lw = &cfs_rq->load; 288 struct load_weight *lw = &cfs_rq->load;
289 unsigned long load = lw->weight; 289 unsigned long load = lw->weight;
290 290
291 if (unlikely(!load)) 291 if (unlikely(!load))
292 return; 292 return;
293 293
294 delta_exec = curr->delta_exec; 294 delta_exec = curr->delta_exec;
295 schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); 295 schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
296 296
297 curr->sum_exec_runtime += delta_exec; 297 curr->sum_exec_runtime += delta_exec;
298 cfs_rq->exec_clock += delta_exec; 298 cfs_rq->exec_clock += delta_exec;
299 299
300 delta_fair = calc_delta_fair(delta_exec, lw); 300 delta_fair = calc_delta_fair(delta_exec, lw);
301 delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); 301 delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw);
302 302
303 if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) { 303 if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) {
304 delta = calc_delta_mine(cfs_rq->sleeper_bonus, 304 delta = calc_delta_mine(cfs_rq->sleeper_bonus,
305 curr->load.weight, lw); 305 curr->load.weight, lw);
306 if (unlikely(delta > cfs_rq->sleeper_bonus)) 306 if (unlikely(delta > cfs_rq->sleeper_bonus))
307 delta = cfs_rq->sleeper_bonus; 307 delta = cfs_rq->sleeper_bonus;
308 308
309 cfs_rq->sleeper_bonus -= delta; 309 cfs_rq->sleeper_bonus -= delta;
310 delta_mine -= delta; 310 delta_mine -= delta;
311 } 311 }
312 312
313 cfs_rq->fair_clock += delta_fair; 313 cfs_rq->fair_clock += delta_fair;
314 /* 314 /*
315 * We executed delta_exec amount of time on the CPU, 315 * We executed delta_exec amount of time on the CPU,
316 * but we were only entitled to delta_mine amount of 316 * but we were only entitled to delta_mine amount of
317 * time during that period (if nr_running == 1 then 317 * time during that period (if nr_running == 1 then
318 * the two values are equal) 318 * the two values are equal)
319 * [Note: delta_mine - delta_exec is negative]: 319 * [Note: delta_mine - delta_exec is negative]:
320 */ 320 */
321 add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); 321 add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec);
322 } 322 }
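
A toy standalone illustration of the entitlement accounting described in the comment above; it uses a plain weight/total-weight ratio in place of the kernel's calc_delta_mine(), and every number is made up:

#include <stdio.h>

int main(void)
{
	unsigned long delta_exec = 4000000;	/* ns this entity actually ran */
	unsigned long weight = 1024;		/* this entity's load weight */
	unsigned long rq_weight = 2048;		/* total weight of queued entities */

	/* share of the elapsed time this entity was "entitled" to */
	unsigned long delta_mine =
		(unsigned long long)delta_exec * weight / rq_weight;

	/* with two equal-weight entities the entitlement is half the runtime,
	 * so wait_runtime is charged a negative delta of -2000000 */
	printf("%ld\n", (long)(delta_mine - delta_exec));
	return 0;
}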
323 323
324 static void update_curr(struct cfs_rq *cfs_rq, u64 now) 324 static void update_curr(struct cfs_rq *cfs_rq, u64 now)
325 { 325 {
326 struct sched_entity *curr = cfs_rq_curr(cfs_rq); 326 struct sched_entity *curr = cfs_rq_curr(cfs_rq);
327 unsigned long delta_exec; 327 unsigned long delta_exec;
328 328
329 if (unlikely(!curr)) 329 if (unlikely(!curr))
330 return; 330 return;
331 331
332 /* 332 /*
333 * Get the amount of time the current task was running 333 * Get the amount of time the current task was running
334 * since the last time we changed load (this cannot 334 * since the last time we changed load (this cannot
335 * overflow on 32 bits): 335 * overflow on 32 bits):
336 */ 336 */
337 delta_exec = (unsigned long)(now - curr->exec_start); 337 delta_exec = (unsigned long)(now - curr->exec_start);
338 338
339 curr->delta_exec += delta_exec; 339 curr->delta_exec += delta_exec;
340 340
341 if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) { 341 if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) {
342 __update_curr(cfs_rq, curr, now); 342 __update_curr(cfs_rq, curr, now);
343 curr->delta_exec = 0; 343 curr->delta_exec = 0;
344 } 344 }
345 curr->exec_start = now; 345 curr->exec_start = now;
346 } 346 }
347 347
348 static inline void 348 static inline void
349 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 349 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
350 { 350 {
351 se->wait_start_fair = cfs_rq->fair_clock; 351 se->wait_start_fair = cfs_rq->fair_clock;
352 schedstat_set(se->wait_start, now); 352 schedstat_set(se->wait_start, now);
353 } 353 }
354 354
355 /* 355 /*
356 * We calculate fair deltas here, so protect against the random effects 356 * We calculate fair deltas here, so protect against the random effects
357 * of a multiplication overflow by capping it to the runtime limit: 357 * of a multiplication overflow by capping it to the runtime limit:
358 */ 358 */
359 #if BITS_PER_LONG == 32 359 #if BITS_PER_LONG == 32
360 static inline unsigned long 360 static inline unsigned long
361 calc_weighted(unsigned long delta, unsigned long weight, int shift) 361 calc_weighted(unsigned long delta, unsigned long weight, int shift)
362 { 362 {
363 u64 tmp = (u64)delta * weight >> shift; 363 u64 tmp = (u64)delta * weight >> shift;
364 364
365 if (unlikely(tmp > sysctl_sched_runtime_limit*2)) 365 if (unlikely(tmp > sysctl_sched_runtime_limit*2))
366 return sysctl_sched_runtime_limit*2; 366 return sysctl_sched_runtime_limit*2;
367 return tmp; 367 return tmp;
368 } 368 }
369 #else 369 #else
370 static inline unsigned long 370 static inline unsigned long
371 calc_weighted(unsigned long delta, unsigned long weight, int shift) 371 calc_weighted(unsigned long delta, unsigned long weight, int shift)
372 { 372 {
373 return delta * weight >> shift; 373 return delta * weight >> shift;
374 } 374 }
375 #endif 375 #endif
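
For intuition, a standalone sketch of the 32-bit capping above; RUNTIME_LIMIT stands in for sysctl_sched_runtime_limit and the inputs are deliberately extreme:

#include <stdio.h>

#define RUNTIME_LIMIT	100000000UL	/* stand-in for sysctl_sched_runtime_limit */

/* mirrors the BITS_PER_LONG == 32 calc_weighted(): the 64-bit product is
 * capped before being truncated back to unsigned long */
static unsigned long calc_weighted_capped(unsigned long delta,
					  unsigned long weight, int shift)
{
	unsigned long long tmp = (unsigned long long)delta * weight >> shift;

	if (tmp > 2ULL * RUNTIME_LIMIT)
		return 2UL * RUNTIME_LIMIT;
	return (unsigned long)tmp;
}

int main(void)
{
	/* an extreme delta*weight would not fit in 32 bits; the cap bounds it */
	printf("%lu\n", calc_weighted_capped(4000000000UL, 88761, 10));
	return 0;
}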
376 376
377 /* 377 /*
378 * Task is being enqueued - update stats: 378 * Task is being enqueued - update stats:
379 */ 379 */
380 static void 380 static void
381 update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 381 update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
382 { 382 {
383 s64 key; 383 s64 key;
384 384
385 /* 385 /*
386 * Are we enqueueing a waiting task? (for current tasks 386 * Are we enqueueing a waiting task? (for current tasks
387 * a dequeue/enqueue event is a NOP) 387 * a dequeue/enqueue event is a NOP)
388 */ 388 */
389 if (se != cfs_rq_curr(cfs_rq)) 389 if (se != cfs_rq_curr(cfs_rq))
390 update_stats_wait_start(cfs_rq, se, now); 390 update_stats_wait_start(cfs_rq, se, now);
391 /* 391 /*
392 * Update the key: 392 * Update the key:
393 */ 393 */
394 key = cfs_rq->fair_clock; 394 key = cfs_rq->fair_clock;
395 395
396 /* 396 /*
397 * Optimize the common nice 0 case: 397 * Optimize the common nice 0 case:
398 */ 398 */
399 if (likely(se->load.weight == NICE_0_LOAD)) { 399 if (likely(se->load.weight == NICE_0_LOAD)) {
400 key -= se->wait_runtime; 400 key -= se->wait_runtime;
401 } else { 401 } else {
402 u64 tmp; 402 u64 tmp;
403 403
404 if (se->wait_runtime < 0) { 404 if (se->wait_runtime < 0) {
405 tmp = -se->wait_runtime; 405 tmp = -se->wait_runtime;
406 key += (tmp * se->load.inv_weight) >> 406 key += (tmp * se->load.inv_weight) >>
407 (WMULT_SHIFT - NICE_0_SHIFT); 407 (WMULT_SHIFT - NICE_0_SHIFT);
408 } else { 408 } else {
409 tmp = se->wait_runtime; 409 tmp = se->wait_runtime;
410 key -= (tmp * se->load.weight) >> NICE_0_SHIFT; 410 key -= (tmp * se->load.weight) >> NICE_0_SHIFT;
411 } 411 }
412 } 412 }
413 413
414 se->fair_key = key; 414 se->fair_key = key;
415 } 415 }
416 416
417 /* 417 /*
418 * Note: must be called with a freshly updated rq->fair_clock. 418 * Note: must be called with a freshly updated rq->fair_clock.
419 */ 419 */
420 static inline void 420 static inline void
421 __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 421 __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
422 { 422 {
423 unsigned long delta_fair = se->delta_fair_run; 423 unsigned long delta_fair = se->delta_fair_run;
424 424
425 schedstat_set(se->wait_max, max(se->wait_max, now - se->wait_start)); 425 schedstat_set(se->wait_max, max(se->wait_max, now - se->wait_start));
426 426
427 if (unlikely(se->load.weight != NICE_0_LOAD)) 427 if (unlikely(se->load.weight != NICE_0_LOAD))
428 delta_fair = calc_weighted(delta_fair, se->load.weight, 428 delta_fair = calc_weighted(delta_fair, se->load.weight,
429 NICE_0_SHIFT); 429 NICE_0_SHIFT);
430 430
431 add_wait_runtime(cfs_rq, se, delta_fair); 431 add_wait_runtime(cfs_rq, se, delta_fair);
432 } 432 }
433 433
434 static void 434 static void
435 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 435 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
436 { 436 {
437 unsigned long delta_fair; 437 unsigned long delta_fair;
438 438
439 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), 439 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
440 (u64)(cfs_rq->fair_clock - se->wait_start_fair)); 440 (u64)(cfs_rq->fair_clock - se->wait_start_fair));
441 441
442 se->delta_fair_run += delta_fair; 442 se->delta_fair_run += delta_fair;
443 if (unlikely(abs(se->delta_fair_run) >= 443 if (unlikely(abs(se->delta_fair_run) >=
444 sysctl_sched_stat_granularity)) { 444 sysctl_sched_stat_granularity)) {
445 __update_stats_wait_end(cfs_rq, se, now); 445 __update_stats_wait_end(cfs_rq, se, now);
446 se->delta_fair_run = 0; 446 se->delta_fair_run = 0;
447 } 447 }
448 448
449 se->wait_start_fair = 0; 449 se->wait_start_fair = 0;
450 schedstat_set(se->wait_start, 0); 450 schedstat_set(se->wait_start, 0);
451 } 451 }
452 452
453 static inline void 453 static inline void
454 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 454 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
455 { 455 {
456 update_curr(cfs_rq, now); 456 update_curr(cfs_rq, now);
457 /* 457 /*
458 * Mark the end of the wait period if dequeueing a 458 * Mark the end of the wait period if dequeueing a
459 * waiting task: 459 * waiting task:
460 */ 460 */
461 if (se != cfs_rq_curr(cfs_rq)) 461 if (se != cfs_rq_curr(cfs_rq))
462 update_stats_wait_end(cfs_rq, se, now); 462 update_stats_wait_end(cfs_rq, se, now);
463 } 463 }
464 464
465 /* 465 /*
466 * We are picking a new current task - update its stats: 466 * We are picking a new current task - update its stats:
467 */ 467 */
468 static inline void 468 static inline void
469 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 469 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
470 { 470 {
471 /* 471 /*
472 * We are starting a new run period: 472 * We are starting a new run period:
473 */ 473 */
474 se->exec_start = now; 474 se->exec_start = now;
475 } 475 }
476 476
477 /* 477 /*
478 * We are descheduling a task - update its stats: 478 * We are descheduling a task - update its stats:
479 */ 479 */
480 static inline void 480 static inline void
481 update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 481 update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
482 { 482 {
483 se->exec_start = 0; 483 se->exec_start = 0;
484 } 484 }
485 485
486 /************************************************** 486 /**************************************************
487 * Scheduling class queueing methods: 487 * Scheduling class queueing methods:
488 */ 488 */
489 489
490 static void 490 static void
491 __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 491 __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
492 { 492 {
493 unsigned long load = cfs_rq->load.weight, delta_fair; 493 unsigned long load = cfs_rq->load.weight, delta_fair;
494 long prev_runtime; 494 long prev_runtime;
495 495
496 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) 496 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG)
497 load = rq_of(cfs_rq)->cpu_load[2]; 497 load = rq_of(cfs_rq)->cpu_load[2];
498 498
499 delta_fair = se->delta_fair_sleep; 499 delta_fair = se->delta_fair_sleep;
500 500
501 /* 501 /*
502 * Fix up delta_fair with the effect of us running 502 * Fix up delta_fair with the effect of us running
503 * during the whole sleep period: 503 * during the whole sleep period:
504 */ 504 */
505 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG) 505 if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG)
506 delta_fair = div64_likely32((u64)delta_fair * load, 506 delta_fair = div64_likely32((u64)delta_fair * load,
507 load + se->load.weight); 507 load + se->load.weight);
508 508
509 if (unlikely(se->load.weight != NICE_0_LOAD)) 509 if (unlikely(se->load.weight != NICE_0_LOAD))
510 delta_fair = calc_weighted(delta_fair, se->load.weight, 510 delta_fair = calc_weighted(delta_fair, se->load.weight,
511 NICE_0_SHIFT); 511 NICE_0_SHIFT);
512 512
513 prev_runtime = se->wait_runtime; 513 prev_runtime = se->wait_runtime;
514 __add_wait_runtime(cfs_rq, se, delta_fair); 514 __add_wait_runtime(cfs_rq, se, delta_fair);
515 delta_fair = se->wait_runtime - prev_runtime; 515 delta_fair = se->wait_runtime - prev_runtime;
516 516
517 /* 517 /*
518 * Track the amount of bonus we've given to sleepers: 518 * Track the amount of bonus we've given to sleepers:
519 */ 519 */
520 cfs_rq->sleeper_bonus += delta_fair; 520 cfs_rq->sleeper_bonus += delta_fair;
521 521
522 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); 522 schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
523 } 523 }
524 524
525 static void 525 static void
526 enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 526 enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
527 { 527 {
528 struct task_struct *tsk = task_of(se); 528 struct task_struct *tsk = task_of(se);
529 unsigned long delta_fair; 529 unsigned long delta_fair;
530 530
531 if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) || 531 if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) ||
532 !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS)) 532 !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS))
533 return; 533 return;
534 534
535 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), 535 delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
536 (u64)(cfs_rq->fair_clock - se->sleep_start_fair)); 536 (u64)(cfs_rq->fair_clock - se->sleep_start_fair));
537 537
538 se->delta_fair_sleep += delta_fair; 538 se->delta_fair_sleep += delta_fair;
539 if (unlikely(abs(se->delta_fair_sleep) >= 539 if (unlikely(abs(se->delta_fair_sleep) >=
540 sysctl_sched_stat_granularity)) { 540 sysctl_sched_stat_granularity)) {
541 __enqueue_sleeper(cfs_rq, se, now); 541 __enqueue_sleeper(cfs_rq, se, now);
542 se->delta_fair_sleep = 0; 542 se->delta_fair_sleep = 0;
543 } 543 }
544 544
545 se->sleep_start_fair = 0; 545 se->sleep_start_fair = 0;
546 546
547 #ifdef CONFIG_SCHEDSTATS 547 #ifdef CONFIG_SCHEDSTATS
548 if (se->sleep_start) { 548 if (se->sleep_start) {
549 u64 delta = now - se->sleep_start; 549 u64 delta = now - se->sleep_start;
550 550
551 if ((s64)delta < 0) 551 if ((s64)delta < 0)
552 delta = 0; 552 delta = 0;
553 553
554 if (unlikely(delta > se->sleep_max)) 554 if (unlikely(delta > se->sleep_max))
555 se->sleep_max = delta; 555 se->sleep_max = delta;
556 556
557 se->sleep_start = 0; 557 se->sleep_start = 0;
558 se->sum_sleep_runtime += delta; 558 se->sum_sleep_runtime += delta;
559 } 559 }
560 if (se->block_start) { 560 if (se->block_start) {
561 u64 delta = now - se->block_start; 561 u64 delta = now - se->block_start;
562 562
563 if ((s64)delta < 0) 563 if ((s64)delta < 0)
564 delta = 0; 564 delta = 0;
565 565
566 if (unlikely(delta > se->block_max)) 566 if (unlikely(delta > se->block_max))
567 se->block_max = delta; 567 se->block_max = delta;
568 568
569 se->block_start = 0; 569 se->block_start = 0;
570 se->sum_sleep_runtime += delta; 570 se->sum_sleep_runtime += delta;
571 } 571 }
572 #endif 572 #endif
573 } 573 }
574 574
575 static void 575 static void
576 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 576 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
577 int wakeup, u64 now) 577 int wakeup, u64 now)
578 { 578 {
579 /* 579 /*
580 * Update the fair clock. 580 * Update the fair clock.
581 */ 581 */
582 update_curr(cfs_rq, now); 582 update_curr(cfs_rq, now);
583 583
584 if (wakeup) 584 if (wakeup)
585 enqueue_sleeper(cfs_rq, se, now); 585 enqueue_sleeper(cfs_rq, se, now);
586 586
587 update_stats_enqueue(cfs_rq, se, now); 587 update_stats_enqueue(cfs_rq, se, now);
588 __enqueue_entity(cfs_rq, se); 588 __enqueue_entity(cfs_rq, se);
589 } 589 }
590 590
591 static void 591 static void
592 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 592 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
593 int sleep, u64 now) 593 int sleep, u64 now)
594 { 594 {
595 update_stats_dequeue(cfs_rq, se, now); 595 update_stats_dequeue(cfs_rq, se, now);
596 if (sleep) { 596 if (sleep) {
597 se->sleep_start_fair = cfs_rq->fair_clock; 597 se->sleep_start_fair = cfs_rq->fair_clock;
598 #ifdef CONFIG_SCHEDSTATS 598 #ifdef CONFIG_SCHEDSTATS
599 if (entity_is_task(se)) { 599 if (entity_is_task(se)) {
600 struct task_struct *tsk = task_of(se); 600 struct task_struct *tsk = task_of(se);
601 601
602 if (tsk->state & TASK_INTERRUPTIBLE) 602 if (tsk->state & TASK_INTERRUPTIBLE)
603 se->sleep_start = now; 603 se->sleep_start = now;
604 if (tsk->state & TASK_UNINTERRUPTIBLE) 604 if (tsk->state & TASK_UNINTERRUPTIBLE)
605 se->block_start = now; 605 se->block_start = now;
606 } 606 }
607 cfs_rq->wait_runtime -= se->wait_runtime; 607 cfs_rq->wait_runtime -= se->wait_runtime;
608 #endif 608 #endif
609 } 609 }
610 __dequeue_entity(cfs_rq, se); 610 __dequeue_entity(cfs_rq, se);
611 } 611 }
612 612
613 /* 613 /*
614 * Preempt the current task with a newly woken task if needed: 614 * Preempt the current task with a newly woken task if needed:
615 */ 615 */
616 static void 616 static void
617 __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, 617 __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se,
618 struct sched_entity *curr, unsigned long granularity) 618 struct sched_entity *curr, unsigned long granularity)
619 { 619 {
620 s64 __delta = curr->fair_key - se->fair_key; 620 s64 __delta = curr->fair_key - se->fair_key;
621 621
622 /* 622 /*
623 * Take scheduling granularity into account - do not 623 * Take scheduling granularity into account - do not
624 * preempt the current task unless the best task has 624 * preempt the current task unless the best task has
625 * a larger than sched_granularity fairness advantage: 625 * a larger than sched_granularity fairness advantage:
626 */ 626 */
627 if (__delta > niced_granularity(curr, granularity)) 627 if (__delta > niced_granularity(curr, granularity))
628 resched_task(rq_of(cfs_rq)->curr); 628 resched_task(rq_of(cfs_rq)->curr);
629 } 629 }
630 630
631 static inline void 631 static inline void
632 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) 632 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
633 { 633 {
634 /* 634 /*
635 * Any task has to be enqueued before it get to execute on 635 * Any task has to be enqueued before it get to execute on
636 * a CPU. So account for the time it spent waiting on the 636 * a CPU. So account for the time it spent waiting on the
637 * runqueue. (note, here we rely on pick_next_task() having 637 * runqueue. (note, here we rely on pick_next_task() having
638 * done a put_prev_task_fair() shortly before this, which 638 * done a put_prev_task_fair() shortly before this, which
639 * updated rq->fair_clock - used by update_stats_wait_end()) 639 * updated rq->fair_clock - used by update_stats_wait_end())
640 */ 640 */
641 update_stats_wait_end(cfs_rq, se, now); 641 update_stats_wait_end(cfs_rq, se, now);
642 update_stats_curr_start(cfs_rq, se, now); 642 update_stats_curr_start(cfs_rq, se, now);
643 set_cfs_rq_curr(cfs_rq, se); 643 set_cfs_rq_curr(cfs_rq, se);
644 } 644 }
645 645
646 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq, u64 now) 646 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq, u64 now)
647 { 647 {
648 struct sched_entity *se = __pick_next_entity(cfs_rq); 648 struct sched_entity *se = __pick_next_entity(cfs_rq);
649 649
650 set_next_entity(cfs_rq, se, now); 650 set_next_entity(cfs_rq, se, now);
651 651
652 return se; 652 return se;
653 } 653 }
654 654
655 static void 655 static void
656 put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev, u64 now) 656 put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev, u64 now)
657 { 657 {
658 /* 658 /*
659 * If still on the runqueue then deactivate_task() 659 * If still on the runqueue then deactivate_task()
660 * was not called and update_curr() has to be done: 660 * was not called and update_curr() has to be done:
661 */ 661 */
662 if (prev->on_rq) 662 if (prev->on_rq)
663 update_curr(cfs_rq, now); 663 update_curr(cfs_rq, now);
664 664
665 update_stats_curr_end(cfs_rq, prev, now); 665 update_stats_curr_end(cfs_rq, prev, now);
666 666
667 if (prev->on_rq) 667 if (prev->on_rq)
668 update_stats_wait_start(cfs_rq, prev, now); 668 update_stats_wait_start(cfs_rq, prev, now);
669 set_cfs_rq_curr(cfs_rq, NULL); 669 set_cfs_rq_curr(cfs_rq, NULL);
670 } 670 }
671 671
672 static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) 672 static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
673 { 673 {
674 struct rq *rq = rq_of(cfs_rq); 674 struct rq *rq = rq_of(cfs_rq);
675 struct sched_entity *next; 675 struct sched_entity *next;
676 u64 now = __rq_clock(rq); 676 u64 now = __rq_clock(rq);
677 677
678 /* 678 /*
679 * Dequeue and enqueue the task to update its 679 * Dequeue and enqueue the task to update its
680 * position within the tree: 680 * position within the tree:
681 */ 681 */
682 dequeue_entity(cfs_rq, curr, 0, now); 682 dequeue_entity(cfs_rq, curr, 0, now);
683 enqueue_entity(cfs_rq, curr, 0, now); 683 enqueue_entity(cfs_rq, curr, 0, now);
684 684
685 /* 685 /*
686 * Reschedule if another task tops the current one. 686 * Reschedule if another task tops the current one.
687 */ 687 */
688 next = __pick_next_entity(cfs_rq); 688 next = __pick_next_entity(cfs_rq);
689 if (next == curr) 689 if (next == curr)
690 return; 690 return;
691 691
692 __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity); 692 __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity);
693 } 693 }
694 694
695 /************************************************** 695 /**************************************************
696 * CFS operations on tasks: 696 * CFS operations on tasks:
697 */ 697 */
698 698
699 #ifdef CONFIG_FAIR_GROUP_SCHED 699 #ifdef CONFIG_FAIR_GROUP_SCHED
700 700
701 /* Walk up scheduling entities hierarchy */ 701 /* Walk up scheduling entities hierarchy */
702 #define for_each_sched_entity(se) \ 702 #define for_each_sched_entity(se) \
703 for (; se; se = se->parent) 703 for (; se; se = se->parent)
704 704
705 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) 705 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
706 { 706 {
707 return p->se.cfs_rq; 707 return p->se.cfs_rq;
708 } 708 }
709 709
710 /* runqueue on which this entity is (to be) queued */ 710 /* runqueue on which this entity is (to be) queued */
711 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) 711 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
712 { 712 {
713 return se->cfs_rq; 713 return se->cfs_rq;
714 } 714 }
715 715
716 /* runqueue "owned" by this group */ 716 /* runqueue "owned" by this group */
717 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) 717 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
718 { 718 {
719 return grp->my_q; 719 return grp->my_q;
720 } 720 }
721 721
722 /* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on 722 /* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
723 * another cpu ('this_cpu') 723 * another cpu ('this_cpu')
724 */ 724 */
725 static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) 725 static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
726 { 726 {
727 /* A later patch will take group into account */ 727 /* A later patch will take group into account */
728 return &cpu_rq(this_cpu)->cfs; 728 return &cpu_rq(this_cpu)->cfs;
729 } 729 }
730 730
731 /* Iterate thr' all leaf cfs_rq's on a runqueue */ 731 /* Iterate thr' all leaf cfs_rq's on a runqueue */
732 #define for_each_leaf_cfs_rq(rq, cfs_rq) \ 732 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
733 list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 733 list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
734 734
735 /* Do the two (enqueued) tasks belong to the same group ? */ 735 /* Do the two (enqueued) tasks belong to the same group ? */
736 static inline int is_same_group(struct task_struct *curr, struct task_struct *p) 736 static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
737 { 737 {
738 if (curr->se.cfs_rq == p->se.cfs_rq) 738 if (curr->se.cfs_rq == p->se.cfs_rq)
739 return 1; 739 return 1;
740 740
741 return 0; 741 return 0;
742 } 742 }
743 743
744 #else /* CONFIG_FAIR_GROUP_SCHED */ 744 #else /* CONFIG_FAIR_GROUP_SCHED */
745 745
746 #define for_each_sched_entity(se) \ 746 #define for_each_sched_entity(se) \
747 for (; se; se = NULL) 747 for (; se; se = NULL)
748 748
749 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) 749 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
750 { 750 {
751 return &task_rq(p)->cfs; 751 return &task_rq(p)->cfs;
752 } 752 }
753 753
754 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) 754 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
755 { 755 {
756 struct task_struct *p = task_of(se); 756 struct task_struct *p = task_of(se);
757 struct rq *rq = task_rq(p); 757 struct rq *rq = task_rq(p);
758 758
759 return &rq->cfs; 759 return &rq->cfs;
760 } 760 }
761 761
762 /* runqueue "owned" by this group */ 762 /* runqueue "owned" by this group */
763 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) 763 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
764 { 764 {
765 return NULL; 765 return NULL;
766 } 766 }
767 767
768 static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) 768 static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
769 { 769 {
770 return &cpu_rq(this_cpu)->cfs; 770 return &cpu_rq(this_cpu)->cfs;
771 } 771 }
772 772
773 #define for_each_leaf_cfs_rq(rq, cfs_rq) \ 773 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
774 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 774 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
775 775
776 static inline int is_same_group(struct task_struct *curr, struct task_struct *p) 776 static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
777 { 777 {
778 return 1; 778 return 1;
779 } 779 }
780 780
781 #endif /* CONFIG_FAIR_GROUP_SCHED */ 781 #endif /* CONFIG_FAIR_GROUP_SCHED */
782 782
783 /* 783 /*
784 * The enqueue_task method is called before nr_running is 784 * The enqueue_task method is called before nr_running is
785 * increased. Here we update the fair scheduling stats and 785 * increased. Here we update the fair scheduling stats and
786 * then put the task into the rbtree: 786 * then put the task into the rbtree:
787 */ 787 */
788 static void 788 static void
789 enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now) 789 enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
790 { 790 {
791 struct cfs_rq *cfs_rq; 791 struct cfs_rq *cfs_rq;
792 struct sched_entity *se = &p->se; 792 struct sched_entity *se = &p->se;
793 793
794 for_each_sched_entity(se) { 794 for_each_sched_entity(se) {
795 if (se->on_rq) 795 if (se->on_rq)
796 break; 796 break;
797 cfs_rq = cfs_rq_of(se); 797 cfs_rq = cfs_rq_of(se);
798 enqueue_entity(cfs_rq, se, wakeup, now); 798 enqueue_entity(cfs_rq, se, wakeup, now);
799 } 799 }
800 } 800 }
801 801
802 /* 802 /*
803 * The dequeue_task method is called before nr_running is 803 * The dequeue_task method is called before nr_running is
804 * decreased. We remove the task from the rbtree and 804 * decreased. We remove the task from the rbtree and
805 * update the fair scheduling stats: 805 * update the fair scheduling stats:
806 */ 806 */
807 static void 807 static void
808 dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now) 808 dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now)
809 { 809 {
810 struct cfs_rq *cfs_rq; 810 struct cfs_rq *cfs_rq;
811 struct sched_entity *se = &p->se; 811 struct sched_entity *se = &p->se;
812 812
813 for_each_sched_entity(se) { 813 for_each_sched_entity(se) {
814 cfs_rq = cfs_rq_of(se); 814 cfs_rq = cfs_rq_of(se);
815 dequeue_entity(cfs_rq, se, sleep, now); 815 dequeue_entity(cfs_rq, se, sleep, now);
816 /* Don't dequeue parent if it has other entities besides us */ 816 /* Don't dequeue parent if it has other entities besides us */
817 if (cfs_rq->load.weight) 817 if (cfs_rq->load.weight)
818 break; 818 break;
819 } 819 }
820 } 820 }
821 821
822 /* 822 /*
823 * sched_yield() support is very simple - we dequeue and enqueue 823 * sched_yield() support is very simple - we dequeue and enqueue
824 */ 824 */
825 static void yield_task_fair(struct rq *rq, struct task_struct *p) 825 static void yield_task_fair(struct rq *rq, struct task_struct *p)
826 { 826 {
827 struct cfs_rq *cfs_rq = task_cfs_rq(p); 827 struct cfs_rq *cfs_rq = task_cfs_rq(p);
828 u64 now = __rq_clock(rq); 828 u64 now = __rq_clock(rq);
829 829
830 /* 830 /*
831 * Dequeue and enqueue the task to update its 831 * Dequeue and enqueue the task to update its
832 * position within the tree: 832 * position within the tree:
833 */ 833 */
834 dequeue_entity(cfs_rq, &p->se, 0, now); 834 dequeue_entity(cfs_rq, &p->se, 0, now);
835 enqueue_entity(cfs_rq, &p->se, 0, now); 835 enqueue_entity(cfs_rq, &p->se, 0, now);
836 } 836 }
837 837
838 /* 838 /*
839 * Preempt the current task with a newly woken task if needed: 839 * Preempt the current task with a newly woken task if needed:
840 */ 840 */
841 static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) 841 static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p)
842 { 842 {
843 struct task_struct *curr = rq->curr; 843 struct task_struct *curr = rq->curr;
844 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 844 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
845 unsigned long gran; 845 unsigned long gran;
846 846
847 if (unlikely(rt_prio(p->prio))) { 847 if (unlikely(rt_prio(p->prio))) {
848 update_curr(cfs_rq, rq_clock(rq)); 848 update_curr(cfs_rq, rq_clock(rq));
849 resched_task(curr); 849 resched_task(curr);
850 return; 850 return;
851 } 851 }
852 852
853 gran = sysctl_sched_wakeup_granularity; 853 gran = sysctl_sched_wakeup_granularity;
854 /* 854 /*
855 * Batch tasks prefer throughput over latency: 855 * Batch tasks prefer throughput over latency:
856 */ 856 */
857 if (unlikely(p->policy == SCHED_BATCH)) 857 if (unlikely(p->policy == SCHED_BATCH))
858 gran = sysctl_sched_batch_wakeup_granularity; 858 gran = sysctl_sched_batch_wakeup_granularity;
859 859
860 if (is_same_group(curr, p)) 860 if (is_same_group(curr, p))
861 __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); 861 __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran);
862 } 862 }
863 863
864 static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now) 864 static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now)
865 { 865 {
866 struct cfs_rq *cfs_rq = &rq->cfs; 866 struct cfs_rq *cfs_rq = &rq->cfs;
867 struct sched_entity *se; 867 struct sched_entity *se;
868 868
869 if (unlikely(!cfs_rq->nr_running)) 869 if (unlikely(!cfs_rq->nr_running))
870 return NULL; 870 return NULL;
871 871
872 do { 872 do {
873 se = pick_next_entity(cfs_rq, now); 873 se = pick_next_entity(cfs_rq, now);
874 cfs_rq = group_cfs_rq(se); 874 cfs_rq = group_cfs_rq(se);
875 } while (cfs_rq); 875 } while (cfs_rq);
876 876
877 return task_of(se); 877 return task_of(se);
878 } 878 }
879 879
880 /* 880 /*
881 * Account for a descheduled task: 881 * Account for a descheduled task:
882 */ 882 */
883 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now) 883 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now)
884 { 884 {
885 struct sched_entity *se = &prev->se; 885 struct sched_entity *se = &prev->se;
886 struct cfs_rq *cfs_rq; 886 struct cfs_rq *cfs_rq;
887 887
888 for_each_sched_entity(se) { 888 for_each_sched_entity(se) {
889 cfs_rq = cfs_rq_of(se); 889 cfs_rq = cfs_rq_of(se);
890 put_prev_entity(cfs_rq, se, now); 890 put_prev_entity(cfs_rq, se, now);
891 } 891 }
892 } 892 }
893 893
894 /************************************************** 894 /**************************************************
895 * Fair scheduling class load-balancing methods: 895 * Fair scheduling class load-balancing methods:
896 */ 896 */
897 897
898 /* 898 /*
899 * Load-balancing iterator. Note: while the runqueue stays locked 899 * Load-balancing iterator. Note: while the runqueue stays locked
900 * during the whole iteration, the current task might be 900 * during the whole iteration, the current task might be
901 * dequeued so the iterator has to be dequeue-safe. Here we 901 * dequeued so the iterator has to be dequeue-safe. Here we
902 * achieve that by always pre-iterating before returning 902 * achieve that by always pre-iterating before returning
903 * the current task: 903 * the current task:
904 */ 904 */
905 static inline struct task_struct * 905 static inline struct task_struct *
906 __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) 906 __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
907 { 907 {
908 struct task_struct *p; 908 struct task_struct *p;
909 909
910 if (!curr) 910 if (!curr)
911 return NULL; 911 return NULL;
912 912
913 p = rb_entry(curr, struct task_struct, se.run_node); 913 p = rb_entry(curr, struct task_struct, se.run_node);
914 cfs_rq->rb_load_balance_curr = rb_next(curr); 914 cfs_rq->rb_load_balance_curr = rb_next(curr);
915 915
916 return p; 916 return p;
917 } 917 }
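
The dequeue-safe pre-iteration trick above can be shown with a trivial standalone list walker (none of this is kernel code; it only demonstrates why removing the just-returned element cannot break the iteration):

#include <stdio.h>
#include <stddef.h>

struct node { int val; struct node *next; };

static struct node *cursor;

/* advance the cursor first, then hand out the previous element, so the
 * caller may "dequeue" what it got without invalidating the iterator */
static struct node *iter_next(void)
{
	struct node *p = cursor;

	if (p)
		cursor = p->next;
	return p;
}

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct node *p;

	cursor = &a;
	for (p = iter_next(); p; p = iter_next()) {
		p->next = NULL;		/* remove the element we were handed */
		printf("%d\n", p->val);	/* still prints 1, 2, 3 */
	}
	return 0;
}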
918 918
919 static struct task_struct *load_balance_start_fair(void *arg) 919 static struct task_struct *load_balance_start_fair(void *arg)
920 { 920 {
921 struct cfs_rq *cfs_rq = arg; 921 struct cfs_rq *cfs_rq = arg;
922 922
923 return __load_balance_iterator(cfs_rq, first_fair(cfs_rq)); 923 return __load_balance_iterator(cfs_rq, first_fair(cfs_rq));
924 } 924 }
925 925
926 static struct task_struct *load_balance_next_fair(void *arg) 926 static struct task_struct *load_balance_next_fair(void *arg)
927 { 927 {
928 struct cfs_rq *cfs_rq = arg; 928 struct cfs_rq *cfs_rq = arg;
929 929
930 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); 930 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
931 } 931 }
932 932
933 static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) 933 static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
934 { 934 {
935 struct sched_entity *curr; 935 struct sched_entity *curr;
936 struct task_struct *p; 936 struct task_struct *p;
937 937
938 if (!cfs_rq->nr_running) 938 if (!cfs_rq->nr_running)
939 return MAX_PRIO; 939 return MAX_PRIO;
940 940
941 curr = __pick_next_entity(cfs_rq); 941 curr = __pick_next_entity(cfs_rq);
942 p = task_of(curr); 942 p = task_of(curr);
943 943
944 return p->prio; 944 return p->prio;
945 } 945 }
946 946
947 static int 947 static unsigned long
948 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 948 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
949 unsigned long max_nr_move, unsigned long max_load_move, 949 unsigned long max_nr_move, unsigned long max_load_move,
950 struct sched_domain *sd, enum cpu_idle_type idle, 950 struct sched_domain *sd, enum cpu_idle_type idle,
951 int *all_pinned, unsigned long *total_load_moved) 951 int *all_pinned)
952 { 952 {
953 struct cfs_rq *busy_cfs_rq; 953 struct cfs_rq *busy_cfs_rq;
954 unsigned long load_moved, total_nr_moved = 0, nr_moved; 954 unsigned long load_moved, total_nr_moved = 0, nr_moved;
955 long rem_load_move = max_load_move; 955 long rem_load_move = max_load_move;
956 struct rq_iterator cfs_rq_iterator; 956 struct rq_iterator cfs_rq_iterator;
957 957
958 cfs_rq_iterator.start = load_balance_start_fair; 958 cfs_rq_iterator.start = load_balance_start_fair;
959 cfs_rq_iterator.next = load_balance_next_fair; 959 cfs_rq_iterator.next = load_balance_next_fair;
960 960
961 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 961 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
962 struct cfs_rq *this_cfs_rq; 962 struct cfs_rq *this_cfs_rq;
963 long imbalance; 963 long imbalance;
964 unsigned long maxload; 964 unsigned long maxload;
965 int this_best_prio, best_prio, best_prio_seen = 0; 965 int this_best_prio, best_prio, best_prio_seen = 0;
966 966
967 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); 967 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
968 968
969 imbalance = busy_cfs_rq->load.weight - 969 imbalance = busy_cfs_rq->load.weight -
970 this_cfs_rq->load.weight; 970 this_cfs_rq->load.weight;
971 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ 971 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
972 if (imbalance <= 0) 972 if (imbalance <= 0)
973 continue; 973 continue;
974 974
975 /* Don't pull more than imbalance/2 */ 975 /* Don't pull more than imbalance/2 */
976 imbalance /= 2; 976 imbalance /= 2;
977 maxload = min(rem_load_move, imbalance); 977 maxload = min(rem_load_move, imbalance);
978 978
979 this_best_prio = cfs_rq_best_prio(this_cfs_rq); 979 this_best_prio = cfs_rq_best_prio(this_cfs_rq);
980 best_prio = cfs_rq_best_prio(busy_cfs_rq); 980 best_prio = cfs_rq_best_prio(busy_cfs_rq);
981 981
982 /* 982 /*
983 * Enable handling of the case where there is more than one task 983 * Enable handling of the case where there is more than one task
984 * with the best priority. If the current running task is one 984 * with the best priority. If the current running task is one
985 * of those with prio==best_prio we know it won't be moved 985 * of those with prio==best_prio we know it won't be moved
986 * and therefore it's safe to override the skip (based on load) 986 * and therefore it's safe to override the skip (based on load)
987 * of any task we find with that prio. 987 * of any task we find with that prio.
988 */ 988 */
989 if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se) 989 if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se)
990 best_prio_seen = 1; 990 best_prio_seen = 1;
991 991
992 /* pass busy_cfs_rq argument into 992 /* pass busy_cfs_rq argument into
993 * load_balance_[start|next]_fair iterators 993 * load_balance_[start|next]_fair iterators
994 */ 994 */
995 cfs_rq_iterator.arg = busy_cfs_rq; 995 cfs_rq_iterator.arg = busy_cfs_rq;
996 nr_moved = balance_tasks(this_rq, this_cpu, busiest, 996 nr_moved = balance_tasks(this_rq, this_cpu, busiest,
997 max_nr_move, maxload, sd, idle, all_pinned, 997 max_nr_move, maxload, sd, idle, all_pinned,
998 &load_moved, this_best_prio, best_prio, 998 &load_moved, this_best_prio, best_prio,
999 best_prio_seen, &cfs_rq_iterator); 999 best_prio_seen, &cfs_rq_iterator);
1000 1000
1001 total_nr_moved += nr_moved; 1001 total_nr_moved += nr_moved;
1002 max_nr_move -= nr_moved; 1002 max_nr_move -= nr_moved;
1003 rem_load_move -= load_moved; 1003 rem_load_move -= load_moved;
1004 1004
1005 if (max_nr_move <= 0 || rem_load_move <= 0) 1005 if (max_nr_move <= 0 || rem_load_move <= 0)
1006 break; 1006 break;
1007 } 1007 }
1008 1008
1009 *total_load_moved = max_load_move - rem_load_move; 1009 return max_load_move - rem_load_move;
1010
1011 return total_nr_moved;
1012 } 1010 }
1013 1011
1014 /* 1012 /*
1015 * scheduler tick hitting a task of our scheduling class: 1013 * scheduler tick hitting a task of our scheduling class:
1016 */ 1014 */
1017 static void task_tick_fair(struct rq *rq, struct task_struct *curr) 1015 static void task_tick_fair(struct rq *rq, struct task_struct *curr)
1018 { 1016 {
1019 struct cfs_rq *cfs_rq; 1017 struct cfs_rq *cfs_rq;
1020 struct sched_entity *se = &curr->se; 1018 struct sched_entity *se = &curr->se;
1021 1019
1022 for_each_sched_entity(se) { 1020 for_each_sched_entity(se) {
1023 cfs_rq = cfs_rq_of(se); 1021 cfs_rq = cfs_rq_of(se);
1024 entity_tick(cfs_rq, se); 1022 entity_tick(cfs_rq, se);
1025 } 1023 }
1026 } 1024 }
1027 1025
1028 /* 1026 /*
1029 * Share the fairness runtime between parent and child, thus the 1027 * Share the fairness runtime between parent and child, thus the
1030 * total amount of pressure for CPU stays equal - new tasks 1028 * total amount of pressure for CPU stays equal - new tasks
1031 * get a chance to run but frequent forkers are not allowed to 1029 * get a chance to run but frequent forkers are not allowed to
1032 * monopolize the CPU. Note: the parent runqueue is locked, 1030 * monopolize the CPU. Note: the parent runqueue is locked,
1033 * the child is not running yet. 1031 * the child is not running yet.
1034 */ 1032 */
1035 static void task_new_fair(struct rq *rq, struct task_struct *p, u64 now) 1033 static void task_new_fair(struct rq *rq, struct task_struct *p, u64 now)
1036 { 1034 {
1037 struct cfs_rq *cfs_rq = task_cfs_rq(p); 1035 struct cfs_rq *cfs_rq = task_cfs_rq(p);
1038 struct sched_entity *se = &p->se; 1036 struct sched_entity *se = &p->se;
1039 1037
1040 sched_info_queued(p); 1038 sched_info_queued(p);
1041 1039
1042 update_stats_enqueue(cfs_rq, se, now); 1040 update_stats_enqueue(cfs_rq, se, now);
1043 /* 1041 /*
1044 * Child runs first: we let it run before the parent 1042 * Child runs first: we let it run before the parent
1045 * until it reschedules once. We set up the key so that 1043 * until it reschedules once. We set up the key so that
1046 * it will preempt the parent: 1044 * it will preempt the parent:
1047 */ 1045 */
1048 p->se.fair_key = current->se.fair_key - 1046 p->se.fair_key = current->se.fair_key -
1049 niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1; 1047 niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1;
1050 /* 1048 /*
1051 * The first wait is dominated by the child-runs-first logic, 1049 * The first wait is dominated by the child-runs-first logic,
1052 * so do not credit it with that waiting time yet: 1050 * so do not credit it with that waiting time yet:
1053 */ 1051 */
1054 if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) 1052 if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL)
1055 p->se.wait_start_fair = 0; 1053 p->se.wait_start_fair = 0;
1056 1054
1057 /* 1055 /*
1058 * The statistical average of wait_runtime is about 1056 * The statistical average of wait_runtime is about
1059 * -granularity/2, so initialize the task with that: 1057 * -granularity/2, so initialize the task with that:
1060 */ 1058 */
1061 if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) 1059 if (sysctl_sched_features & SCHED_FEAT_START_DEBIT)
1062 p->se.wait_runtime = -(sysctl_sched_granularity / 2); 1060 p->se.wait_runtime = -(sysctl_sched_granularity / 2);
1063 1061
1064 __enqueue_entity(cfs_rq, se); 1062 __enqueue_entity(cfs_rq, se);
1065 } 1063 }
1066 1064
1067 #ifdef CONFIG_FAIR_GROUP_SCHED 1065 #ifdef CONFIG_FAIR_GROUP_SCHED
1068 /* Account for a task changing its policy or group. 1066 /* Account for a task changing its policy or group.
1069 * 1067 *
1070 * This routine is mostly called to set cfs_rq->curr field when a task 1068 * This routine is mostly called to set cfs_rq->curr field when a task
1071 * migrates between groups/classes. 1069 * migrates between groups/classes.
1072 */ 1070 */
1073 static void set_curr_task_fair(struct rq *rq) 1071 static void set_curr_task_fair(struct rq *rq)
1074 { 1072 {
1075 struct task_struct *curr = rq->curr; 1073 struct task_struct *curr = rq->curr;
1076 struct sched_entity *se = &curr->se; 1074 struct sched_entity *se = &curr->se;
1077 u64 now = rq_clock(rq); 1075 u64 now = rq_clock(rq);
1078 struct cfs_rq *cfs_rq; 1076 struct cfs_rq *cfs_rq;
1079 1077
1080 for_each_sched_entity(se) { 1078 for_each_sched_entity(se) {
1081 cfs_rq = cfs_rq_of(se); 1079 cfs_rq = cfs_rq_of(se);
1082 set_next_entity(cfs_rq, se, now); 1080 set_next_entity(cfs_rq, se, now);
1083 } 1081 }
1084 } 1082 }
1085 #else 1083 #else
1086 static void set_curr_task_fair(struct rq *rq) 1084 static void set_curr_task_fair(struct rq *rq)
1087 { 1085 {
1088 } 1086 }
1089 #endif 1087 #endif
1090 1088
1091 /* 1089 /*
1092 * All the scheduling class methods: 1090 * All the scheduling class methods:
1093 */ 1091 */
1094 struct sched_class fair_sched_class __read_mostly = { 1092 struct sched_class fair_sched_class __read_mostly = {
1095 .enqueue_task = enqueue_task_fair, 1093 .enqueue_task = enqueue_task_fair,
1096 .dequeue_task = dequeue_task_fair, 1094 .dequeue_task = dequeue_task_fair,
1097 .yield_task = yield_task_fair, 1095 .yield_task = yield_task_fair,
1098 1096
1099 .check_preempt_curr = check_preempt_curr_fair, 1097 .check_preempt_curr = check_preempt_curr_fair,
1100 1098
1101 .pick_next_task = pick_next_task_fair, 1099 .pick_next_task = pick_next_task_fair,
1102 .put_prev_task = put_prev_task_fair, 1100 .put_prev_task = put_prev_task_fair,
1103 1101
1104 .load_balance = load_balance_fair, 1102 .load_balance = load_balance_fair,
1105 1103
1106 .set_curr_task = set_curr_task_fair, 1104 .set_curr_task = set_curr_task_fair,
1107 .task_tick = task_tick_fair, 1105 .task_tick = task_tick_fair,
1108 .task_new = task_new_fair, 1106 .task_new = task_new_fair,
1109 }; 1107 };
1110 1108
1111 #ifdef CONFIG_SCHED_DEBUG 1109 #ifdef CONFIG_SCHED_DEBUG
1112 void print_cfs_stats(struct seq_file *m, int cpu, u64 now) 1110 void print_cfs_stats(struct seq_file *m, int cpu, u64 now)
1113 { 1111 {
1114 struct rq *rq = cpu_rq(cpu); 1112 struct rq *rq = cpu_rq(cpu);
1115 struct cfs_rq *cfs_rq; 1113 struct cfs_rq *cfs_rq;
1116 1114
1117 for_each_leaf_cfs_rq(rq, cfs_rq) 1115 for_each_leaf_cfs_rq(rq, cfs_rq)
1118 print_cfs_rq(m, cpu, cfs_rq, now); 1116 print_cfs_rq(m, cpu, cfs_rq, now);
1119 } 1117 }
1120 #endif 1118 #endif
1121 1119
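Illustrative sketch (not part of this diff, and not kernel code): the load_balance hook now reports the amount of weighted load it moved, and its callers only need to know whether that total is non-zero. The standalone toy below models that contract with plain function pointers; all names in it (balance_half, try_to_move, and so on) are invented for illustration only.

    /*
     * Standalone illustration of the simplified contract: each per-class
     * balance hook returns how much weighted load it moved, and the caller
     * only checks whether the accumulated total is non-zero.
     */
    #include <stdio.h>

    /* stand-in for a load_balance-style hook: returns load moved */
    typedef unsigned long (*balance_fn)(unsigned long max_load_move);

    static unsigned long balance_half(unsigned long max_load_move)
    {
    	return max_load_move / 2;	/* pretend we moved half of what was asked */
    }

    static unsigned long balance_none(unsigned long max_load_move)
    {
    	(void)max_load_move;
    	return 0;			/* like the idle class: nothing to move */
    }

    /* caller: success simply means "some load was moved" */
    static int try_to_move(balance_fn *classes, int nr, unsigned long max_load_move)
    {
    	unsigned long moved = 0;
    	int i;

    	for (i = 0; i < nr && moved < max_load_move; i++)
    		moved += classes[i](max_load_move - moved);

    	return moved > 0;
    }

    int main(void)
    {
    	balance_fn classes[] = { balance_none, balance_half };

    	printf("moved anything? %d\n", try_to_move(classes, 2, 128));
    	return 0;
    }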
kernel/sched_idletask.c
1 /* 1 /*
2 * idle-task scheduling class. 2 * idle-task scheduling class.
3 * 3 *
4 * (NOTE: these are not related to SCHED_IDLE tasks which are 4 * (NOTE: these are not related to SCHED_IDLE tasks which are
5 * handled in sched_fair.c) 5 * handled in sched_fair.c)
6 */ 6 */
7 7
8 /* 8 /*
9 * Idle tasks are unconditionally rescheduled: 9 * Idle tasks are unconditionally rescheduled:
10 */ 10 */
11 static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) 11 static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p)
12 { 12 {
13 resched_task(rq->idle); 13 resched_task(rq->idle);
14 } 14 }
15 15
16 static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now) 16 static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now)
17 { 17 {
18 schedstat_inc(rq, sched_goidle); 18 schedstat_inc(rq, sched_goidle);
19 19
20 return rq->idle; 20 return rq->idle;
21 } 21 }
22 22
23 /* 23 /*
24 * It is not legal to sleep in the idle task - print a warning 24 * It is not legal to sleep in the idle task - print a warning
25 * message if some code attempts to do it: 25 * message if some code attempts to do it:
26 */ 26 */
27 static void 27 static void
28 dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now) 28 dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now)
29 { 29 {
30 spin_unlock_irq(&rq->lock); 30 spin_unlock_irq(&rq->lock);
31 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 31 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
32 dump_stack(); 32 dump_stack();
33 spin_lock_irq(&rq->lock); 33 spin_lock_irq(&rq->lock);
34 } 34 }
35 35
36 static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, u64 now) 36 static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, u64 now)
37 { 37 {
38 } 38 }
39 39
40 static int 40 static unsigned long
41 load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, 41 load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
42 unsigned long max_nr_move, unsigned long max_load_move, 42 unsigned long max_nr_move, unsigned long max_load_move,
43 struct sched_domain *sd, enum cpu_idle_type idle, 43 struct sched_domain *sd, enum cpu_idle_type idle,
44 int *all_pinned, unsigned long *total_load_moved) 44 int *all_pinned)
45 { 45 {
46 return 0; 46 return 0;
47 } 47 }
48 48
49 static void task_tick_idle(struct rq *rq, struct task_struct *curr) 49 static void task_tick_idle(struct rq *rq, struct task_struct *curr)
50 { 50 {
51 } 51 }
52 52
53 /* 53 /*
54 * Simple, special scheduling class for the per-CPU idle tasks: 54 * Simple, special scheduling class for the per-CPU idle tasks:
55 */ 55 */
56 static struct sched_class idle_sched_class __read_mostly = { 56 static struct sched_class idle_sched_class __read_mostly = {
57 /* no enqueue/yield_task for idle tasks */ 57 /* no enqueue/yield_task for idle tasks */
58 58
59 /* dequeue is not valid, we print a debug message there: */ 59 /* dequeue is not valid, we print a debug message there: */
60 .dequeue_task = dequeue_task_idle, 60 .dequeue_task = dequeue_task_idle,
61 61
62 .check_preempt_curr = check_preempt_curr_idle, 62 .check_preempt_curr = check_preempt_curr_idle,
63 63
64 .pick_next_task = pick_next_task_idle, 64 .pick_next_task = pick_next_task_idle,
65 .put_prev_task = put_prev_task_idle, 65 .put_prev_task = put_prev_task_idle,
66 66
67 .load_balance = load_balance_idle, 67 .load_balance = load_balance_idle,
68 68
69 .task_tick = task_tick_idle, 69 .task_tick = task_tick_idle,
70 /* no .task_new for idle tasks */ 70 /* no .task_new for idle tasks */
71 }; 71 };
72 72
kernel/sched_rt.c
1 /* 1 /*
2 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR 2 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
3 * policies) 3 * policies)
4 */ 4 */
5 5
6 /* 6 /*
7 * Update the current task's runtime statistics. Skip current tasks that 7 * Update the current task's runtime statistics. Skip current tasks that
8 * are not in our scheduling class. 8 * are not in our scheduling class.
9 */ 9 */
10 static inline void update_curr_rt(struct rq *rq, u64 now) 10 static inline void update_curr_rt(struct rq *rq, u64 now)
11 { 11 {
12 struct task_struct *curr = rq->curr; 12 struct task_struct *curr = rq->curr;
13 u64 delta_exec; 13 u64 delta_exec;
14 14
15 if (!task_has_rt_policy(curr)) 15 if (!task_has_rt_policy(curr))
16 return; 16 return;
17 17
18 delta_exec = now - curr->se.exec_start; 18 delta_exec = now - curr->se.exec_start;
19 if (unlikely((s64)delta_exec < 0)) 19 if (unlikely((s64)delta_exec < 0))
20 delta_exec = 0; 20 delta_exec = 0;
21 21
22 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); 22 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
23 23
24 curr->se.sum_exec_runtime += delta_exec; 24 curr->se.sum_exec_runtime += delta_exec;
25 curr->se.exec_start = now; 25 curr->se.exec_start = now;
26 } 26 }
27 27
28 static void 28 static void
29 enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now) 29 enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
30 { 30 {
31 struct rt_prio_array *array = &rq->rt.active; 31 struct rt_prio_array *array = &rq->rt.active;
32 32
33 list_add_tail(&p->run_list, array->queue + p->prio); 33 list_add_tail(&p->run_list, array->queue + p->prio);
34 __set_bit(p->prio, array->bitmap); 34 __set_bit(p->prio, array->bitmap);
35 } 35 }
36 36
37 /* 37 /*
38 * Adding/removing a task to/from a priority array: 38 * Adding/removing a task to/from a priority array:
39 */ 39 */
40 static void 40 static void
41 dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep, u64 now) 41 dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep, u64 now)
42 { 42 {
43 struct rt_prio_array *array = &rq->rt.active; 43 struct rt_prio_array *array = &rq->rt.active;
44 44
45 update_curr_rt(rq, now); 45 update_curr_rt(rq, now);
46 46
47 list_del(&p->run_list); 47 list_del(&p->run_list);
48 if (list_empty(array->queue + p->prio)) 48 if (list_empty(array->queue + p->prio))
49 __clear_bit(p->prio, array->bitmap); 49 __clear_bit(p->prio, array->bitmap);
50 } 50 }
51 51
52 /* 52 /*
53 * Put task to the end of the run list without the overhead of dequeue 53 * Put task to the end of the run list without the overhead of dequeue
54 * followed by enqueue. 54 * followed by enqueue.
55 */ 55 */
56 static void requeue_task_rt(struct rq *rq, struct task_struct *p) 56 static void requeue_task_rt(struct rq *rq, struct task_struct *p)
57 { 57 {
58 struct rt_prio_array *array = &rq->rt.active; 58 struct rt_prio_array *array = &rq->rt.active;
59 59
60 list_move_tail(&p->run_list, array->queue + p->prio); 60 list_move_tail(&p->run_list, array->queue + p->prio);
61 } 61 }
62 62
63 static void 63 static void
64 yield_task_rt(struct rq *rq, struct task_struct *p) 64 yield_task_rt(struct rq *rq, struct task_struct *p)
65 { 65 {
66 requeue_task_rt(rq, p); 66 requeue_task_rt(rq, p);
67 } 67 }
68 68
69 /* 69 /*
70 * Preempt the current task with a newly woken task if needed: 70 * Preempt the current task with a newly woken task if needed:
71 */ 71 */
72 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) 72 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
73 { 73 {
74 if (p->prio < rq->curr->prio) 74 if (p->prio < rq->curr->prio)
75 resched_task(rq->curr); 75 resched_task(rq->curr);
76 } 76 }
77 77
78 static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now) 78 static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now)
79 { 79 {
80 struct rt_prio_array *array = &rq->rt.active; 80 struct rt_prio_array *array = &rq->rt.active;
81 struct task_struct *next; 81 struct task_struct *next;
82 struct list_head *queue; 82 struct list_head *queue;
83 int idx; 83 int idx;
84 84
85 idx = sched_find_first_bit(array->bitmap); 85 idx = sched_find_first_bit(array->bitmap);
86 if (idx >= MAX_RT_PRIO) 86 if (idx >= MAX_RT_PRIO)
87 return NULL; 87 return NULL;
88 88
89 queue = array->queue + idx; 89 queue = array->queue + idx;
90 next = list_entry(queue->next, struct task_struct, run_list); 90 next = list_entry(queue->next, struct task_struct, run_list);
91 91
92 next->se.exec_start = now; 92 next->se.exec_start = now;
93 93
94 return next; 94 return next;
95 } 95 }
96 96
97 static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now) 97 static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now)
98 { 98 {
99 update_curr_rt(rq, now); 99 update_curr_rt(rq, now);
100 p->se.exec_start = 0; 100 p->se.exec_start = 0;
101 } 101 }
102 102
103 /* 103 /*
104 * Load-balancing iterator. Note: while the runqueue stays locked 104 * Load-balancing iterator. Note: while the runqueue stays locked
105 * during the whole iteration, the current task might be 105 * during the whole iteration, the current task might be
106 * dequeued so the iterator has to be dequeue-safe. Here we 106 * dequeued so the iterator has to be dequeue-safe. Here we
107 * achieve that by always pre-iterating before returning 107 * achieve that by always pre-iterating before returning
108 * the current task: 108 * the current task:
109 */ 109 */
110 static struct task_struct *load_balance_start_rt(void *arg) 110 static struct task_struct *load_balance_start_rt(void *arg)
111 { 111 {
112 struct rq *rq = arg; 112 struct rq *rq = arg;
113 struct rt_prio_array *array = &rq->rt.active; 113 struct rt_prio_array *array = &rq->rt.active;
114 struct list_head *head, *curr; 114 struct list_head *head, *curr;
115 struct task_struct *p; 115 struct task_struct *p;
116 int idx; 116 int idx;
117 117
118 idx = sched_find_first_bit(array->bitmap); 118 idx = sched_find_first_bit(array->bitmap);
119 if (idx >= MAX_RT_PRIO) 119 if (idx >= MAX_RT_PRIO)
120 return NULL; 120 return NULL;
121 121
122 head = array->queue + idx; 122 head = array->queue + idx;
123 curr = head->prev; 123 curr = head->prev;
124 124
125 p = list_entry(curr, struct task_struct, run_list); 125 p = list_entry(curr, struct task_struct, run_list);
126 126
127 curr = curr->prev; 127 curr = curr->prev;
128 128
129 rq->rt.rt_load_balance_idx = idx; 129 rq->rt.rt_load_balance_idx = idx;
130 rq->rt.rt_load_balance_head = head; 130 rq->rt.rt_load_balance_head = head;
131 rq->rt.rt_load_balance_curr = curr; 131 rq->rt.rt_load_balance_curr = curr;
132 132
133 return p; 133 return p;
134 } 134 }
135 135
136 static struct task_struct *load_balance_next_rt(void *arg) 136 static struct task_struct *load_balance_next_rt(void *arg)
137 { 137 {
138 struct rq *rq = arg; 138 struct rq *rq = arg;
139 struct rt_prio_array *array = &rq->rt.active; 139 struct rt_prio_array *array = &rq->rt.active;
140 struct list_head *head, *curr; 140 struct list_head *head, *curr;
141 struct task_struct *p; 141 struct task_struct *p;
142 int idx; 142 int idx;
143 143
144 idx = rq->rt.rt_load_balance_idx; 144 idx = rq->rt.rt_load_balance_idx;
145 head = rq->rt.rt_load_balance_head; 145 head = rq->rt.rt_load_balance_head;
146 curr = rq->rt.rt_load_balance_curr; 146 curr = rq->rt.rt_load_balance_curr;
147 147
148 /* 148 /*
149 * If we arrived back to the head again then 149 * If we arrived back to the head again then
150 * iterate to the next queue (if any): 150 * iterate to the next queue (if any):
151 */ 151 */
152 if (unlikely(head == curr)) { 152 if (unlikely(head == curr)) {
153 int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); 153 int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
154 154
155 if (next_idx >= MAX_RT_PRIO) 155 if (next_idx >= MAX_RT_PRIO)
156 return NULL; 156 return NULL;
157 157
158 idx = next_idx; 158 idx = next_idx;
159 head = array->queue + idx; 159 head = array->queue + idx;
160 curr = head->prev; 160 curr = head->prev;
161 161
162 rq->rt.rt_load_balance_idx = idx; 162 rq->rt.rt_load_balance_idx = idx;
163 rq->rt.rt_load_balance_head = head; 163 rq->rt.rt_load_balance_head = head;
164 } 164 }
165 165
166 p = list_entry(curr, struct task_struct, run_list); 166 p = list_entry(curr, struct task_struct, run_list);
167 167
168 curr = curr->prev; 168 curr = curr->prev;
169 169
170 rq->rt.rt_load_balance_curr = curr; 170 rq->rt.rt_load_balance_curr = curr;
171 171
172 return p; 172 return p;
173 } 173 }
174 174
175 static int 175 static unsigned long
176 load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, 176 load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
177 unsigned long max_nr_move, unsigned long max_load_move, 177 unsigned long max_nr_move, unsigned long max_load_move,
178 struct sched_domain *sd, enum cpu_idle_type idle, 178 struct sched_domain *sd, enum cpu_idle_type idle,
179 int *all_pinned, unsigned long *load_moved) 179 int *all_pinned)
180 { 180 {
181 int this_best_prio, best_prio, best_prio_seen = 0; 181 int this_best_prio, best_prio, best_prio_seen = 0;
182 int nr_moved; 182 int nr_moved;
183 struct rq_iterator rt_rq_iterator; 183 struct rq_iterator rt_rq_iterator;
184 unsigned long load_moved;
184 185
185 best_prio = sched_find_first_bit(busiest->rt.active.bitmap); 186 best_prio = sched_find_first_bit(busiest->rt.active.bitmap);
186 this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap); 187 this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap);
187 188
188 /* 189 /*
189 * Enable handling of the case where there is more than one task 190 * Enable handling of the case where there is more than one task
190 * with the best priority. If the current running task is one 191 * with the best priority. If the current running task is one
191 * of those with prio==best_prio we know it won't be moved 192 * of those with prio==best_prio we know it won't be moved
192 * and therefore it's safe to override the skip (based on load) 193 * and therefore it's safe to override the skip (based on load)
193 * of any task we find with that prio. 194 * of any task we find with that prio.
194 */ 195 */
195 if (busiest->curr->prio == best_prio) 196 if (busiest->curr->prio == best_prio)
196 best_prio_seen = 1; 197 best_prio_seen = 1;
197 198
198 rt_rq_iterator.start = load_balance_start_rt; 199 rt_rq_iterator.start = load_balance_start_rt;
199 rt_rq_iterator.next = load_balance_next_rt; 200 rt_rq_iterator.next = load_balance_next_rt;
200 /* pass 'busiest' rq argument into 201 /* pass 'busiest' rq argument into
201 * load_balance_[start|next]_rt iterators 202 * load_balance_[start|next]_rt iterators
202 */ 203 */
203 rt_rq_iterator.arg = busiest; 204 rt_rq_iterator.arg = busiest;
204 205
205 nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move, 206 nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move,
206 max_load_move, sd, idle, all_pinned, load_moved, 207 max_load_move, sd, idle, all_pinned, &load_moved,
207 this_best_prio, best_prio, best_prio_seen, 208 this_best_prio, best_prio, best_prio_seen,
208 &rt_rq_iterator); 209 &rt_rq_iterator);
209 210
210 return nr_moved; 211 return load_moved;
211 } 212 }
212 213
213 static void task_tick_rt(struct rq *rq, struct task_struct *p) 214 static void task_tick_rt(struct rq *rq, struct task_struct *p)
214 { 215 {
215 /* 216 /*
216 * RR tasks need a special form of timeslice management. 217 * RR tasks need a special form of timeslice management.
217 * FIFO tasks have no timeslices. 218 * FIFO tasks have no timeslices.
218 */ 219 */
219 if (p->policy != SCHED_RR) 220 if (p->policy != SCHED_RR)
220 return; 221 return;
221 222
222 if (--p->time_slice) 223 if (--p->time_slice)
223 return; 224 return;
224 225
225 p->time_slice = static_prio_timeslice(p->static_prio); 226 p->time_slice = static_prio_timeslice(p->static_prio);
226 set_tsk_need_resched(p); 227 set_tsk_need_resched(p);
227 228
228 /* put it at the end of the queue: */ 229 /* put it at the end of the queue: */
229 requeue_task_rt(rq, p); 230 requeue_task_rt(rq, p);
230 } 231 }
231 232
232 static struct sched_class rt_sched_class __read_mostly = { 233 static struct sched_class rt_sched_class __read_mostly = {
233 .enqueue_task = enqueue_task_rt, 234 .enqueue_task = enqueue_task_rt,
234 .dequeue_task = dequeue_task_rt, 235 .dequeue_task = dequeue_task_rt,
235 .yield_task = yield_task_rt, 236 .yield_task = yield_task_rt,
236 237
237 .check_preempt_curr = check_preempt_curr_rt, 238 .check_preempt_curr = check_preempt_curr_rt,
238 239
239 .pick_next_task = pick_next_task_rt, 240 .pick_next_task = pick_next_task_rt,
240 .put_prev_task = put_prev_task_rt, 241 .put_prev_task = put_prev_task_rt,
241 242
242 .load_balance = load_balance_rt, 243 .load_balance = load_balance_rt,
243 244
244 .task_tick = task_tick_rt, 245 .task_tick = task_tick_rt,
245 }; 246 };
246 247
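Illustrative, standalone sketch (not kernel code) of the dequeue-safe iterator pattern used by the load_balance_[start|next]_fair and load_balance_[start|next]_rt functions above: the cursor is advanced before the current element is handed back, so the caller may remove the element it was just given without breaking the iteration. The list and iterator types below are invented for the demonstration.

    /*
     * The iterator pre-advances its cursor before returning an element,
     * so removing the returned element from the list is always safe.
     */
    #include <stdio.h>

    struct node {
    	int val;
    	struct node *next;
    };

    struct iter {
    	struct node *curr;	/* already points one element ahead */
    };

    static struct node *iter_start(struct iter *it, struct node *head)
    {
    	struct node *p = head;

    	it->curr = p ? p->next : NULL;	/* pre-iterate before returning */
    	return p;
    }

    static struct node *iter_next(struct iter *it)
    {
    	struct node *p = it->curr;

    	it->curr = p ? p->next : NULL;	/* advance first, then hand back */
    	return p;
    }

    int main(void)
    {
    	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
    	struct node *p;
    	struct iter it;

    	for (p = iter_start(&it, &a); p; p = iter_next(&it)) {
    		printf("visiting %d\n", p->val);
    		if (p->val == 2)	/* "dequeue" the element we were just given */
    			a.next = &c;	/* safe: the iterator has already moved past it */
    	}
    	return 0;
    }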