Commit 4301065920b0cbde3986519582347e883b166f3e
Committed by Ingo Molnar
1 parent f1a438d813
Exists in master and in 7 other branches

sched: simplify move_tasks()
The move_tasks() function is currently multiplexed with two distinct capabilities:

 1. attempt to move a specified amount of weighted load from one run queue to another; and
 2. attempt to move a specified number of tasks from one run queue to another.

The first of these capabilities is used in two places, load_balance() and load_balance_idle(), and in both of these cases the return value of move_tasks() is used purely to decide if tasks/load were moved; no notice is taken of the actual number of tasks moved.

The second capability is used in exactly one place, active_load_balance(), to attempt to move exactly one task and, as before, the return value is only used as an indicator of success or failure.

This multiplexing of move_tasks() was introduced, by me, as part of the smpnice patches and was motivated by the fact that the alternative, one function to move specified load and one to move a single task, would have led to two functions of roughly the same complexity as the old move_tasks() (or the new balance_tasks()). However, the modular design of the new CFS scheduler allows a simpler solution to be adopted, and this patch implements that solution by:

 1. adding a new function, move_one_task(), to be used by active_load_balance(); and
 2. making move_tasks() a single-purpose function that tries to move a specified weighted load and returns 1 for success and 0 for failure.

One consequence of these changes is that neither move_one_task() nor the new move_tasks() cares how many tasks sched_class.load_balance() moves, which enables its interface to be simplified: it now returns the amount of load moved as its result, and the load_moved pointer is removed from the argument list. This helps simplify the new move_tasks() and slightly reduces the amount of work done in each of sched_class.load_balance()'s implementations.

Further simplifications, e.g. changes to balance_tasks(), are possible but (slightly) complicated by the special needs of load_balance_fair(), so I've left them to a later patch (if this one gets accepted).

NB: Since move_tasks() gets called with two run queue locks held, even small reductions in overhead are worthwhile.

[ mingo@elte.hu ]

this change also reduces code size nicely:

    text    data     bss     dec     hex filename
   39216    3618      24   42858    a76a sched.o.before
   39173    3618      24   42815    a73f sched.o.after

Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
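For illustration only, here is a small standalone C sketch of the split described above. It is not the kernel code: the real functions also take the CPU number, a sched_domain, the idle type and a pinned-CPU mask, and the scheduling classes are the kernel's own; the names below (highest_class, fake_class, fake_load_balance) are made-up stand-ins. It shows why letting the per-class load_balance() hook return the load it moved keeps both callers trivial.

/*
 * Standalone sketch (NOT kernel code) of the move_tasks()/move_one_task()
 * split described in the commit message.  All names and signatures here
 * are simplified stand-ins.
 */
#include <limits.h>
#include <stdio.h>

struct rq;				/* opaque stand-in for a run queue */

/* Per-class hook now returns the weighted load it moved directly. */
struct sched_class {
	const struct sched_class *next;
	unsigned long (*load_balance)(struct rq *this_rq, struct rq *busiest,
				      unsigned long max_load_move);
};

static const struct sched_class *highest_class;

/*
 * Single-purpose move_tasks(): keep pulling load from each class until the
 * requested amount has been moved; returns 1 if anything moved, else 0.
 */
static int move_tasks(struct rq *this_rq, struct rq *busiest,
		      unsigned long max_load_move)
{
	const struct sched_class *class = highest_class;
	unsigned long total_load_moved = 0;

	do {
		total_load_moved +=
			class->load_balance(this_rq, busiest,
					    max_load_move - total_load_moved);
		class = class->next;
	} while (class && max_load_move > total_load_moved);

	return total_load_moved > 0;
}

/* move_one_task(): for active_load_balance(); only success/failure matters. */
static int move_one_task(struct rq *this_rq, struct rq *busiest)
{
	const struct sched_class *class;

	for (class = highest_class; class; class = class->next)
		if (class->load_balance(this_rq, busiest, ULONG_MAX))
			return 1;
	return 0;
}

/* Trivial fake scheduling class so the sketch actually runs. */
static unsigned long fake_load_balance(struct rq *this_rq, struct rq *busiest,
				       unsigned long max_load_move)
{
	(void)this_rq; (void)busiest;
	return max_load_move < 1024 ? max_load_move : 1024;
}

static const struct sched_class fake_class = { NULL, fake_load_balance };

int main(void)
{
	highest_class = &fake_class;
	printf("move_tasks(2048):  %d\n", move_tasks(NULL, NULL, 2048));
	printf("move_one_task():   %d\n", move_one_task(NULL, NULL));
	return 0;
}

With the old interface each sched_class implementation had to report the moved load through a *load_moved out-parameter while also returning a task count that, as the commit message notes, no caller actually used; returning the load directly removes that bookkeeping.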
Showing 5 changed files with 58 additions and 49 deletions

Inline Diff
include/linux/sched.h
1 | #ifndef _LINUX_SCHED_H | 1 | #ifndef _LINUX_SCHED_H |
2 | #define _LINUX_SCHED_H | 2 | #define _LINUX_SCHED_H |
3 | 3 | ||
4 | #include <linux/auxvec.h> /* For AT_VECTOR_SIZE */ | 4 | #include <linux/auxvec.h> /* For AT_VECTOR_SIZE */ |
5 | 5 | ||
6 | /* | 6 | /* |
7 | * cloning flags: | 7 | * cloning flags: |
8 | */ | 8 | */ |
9 | #define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ | 9 | #define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ |
10 | #define CLONE_VM 0x00000100 /* set if VM shared between processes */ | 10 | #define CLONE_VM 0x00000100 /* set if VM shared between processes */ |
11 | #define CLONE_FS 0x00000200 /* set if fs info shared between processes */ | 11 | #define CLONE_FS 0x00000200 /* set if fs info shared between processes */ |
12 | #define CLONE_FILES 0x00000400 /* set if open files shared between processes */ | 12 | #define CLONE_FILES 0x00000400 /* set if open files shared between processes */ |
13 | #define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ | 13 | #define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ |
14 | #define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ | 14 | #define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ |
15 | #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ | 15 | #define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ |
16 | #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ | 16 | #define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ |
17 | #define CLONE_THREAD 0x00010000 /* Same thread group? */ | 17 | #define CLONE_THREAD 0x00010000 /* Same thread group? */ |
18 | #define CLONE_NEWNS 0x00020000 /* New namespace group? */ | 18 | #define CLONE_NEWNS 0x00020000 /* New namespace group? */ |
19 | #define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ | 19 | #define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */ |
20 | #define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ | 20 | #define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */ |
21 | #define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ | 21 | #define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */ |
22 | #define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */ | 22 | #define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */ |
23 | #define CLONE_DETACHED 0x00400000 /* Unused, ignored */ | 23 | #define CLONE_DETACHED 0x00400000 /* Unused, ignored */ |
24 | #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ | 24 | #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ |
25 | #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ | 25 | #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ |
26 | #define CLONE_STOPPED 0x02000000 /* Start in stopped state */ | 26 | #define CLONE_STOPPED 0x02000000 /* Start in stopped state */ |
27 | #define CLONE_NEWUTS 0x04000000 /* New utsname group? */ | 27 | #define CLONE_NEWUTS 0x04000000 /* New utsname group? */ |
28 | #define CLONE_NEWIPC 0x08000000 /* New ipcs */ | 28 | #define CLONE_NEWIPC 0x08000000 /* New ipcs */ |
29 | #define CLONE_NEWUSER 0x10000000 /* New user namespace */ | 29 | #define CLONE_NEWUSER 0x10000000 /* New user namespace */ |
30 | 30 | ||
31 | /* | 31 | /* |
32 | * Scheduling policies | 32 | * Scheduling policies |
33 | */ | 33 | */ |
34 | #define SCHED_NORMAL 0 | 34 | #define SCHED_NORMAL 0 |
35 | #define SCHED_FIFO 1 | 35 | #define SCHED_FIFO 1 |
36 | #define SCHED_RR 2 | 36 | #define SCHED_RR 2 |
37 | #define SCHED_BATCH 3 | 37 | #define SCHED_BATCH 3 |
38 | /* SCHED_ISO: reserved but not implemented yet */ | 38 | /* SCHED_ISO: reserved but not implemented yet */ |
39 | #define SCHED_IDLE 5 | 39 | #define SCHED_IDLE 5 |
40 | 40 | ||
41 | #ifdef __KERNEL__ | 41 | #ifdef __KERNEL__ |
42 | 42 | ||
43 | struct sched_param { | 43 | struct sched_param { |
44 | int sched_priority; | 44 | int sched_priority; |
45 | }; | 45 | }; |
46 | 46 | ||
47 | #include <asm/param.h> /* for HZ */ | 47 | #include <asm/param.h> /* for HZ */ |
48 | 48 | ||
49 | #include <linux/capability.h> | 49 | #include <linux/capability.h> |
50 | #include <linux/threads.h> | 50 | #include <linux/threads.h> |
51 | #include <linux/kernel.h> | 51 | #include <linux/kernel.h> |
52 | #include <linux/types.h> | 52 | #include <linux/types.h> |
53 | #include <linux/timex.h> | 53 | #include <linux/timex.h> |
54 | #include <linux/jiffies.h> | 54 | #include <linux/jiffies.h> |
55 | #include <linux/rbtree.h> | 55 | #include <linux/rbtree.h> |
56 | #include <linux/thread_info.h> | 56 | #include <linux/thread_info.h> |
57 | #include <linux/cpumask.h> | 57 | #include <linux/cpumask.h> |
58 | #include <linux/errno.h> | 58 | #include <linux/errno.h> |
59 | #include <linux/nodemask.h> | 59 | #include <linux/nodemask.h> |
60 | 60 | ||
61 | #include <asm/system.h> | 61 | #include <asm/system.h> |
62 | #include <asm/semaphore.h> | 62 | #include <asm/semaphore.h> |
63 | #include <asm/page.h> | 63 | #include <asm/page.h> |
64 | #include <asm/ptrace.h> | 64 | #include <asm/ptrace.h> |
65 | #include <asm/mmu.h> | 65 | #include <asm/mmu.h> |
66 | #include <asm/cputime.h> | 66 | #include <asm/cputime.h> |
67 | 67 | ||
68 | #include <linux/smp.h> | 68 | #include <linux/smp.h> |
69 | #include <linux/sem.h> | 69 | #include <linux/sem.h> |
70 | #include <linux/signal.h> | 70 | #include <linux/signal.h> |
71 | #include <linux/securebits.h> | 71 | #include <linux/securebits.h> |
72 | #include <linux/fs_struct.h> | 72 | #include <linux/fs_struct.h> |
73 | #include <linux/compiler.h> | 73 | #include <linux/compiler.h> |
74 | #include <linux/completion.h> | 74 | #include <linux/completion.h> |
75 | #include <linux/pid.h> | 75 | #include <linux/pid.h> |
76 | #include <linux/percpu.h> | 76 | #include <linux/percpu.h> |
77 | #include <linux/topology.h> | 77 | #include <linux/topology.h> |
78 | #include <linux/seccomp.h> | 78 | #include <linux/seccomp.h> |
79 | #include <linux/rcupdate.h> | 79 | #include <linux/rcupdate.h> |
80 | #include <linux/futex.h> | 80 | #include <linux/futex.h> |
81 | #include <linux/rtmutex.h> | 81 | #include <linux/rtmutex.h> |
82 | 82 | ||
83 | #include <linux/time.h> | 83 | #include <linux/time.h> |
84 | #include <linux/param.h> | 84 | #include <linux/param.h> |
85 | #include <linux/resource.h> | 85 | #include <linux/resource.h> |
86 | #include <linux/timer.h> | 86 | #include <linux/timer.h> |
87 | #include <linux/hrtimer.h> | 87 | #include <linux/hrtimer.h> |
88 | #include <linux/task_io_accounting.h> | 88 | #include <linux/task_io_accounting.h> |
89 | 89 | ||
90 | #include <asm/processor.h> | 90 | #include <asm/processor.h> |
91 | 91 | ||
92 | struct exec_domain; | 92 | struct exec_domain; |
93 | struct futex_pi_state; | 93 | struct futex_pi_state; |
94 | struct bio; | 94 | struct bio; |
95 | 95 | ||
96 | /* | 96 | /* |
97 | * List of flags we want to share for kernel threads, | 97 | * List of flags we want to share for kernel threads, |
98 | * if only because they are not used by them anyway. | 98 | * if only because they are not used by them anyway. |
99 | */ | 99 | */ |
100 | #define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND) | 100 | #define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND) |
101 | 101 | ||
102 | /* | 102 | /* |
103 | * These are the constant used to fake the fixed-point load-average | 103 | * These are the constant used to fake the fixed-point load-average |
104 | * counting. Some notes: | 104 | * counting. Some notes: |
105 | * - 11 bit fractions expand to 22 bits by the multiplies: this gives | 105 | * - 11 bit fractions expand to 22 bits by the multiplies: this gives |
106 | * a load-average precision of 10 bits integer + 11 bits fractional | 106 | * a load-average precision of 10 bits integer + 11 bits fractional |
107 | * - if you want to count load-averages more often, you need more | 107 | * - if you want to count load-averages more often, you need more |
108 | * precision, or rounding will get you. With 2-second counting freq, | 108 | * precision, or rounding will get you. With 2-second counting freq, |
109 | * the EXP_n values would be 1981, 2034 and 2043 if still using only | 109 | * the EXP_n values would be 1981, 2034 and 2043 if still using only |
110 | * 11 bit fractions. | 110 | * 11 bit fractions. |
111 | */ | 111 | */ |
112 | extern unsigned long avenrun[]; /* Load averages */ | 112 | extern unsigned long avenrun[]; /* Load averages */ |
113 | 113 | ||
114 | #define FSHIFT 11 /* nr of bits of precision */ | 114 | #define FSHIFT 11 /* nr of bits of precision */ |
115 | #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ | 115 | #define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ |
116 | #define LOAD_FREQ (5*HZ) /* 5 sec intervals */ | 116 | #define LOAD_FREQ (5*HZ) /* 5 sec intervals */ |
117 | #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */ | 117 | #define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */ |
118 | #define EXP_5 2014 /* 1/exp(5sec/5min) */ | 118 | #define EXP_5 2014 /* 1/exp(5sec/5min) */ |
119 | #define EXP_15 2037 /* 1/exp(5sec/15min) */ | 119 | #define EXP_15 2037 /* 1/exp(5sec/15min) */ |
120 | 120 | ||
121 | #define CALC_LOAD(load,exp,n) \ | 121 | #define CALC_LOAD(load,exp,n) \ |
122 | load *= exp; \ | 122 | load *= exp; \ |
123 | load += n*(FIXED_1-exp); \ | 123 | load += n*(FIXED_1-exp); \ |
124 | load >>= FSHIFT; | 124 | load >>= FSHIFT; |
125 | 125 | ||
126 | extern unsigned long total_forks; | 126 | extern unsigned long total_forks; |
127 | extern int nr_threads; | 127 | extern int nr_threads; |
128 | DECLARE_PER_CPU(unsigned long, process_counts); | 128 | DECLARE_PER_CPU(unsigned long, process_counts); |
129 | extern int nr_processes(void); | 129 | extern int nr_processes(void); |
130 | extern unsigned long nr_running(void); | 130 | extern unsigned long nr_running(void); |
131 | extern unsigned long nr_uninterruptible(void); | 131 | extern unsigned long nr_uninterruptible(void); |
132 | extern unsigned long nr_active(void); | 132 | extern unsigned long nr_active(void); |
133 | extern unsigned long nr_iowait(void); | 133 | extern unsigned long nr_iowait(void); |
134 | extern unsigned long weighted_cpuload(const int cpu); | 134 | extern unsigned long weighted_cpuload(const int cpu); |
135 | 135 | ||
136 | struct seq_file; | 136 | struct seq_file; |
137 | struct cfs_rq; | 137 | struct cfs_rq; |
138 | #ifdef CONFIG_SCHED_DEBUG | 138 | #ifdef CONFIG_SCHED_DEBUG |
139 | extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); | 139 | extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); |
140 | extern void proc_sched_set_task(struct task_struct *p); | 140 | extern void proc_sched_set_task(struct task_struct *p); |
141 | extern void | 141 | extern void |
142 | print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now); | 142 | print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now); |
143 | #else | 143 | #else |
144 | static inline void | 144 | static inline void |
145 | proc_sched_show_task(struct task_struct *p, struct seq_file *m) | 145 | proc_sched_show_task(struct task_struct *p, struct seq_file *m) |
146 | { | 146 | { |
147 | } | 147 | } |
148 | static inline void proc_sched_set_task(struct task_struct *p) | 148 | static inline void proc_sched_set_task(struct task_struct *p) |
149 | { | 149 | { |
150 | } | 150 | } |
151 | static inline void | 151 | static inline void |
152 | print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now) | 152 | print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now) |
153 | { | 153 | { |
154 | } | 154 | } |
155 | #endif | 155 | #endif |
156 | 156 | ||
157 | /* | 157 | /* |
158 | * Task state bitmask. NOTE! These bits are also | 158 | * Task state bitmask. NOTE! These bits are also |
159 | * encoded in fs/proc/array.c: get_task_state(). | 159 | * encoded in fs/proc/array.c: get_task_state(). |
160 | * | 160 | * |
161 | * We have two separate sets of flags: task->state | 161 | * We have two separate sets of flags: task->state |
162 | * is about runnability, while task->exit_state are | 162 | * is about runnability, while task->exit_state are |
163 | * about the task exiting. Confusing, but this way | 163 | * about the task exiting. Confusing, but this way |
164 | * modifying one set can't modify the other one by | 164 | * modifying one set can't modify the other one by |
165 | * mistake. | 165 | * mistake. |
166 | */ | 166 | */ |
167 | #define TASK_RUNNING 0 | 167 | #define TASK_RUNNING 0 |
168 | #define TASK_INTERRUPTIBLE 1 | 168 | #define TASK_INTERRUPTIBLE 1 |
169 | #define TASK_UNINTERRUPTIBLE 2 | 169 | #define TASK_UNINTERRUPTIBLE 2 |
170 | #define TASK_STOPPED 4 | 170 | #define TASK_STOPPED 4 |
171 | #define TASK_TRACED 8 | 171 | #define TASK_TRACED 8 |
172 | /* in tsk->exit_state */ | 172 | /* in tsk->exit_state */ |
173 | #define EXIT_ZOMBIE 16 | 173 | #define EXIT_ZOMBIE 16 |
174 | #define EXIT_DEAD 32 | 174 | #define EXIT_DEAD 32 |
175 | /* in tsk->state again */ | 175 | /* in tsk->state again */ |
176 | #define TASK_NONINTERACTIVE 64 | 176 | #define TASK_NONINTERACTIVE 64 |
177 | #define TASK_DEAD 128 | 177 | #define TASK_DEAD 128 |
178 | 178 | ||
179 | #define __set_task_state(tsk, state_value) \ | 179 | #define __set_task_state(tsk, state_value) \ |
180 | do { (tsk)->state = (state_value); } while (0) | 180 | do { (tsk)->state = (state_value); } while (0) |
181 | #define set_task_state(tsk, state_value) \ | 181 | #define set_task_state(tsk, state_value) \ |
182 | set_mb((tsk)->state, (state_value)) | 182 | set_mb((tsk)->state, (state_value)) |
183 | 183 | ||
184 | /* | 184 | /* |
185 | * set_current_state() includes a barrier so that the write of current->state | 185 | * set_current_state() includes a barrier so that the write of current->state |
186 | * is correctly serialised wrt the caller's subsequent test of whether to | 186 | * is correctly serialised wrt the caller's subsequent test of whether to |
187 | * actually sleep: | 187 | * actually sleep: |
188 | * | 188 | * |
189 | * set_current_state(TASK_UNINTERRUPTIBLE); | 189 | * set_current_state(TASK_UNINTERRUPTIBLE); |
190 | * if (do_i_need_to_sleep()) | 190 | * if (do_i_need_to_sleep()) |
191 | * schedule(); | 191 | * schedule(); |
192 | * | 192 | * |
193 | * If the caller does not need such serialisation then use __set_current_state() | 193 | * If the caller does not need such serialisation then use __set_current_state() |
194 | */ | 194 | */ |
195 | #define __set_current_state(state_value) \ | 195 | #define __set_current_state(state_value) \ |
196 | do { current->state = (state_value); } while (0) | 196 | do { current->state = (state_value); } while (0) |
197 | #define set_current_state(state_value) \ | 197 | #define set_current_state(state_value) \ |
198 | set_mb(current->state, (state_value)) | 198 | set_mb(current->state, (state_value)) |
199 | 199 | ||
200 | /* Task command name length */ | 200 | /* Task command name length */ |
201 | #define TASK_COMM_LEN 16 | 201 | #define TASK_COMM_LEN 16 |
202 | 202 | ||
203 | #include <linux/spinlock.h> | 203 | #include <linux/spinlock.h> |
204 | 204 | ||
205 | /* | 205 | /* |
206 | * This serializes "schedule()" and also protects | 206 | * This serializes "schedule()" and also protects |
207 | * the run-queue from deletions/modifications (but | 207 | * the run-queue from deletions/modifications (but |
208 | * _adding_ to the beginning of the run-queue has | 208 | * _adding_ to the beginning of the run-queue has |
209 | * a separate lock). | 209 | * a separate lock). |
210 | */ | 210 | */ |
211 | extern rwlock_t tasklist_lock; | 211 | extern rwlock_t tasklist_lock; |
212 | extern spinlock_t mmlist_lock; | 212 | extern spinlock_t mmlist_lock; |
213 | 213 | ||
214 | struct task_struct; | 214 | struct task_struct; |
215 | 215 | ||
216 | extern void sched_init(void); | 216 | extern void sched_init(void); |
217 | extern void sched_init_smp(void); | 217 | extern void sched_init_smp(void); |
218 | extern void init_idle(struct task_struct *idle, int cpu); | 218 | extern void init_idle(struct task_struct *idle, int cpu); |
219 | extern void init_idle_bootup_task(struct task_struct *idle); | 219 | extern void init_idle_bootup_task(struct task_struct *idle); |
220 | 220 | ||
221 | extern cpumask_t nohz_cpu_mask; | 221 | extern cpumask_t nohz_cpu_mask; |
222 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) | 222 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) |
223 | extern int select_nohz_load_balancer(int cpu); | 223 | extern int select_nohz_load_balancer(int cpu); |
224 | #else | 224 | #else |
225 | static inline int select_nohz_load_balancer(int cpu) | 225 | static inline int select_nohz_load_balancer(int cpu) |
226 | { | 226 | { |
227 | return 0; | 227 | return 0; |
228 | } | 228 | } |
229 | #endif | 229 | #endif |
230 | 230 | ||
231 | /* | 231 | /* |
232 | * Only dump TASK_* tasks. (0 for all tasks) | 232 | * Only dump TASK_* tasks. (0 for all tasks) |
233 | */ | 233 | */ |
234 | extern void show_state_filter(unsigned long state_filter); | 234 | extern void show_state_filter(unsigned long state_filter); |
235 | 235 | ||
236 | static inline void show_state(void) | 236 | static inline void show_state(void) |
237 | { | 237 | { |
238 | show_state_filter(0); | 238 | show_state_filter(0); |
239 | } | 239 | } |
240 | 240 | ||
241 | extern void show_regs(struct pt_regs *); | 241 | extern void show_regs(struct pt_regs *); |
242 | 242 | ||
243 | /* | 243 | /* |
244 | * TASK is a pointer to the task whose backtrace we want to see (or NULL for current | 244 | * TASK is a pointer to the task whose backtrace we want to see (or NULL for current |
245 | * task), SP is the stack pointer of the first frame that should be shown in the back | 245 | * task), SP is the stack pointer of the first frame that should be shown in the back |
246 | * trace (or NULL if the entire call-chain of the task should be shown). | 246 | * trace (or NULL if the entire call-chain of the task should be shown). |
247 | */ | 247 | */ |
248 | extern void show_stack(struct task_struct *task, unsigned long *sp); | 248 | extern void show_stack(struct task_struct *task, unsigned long *sp); |
249 | 249 | ||
250 | void io_schedule(void); | 250 | void io_schedule(void); |
251 | long io_schedule_timeout(long timeout); | 251 | long io_schedule_timeout(long timeout); |
252 | 252 | ||
253 | extern void cpu_init (void); | 253 | extern void cpu_init (void); |
254 | extern void trap_init(void); | 254 | extern void trap_init(void); |
255 | extern void update_process_times(int user); | 255 | extern void update_process_times(int user); |
256 | extern void scheduler_tick(void); | 256 | extern void scheduler_tick(void); |
257 | 257 | ||
258 | #ifdef CONFIG_DETECT_SOFTLOCKUP | 258 | #ifdef CONFIG_DETECT_SOFTLOCKUP |
259 | extern void softlockup_tick(void); | 259 | extern void softlockup_tick(void); |
260 | extern void spawn_softlockup_task(void); | 260 | extern void spawn_softlockup_task(void); |
261 | extern void touch_softlockup_watchdog(void); | 261 | extern void touch_softlockup_watchdog(void); |
262 | extern void touch_all_softlockup_watchdogs(void); | 262 | extern void touch_all_softlockup_watchdogs(void); |
263 | #else | 263 | #else |
264 | static inline void softlockup_tick(void) | 264 | static inline void softlockup_tick(void) |
265 | { | 265 | { |
266 | } | 266 | } |
267 | static inline void spawn_softlockup_task(void) | 267 | static inline void spawn_softlockup_task(void) |
268 | { | 268 | { |
269 | } | 269 | } |
270 | static inline void touch_softlockup_watchdog(void) | 270 | static inline void touch_softlockup_watchdog(void) |
271 | { | 271 | { |
272 | } | 272 | } |
273 | static inline void touch_all_softlockup_watchdogs(void) | 273 | static inline void touch_all_softlockup_watchdogs(void) |
274 | { | 274 | { |
275 | } | 275 | } |
276 | #endif | 276 | #endif |
277 | 277 | ||
278 | 278 | ||
279 | /* Attach to any functions which should be ignored in wchan output. */ | 279 | /* Attach to any functions which should be ignored in wchan output. */ |
280 | #define __sched __attribute__((__section__(".sched.text"))) | 280 | #define __sched __attribute__((__section__(".sched.text"))) |
281 | /* Is this address in the __sched functions? */ | 281 | /* Is this address in the __sched functions? */ |
282 | extern int in_sched_functions(unsigned long addr); | 282 | extern int in_sched_functions(unsigned long addr); |
283 | 283 | ||
284 | #define MAX_SCHEDULE_TIMEOUT LONG_MAX | 284 | #define MAX_SCHEDULE_TIMEOUT LONG_MAX |
285 | extern signed long FASTCALL(schedule_timeout(signed long timeout)); | 285 | extern signed long FASTCALL(schedule_timeout(signed long timeout)); |
286 | extern signed long schedule_timeout_interruptible(signed long timeout); | 286 | extern signed long schedule_timeout_interruptible(signed long timeout); |
287 | extern signed long schedule_timeout_uninterruptible(signed long timeout); | 287 | extern signed long schedule_timeout_uninterruptible(signed long timeout); |
288 | asmlinkage void schedule(void); | 288 | asmlinkage void schedule(void); |
289 | 289 | ||
290 | struct nsproxy; | 290 | struct nsproxy; |
291 | struct user_namespace; | 291 | struct user_namespace; |
292 | 292 | ||
293 | /* Maximum number of active map areas.. This is a random (large) number */ | 293 | /* Maximum number of active map areas.. This is a random (large) number */ |
294 | #define DEFAULT_MAX_MAP_COUNT 65536 | 294 | #define DEFAULT_MAX_MAP_COUNT 65536 |
295 | 295 | ||
296 | extern int sysctl_max_map_count; | 296 | extern int sysctl_max_map_count; |
297 | 297 | ||
298 | #include <linux/aio.h> | 298 | #include <linux/aio.h> |
299 | 299 | ||
300 | extern unsigned long | 300 | extern unsigned long |
301 | arch_get_unmapped_area(struct file *, unsigned long, unsigned long, | 301 | arch_get_unmapped_area(struct file *, unsigned long, unsigned long, |
302 | unsigned long, unsigned long); | 302 | unsigned long, unsigned long); |
303 | extern unsigned long | 303 | extern unsigned long |
304 | arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, | 304 | arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, |
305 | unsigned long len, unsigned long pgoff, | 305 | unsigned long len, unsigned long pgoff, |
306 | unsigned long flags); | 306 | unsigned long flags); |
307 | extern void arch_unmap_area(struct mm_struct *, unsigned long); | 307 | extern void arch_unmap_area(struct mm_struct *, unsigned long); |
308 | extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); | 308 | extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); |
309 | 309 | ||
310 | #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS | 310 | #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS |
311 | /* | 311 | /* |
312 | * The mm counters are not protected by its page_table_lock, | 312 | * The mm counters are not protected by its page_table_lock, |
313 | * so must be incremented atomically. | 313 | * so must be incremented atomically. |
314 | */ | 314 | */ |
315 | #define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value) | 315 | #define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value) |
316 | #define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member)) | 316 | #define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member)) |
317 | #define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member) | 317 | #define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member) |
318 | #define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member) | 318 | #define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member) |
319 | #define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member) | 319 | #define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member) |
320 | typedef atomic_long_t mm_counter_t; | 320 | typedef atomic_long_t mm_counter_t; |
321 | 321 | ||
322 | #else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ | 322 | #else /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ |
323 | /* | 323 | /* |
324 | * The mm counters are protected by its page_table_lock, | 324 | * The mm counters are protected by its page_table_lock, |
325 | * so can be incremented directly. | 325 | * so can be incremented directly. |
326 | */ | 326 | */ |
327 | #define set_mm_counter(mm, member, value) (mm)->_##member = (value) | 327 | #define set_mm_counter(mm, member, value) (mm)->_##member = (value) |
328 | #define get_mm_counter(mm, member) ((mm)->_##member) | 328 | #define get_mm_counter(mm, member) ((mm)->_##member) |
329 | #define add_mm_counter(mm, member, value) (mm)->_##member += (value) | 329 | #define add_mm_counter(mm, member, value) (mm)->_##member += (value) |
330 | #define inc_mm_counter(mm, member) (mm)->_##member++ | 330 | #define inc_mm_counter(mm, member) (mm)->_##member++ |
331 | #define dec_mm_counter(mm, member) (mm)->_##member-- | 331 | #define dec_mm_counter(mm, member) (mm)->_##member-- |
332 | typedef unsigned long mm_counter_t; | 332 | typedef unsigned long mm_counter_t; |
333 | 333 | ||
334 | #endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ | 334 | #endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */ |
335 | 335 | ||
336 | #define get_mm_rss(mm) \ | 336 | #define get_mm_rss(mm) \ |
337 | (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss)) | 337 | (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss)) |
338 | #define update_hiwater_rss(mm) do { \ | 338 | #define update_hiwater_rss(mm) do { \ |
339 | unsigned long _rss = get_mm_rss(mm); \ | 339 | unsigned long _rss = get_mm_rss(mm); \ |
340 | if ((mm)->hiwater_rss < _rss) \ | 340 | if ((mm)->hiwater_rss < _rss) \ |
341 | (mm)->hiwater_rss = _rss; \ | 341 | (mm)->hiwater_rss = _rss; \ |
342 | } while (0) | 342 | } while (0) |
343 | #define update_hiwater_vm(mm) do { \ | 343 | #define update_hiwater_vm(mm) do { \ |
344 | if ((mm)->hiwater_vm < (mm)->total_vm) \ | 344 | if ((mm)->hiwater_vm < (mm)->total_vm) \ |
345 | (mm)->hiwater_vm = (mm)->total_vm; \ | 345 | (mm)->hiwater_vm = (mm)->total_vm; \ |
346 | } while (0) | 346 | } while (0) |
347 | 347 | ||
348 | extern void set_dumpable(struct mm_struct *mm, int value); | 348 | extern void set_dumpable(struct mm_struct *mm, int value); |
349 | extern int get_dumpable(struct mm_struct *mm); | 349 | extern int get_dumpable(struct mm_struct *mm); |
350 | 350 | ||
351 | /* mm flags */ | 351 | /* mm flags */ |
352 | /* dumpable bits */ | 352 | /* dumpable bits */ |
353 | #define MMF_DUMPABLE 0 /* core dump is permitted */ | 353 | #define MMF_DUMPABLE 0 /* core dump is permitted */ |
354 | #define MMF_DUMP_SECURELY 1 /* core file is readable only by root */ | 354 | #define MMF_DUMP_SECURELY 1 /* core file is readable only by root */ |
355 | #define MMF_DUMPABLE_BITS 2 | 355 | #define MMF_DUMPABLE_BITS 2 |
356 | 356 | ||
357 | /* coredump filter bits */ | 357 | /* coredump filter bits */ |
358 | #define MMF_DUMP_ANON_PRIVATE 2 | 358 | #define MMF_DUMP_ANON_PRIVATE 2 |
359 | #define MMF_DUMP_ANON_SHARED 3 | 359 | #define MMF_DUMP_ANON_SHARED 3 |
360 | #define MMF_DUMP_MAPPED_PRIVATE 4 | 360 | #define MMF_DUMP_MAPPED_PRIVATE 4 |
361 | #define MMF_DUMP_MAPPED_SHARED 5 | 361 | #define MMF_DUMP_MAPPED_SHARED 5 |
362 | #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS | 362 | #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS |
363 | #define MMF_DUMP_FILTER_BITS 4 | 363 | #define MMF_DUMP_FILTER_BITS 4 |
364 | #define MMF_DUMP_FILTER_MASK \ | 364 | #define MMF_DUMP_FILTER_MASK \ |
365 | (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) | 365 | (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) |
366 | #define MMF_DUMP_FILTER_DEFAULT \ | 366 | #define MMF_DUMP_FILTER_DEFAULT \ |
367 | ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED)) | 367 | ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED)) |
368 | 368 | ||
369 | struct mm_struct { | 369 | struct mm_struct { |
370 | struct vm_area_struct * mmap; /* list of VMAs */ | 370 | struct vm_area_struct * mmap; /* list of VMAs */ |
371 | struct rb_root mm_rb; | 371 | struct rb_root mm_rb; |
372 | struct vm_area_struct * mmap_cache; /* last find_vma result */ | 372 | struct vm_area_struct * mmap_cache; /* last find_vma result */ |
373 | unsigned long (*get_unmapped_area) (struct file *filp, | 373 | unsigned long (*get_unmapped_area) (struct file *filp, |
374 | unsigned long addr, unsigned long len, | 374 | unsigned long addr, unsigned long len, |
375 | unsigned long pgoff, unsigned long flags); | 375 | unsigned long pgoff, unsigned long flags); |
376 | void (*unmap_area) (struct mm_struct *mm, unsigned long addr); | 376 | void (*unmap_area) (struct mm_struct *mm, unsigned long addr); |
377 | unsigned long mmap_base; /* base of mmap area */ | 377 | unsigned long mmap_base; /* base of mmap area */ |
378 | unsigned long task_size; /* size of task vm space */ | 378 | unsigned long task_size; /* size of task vm space */ |
379 | unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */ | 379 | unsigned long cached_hole_size; /* if non-zero, the largest hole below free_area_cache */ |
380 | unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */ | 380 | unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */ |
381 | pgd_t * pgd; | 381 | pgd_t * pgd; |
382 | atomic_t mm_users; /* How many users with user space? */ | 382 | atomic_t mm_users; /* How many users with user space? */ |
383 | atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ | 383 | atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ |
384 | int map_count; /* number of VMAs */ | 384 | int map_count; /* number of VMAs */ |
385 | struct rw_semaphore mmap_sem; | 385 | struct rw_semaphore mmap_sem; |
386 | spinlock_t page_table_lock; /* Protects page tables and some counters */ | 386 | spinlock_t page_table_lock; /* Protects page tables and some counters */ |
387 | 387 | ||
388 | struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung | 388 | struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung |
389 | * together off init_mm.mmlist, and are protected | 389 | * together off init_mm.mmlist, and are protected |
390 | * by mmlist_lock | 390 | * by mmlist_lock |
391 | */ | 391 | */ |
392 | 392 | ||
393 | /* Special counters, in some configurations protected by the | 393 | /* Special counters, in some configurations protected by the |
394 | * page_table_lock, in other configurations by being atomic. | 394 | * page_table_lock, in other configurations by being atomic. |
395 | */ | 395 | */ |
396 | mm_counter_t _file_rss; | 396 | mm_counter_t _file_rss; |
397 | mm_counter_t _anon_rss; | 397 | mm_counter_t _anon_rss; |
398 | 398 | ||
399 | unsigned long hiwater_rss; /* High-watermark of RSS usage */ | 399 | unsigned long hiwater_rss; /* High-watermark of RSS usage */ |
400 | unsigned long hiwater_vm; /* High-water virtual memory usage */ | 400 | unsigned long hiwater_vm; /* High-water virtual memory usage */ |
401 | 401 | ||
402 | unsigned long total_vm, locked_vm, shared_vm, exec_vm; | 402 | unsigned long total_vm, locked_vm, shared_vm, exec_vm; |
403 | unsigned long stack_vm, reserved_vm, def_flags, nr_ptes; | 403 | unsigned long stack_vm, reserved_vm, def_flags, nr_ptes; |
404 | unsigned long start_code, end_code, start_data, end_data; | 404 | unsigned long start_code, end_code, start_data, end_data; |
405 | unsigned long start_brk, brk, start_stack; | 405 | unsigned long start_brk, brk, start_stack; |
406 | unsigned long arg_start, arg_end, env_start, env_end; | 406 | unsigned long arg_start, arg_end, env_start, env_end; |
407 | 407 | ||
408 | unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ | 408 | unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ |
409 | 409 | ||
410 | cpumask_t cpu_vm_mask; | 410 | cpumask_t cpu_vm_mask; |
411 | 411 | ||
412 | /* Architecture-specific MM context */ | 412 | /* Architecture-specific MM context */ |
413 | mm_context_t context; | 413 | mm_context_t context; |
414 | 414 | ||
415 | /* Swap token stuff */ | 415 | /* Swap token stuff */ |
416 | /* | 416 | /* |
417 | * Last value of global fault stamp as seen by this process. | 417 | * Last value of global fault stamp as seen by this process. |
418 | * In other words, this value gives an indication of how long | 418 | * In other words, this value gives an indication of how long |
419 | * it has been since this task got the token. | 419 | * it has been since this task got the token. |
420 | * Look at mm/thrash.c | 420 | * Look at mm/thrash.c |
421 | */ | 421 | */ |
422 | unsigned int faultstamp; | 422 | unsigned int faultstamp; |
423 | unsigned int token_priority; | 423 | unsigned int token_priority; |
424 | unsigned int last_interval; | 424 | unsigned int last_interval; |
425 | 425 | ||
426 | unsigned long flags; /* Must use atomic bitops to access the bits */ | 426 | unsigned long flags; /* Must use atomic bitops to access the bits */ |
427 | 427 | ||
428 | /* coredumping support */ | 428 | /* coredumping support */ |
429 | int core_waiters; | 429 | int core_waiters; |
430 | struct completion *core_startup_done, core_done; | 430 | struct completion *core_startup_done, core_done; |
431 | 431 | ||
432 | /* aio bits */ | 432 | /* aio bits */ |
433 | rwlock_t ioctx_list_lock; | 433 | rwlock_t ioctx_list_lock; |
434 | struct kioctx *ioctx_list; | 434 | struct kioctx *ioctx_list; |
435 | }; | 435 | }; |
436 | 436 | ||
437 | struct sighand_struct { | 437 | struct sighand_struct { |
438 | atomic_t count; | 438 | atomic_t count; |
439 | struct k_sigaction action[_NSIG]; | 439 | struct k_sigaction action[_NSIG]; |
440 | spinlock_t siglock; | 440 | spinlock_t siglock; |
441 | struct list_head signalfd_list; | 441 | struct list_head signalfd_list; |
442 | }; | 442 | }; |
443 | 443 | ||
444 | struct pacct_struct { | 444 | struct pacct_struct { |
445 | int ac_flag; | 445 | int ac_flag; |
446 | long ac_exitcode; | 446 | long ac_exitcode; |
447 | unsigned long ac_mem; | 447 | unsigned long ac_mem; |
448 | cputime_t ac_utime, ac_stime; | 448 | cputime_t ac_utime, ac_stime; |
449 | unsigned long ac_minflt, ac_majflt; | 449 | unsigned long ac_minflt, ac_majflt; |
450 | }; | 450 | }; |
451 | 451 | ||
452 | /* | 452 | /* |
453 | * NOTE! "signal_struct" does not have it's own | 453 | * NOTE! "signal_struct" does not have it's own |
454 | * locking, because a shared signal_struct always | 454 | * locking, because a shared signal_struct always |
455 | * implies a shared sighand_struct, so locking | 455 | * implies a shared sighand_struct, so locking |
456 | * sighand_struct is always a proper superset of | 456 | * sighand_struct is always a proper superset of |
457 | * the locking of signal_struct. | 457 | * the locking of signal_struct. |
458 | */ | 458 | */ |
459 | struct signal_struct { | 459 | struct signal_struct { |
460 | atomic_t count; | 460 | atomic_t count; |
461 | atomic_t live; | 461 | atomic_t live; |
462 | 462 | ||
463 | wait_queue_head_t wait_chldexit; /* for wait4() */ | 463 | wait_queue_head_t wait_chldexit; /* for wait4() */ |
464 | 464 | ||
465 | /* current thread group signal load-balancing target: */ | 465 | /* current thread group signal load-balancing target: */ |
466 | struct task_struct *curr_target; | 466 | struct task_struct *curr_target; |
467 | 467 | ||
468 | /* shared signal handling: */ | 468 | /* shared signal handling: */ |
469 | struct sigpending shared_pending; | 469 | struct sigpending shared_pending; |
470 | 470 | ||
471 | /* thread group exit support */ | 471 | /* thread group exit support */ |
472 | int group_exit_code; | 472 | int group_exit_code; |
473 | /* overloaded: | 473 | /* overloaded: |
474 | * - notify group_exit_task when ->count is equal to notify_count | 474 | * - notify group_exit_task when ->count is equal to notify_count |
475 | * - everyone except group_exit_task is stopped during signal delivery | 475 | * - everyone except group_exit_task is stopped during signal delivery |
476 | * of fatal signals, group_exit_task processes the signal. | 476 | * of fatal signals, group_exit_task processes the signal. |
477 | */ | 477 | */ |
478 | struct task_struct *group_exit_task; | 478 | struct task_struct *group_exit_task; |
479 | int notify_count; | 479 | int notify_count; |
480 | 480 | ||
481 | /* thread group stop support, overloads group_exit_code too */ | 481 | /* thread group stop support, overloads group_exit_code too */ |
482 | int group_stop_count; | 482 | int group_stop_count; |
483 | unsigned int flags; /* see SIGNAL_* flags below */ | 483 | unsigned int flags; /* see SIGNAL_* flags below */ |
484 | 484 | ||
485 | /* POSIX.1b Interval Timers */ | 485 | /* POSIX.1b Interval Timers */ |
486 | struct list_head posix_timers; | 486 | struct list_head posix_timers; |
487 | 487 | ||
488 | /* ITIMER_REAL timer for the process */ | 488 | /* ITIMER_REAL timer for the process */ |
489 | struct hrtimer real_timer; | 489 | struct hrtimer real_timer; |
490 | struct task_struct *tsk; | 490 | struct task_struct *tsk; |
491 | ktime_t it_real_incr; | 491 | ktime_t it_real_incr; |
492 | 492 | ||
493 | /* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */ | 493 | /* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */ |
494 | cputime_t it_prof_expires, it_virt_expires; | 494 | cputime_t it_prof_expires, it_virt_expires; |
495 | cputime_t it_prof_incr, it_virt_incr; | 495 | cputime_t it_prof_incr, it_virt_incr; |
496 | 496 | ||
497 | /* job control IDs */ | 497 | /* job control IDs */ |
498 | pid_t pgrp; | 498 | pid_t pgrp; |
499 | struct pid *tty_old_pgrp; | 499 | struct pid *tty_old_pgrp; |
500 | 500 | ||
501 | union { | 501 | union { |
502 | pid_t session __deprecated; | 502 | pid_t session __deprecated; |
503 | pid_t __session; | 503 | pid_t __session; |
504 | }; | 504 | }; |
505 | 505 | ||
506 | /* boolean value for session group leader */ | 506 | /* boolean value for session group leader */ |
507 | int leader; | 507 | int leader; |
508 | 508 | ||
509 | struct tty_struct *tty; /* NULL if no tty */ | 509 | struct tty_struct *tty; /* NULL if no tty */ |
510 | 510 | ||
511 | /* | 511 | /* |
512 | * Cumulative resource counters for dead threads in the group, | 512 | * Cumulative resource counters for dead threads in the group, |
513 | * and for reaped dead child processes forked by this group. | 513 | * and for reaped dead child processes forked by this group. |
514 | * Live threads maintain their own counters and add to these | 514 | * Live threads maintain their own counters and add to these |
515 | * in __exit_signal, except for the group leader. | 515 | * in __exit_signal, except for the group leader. |
516 | */ | 516 | */ |
517 | cputime_t utime, stime, cutime, cstime; | 517 | cputime_t utime, stime, cutime, cstime; |
518 | unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; | 518 | unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; |
519 | unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; | 519 | unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; |
520 | unsigned long inblock, oublock, cinblock, coublock; | 520 | unsigned long inblock, oublock, cinblock, coublock; |
521 | 521 | ||
522 | /* | 522 | /* |
523 | * Cumulative ns of scheduled CPU time for dead threads in the | 523 | * Cumulative ns of scheduled CPU time for dead threads in the |
524 | * group, not including a zombie group leader. (This only differs | 524 | * group, not including a zombie group leader. (This only differs |
525 | * from jiffies_to_ns(utime + stime) if sched_clock uses something | 525 | * from jiffies_to_ns(utime + stime) if sched_clock uses something |
526 | * other than jiffies.) | 526 | * other than jiffies.) |
527 | */ | 527 | */ |
528 | unsigned long long sum_sched_runtime; | 528 | unsigned long long sum_sched_runtime; |
529 | 529 | ||
530 | /* | 530 | /* |
531 | * We don't bother to synchronize most readers of this at all, | 531 | * We don't bother to synchronize most readers of this at all, |
532 | * because there is no reader checking a limit that actually needs | 532 | * because there is no reader checking a limit that actually needs |
533 | * to get both rlim_cur and rlim_max atomically, and either one | 533 | * to get both rlim_cur and rlim_max atomically, and either one |
534 | * alone is a single word that can safely be read normally. | 534 | * alone is a single word that can safely be read normally. |
535 | * getrlimit/setrlimit use task_lock(current->group_leader) to | 535 | * getrlimit/setrlimit use task_lock(current->group_leader) to |
536 | * protect this instead of the siglock, because they really | 536 | * protect this instead of the siglock, because they really |
537 | * have no need to disable irqs. | 537 | * have no need to disable irqs. |
538 | */ | 538 | */ |
539 | struct rlimit rlim[RLIM_NLIMITS]; | 539 | struct rlimit rlim[RLIM_NLIMITS]; |
540 | 540 | ||
541 | struct list_head cpu_timers[3]; | 541 | struct list_head cpu_timers[3]; |
542 | 542 | ||
543 | /* keep the process-shared keyrings here so that they do the right | 543 | /* keep the process-shared keyrings here so that they do the right |
544 | * thing in threads created with CLONE_THREAD */ | 544 | * thing in threads created with CLONE_THREAD */ |
545 | #ifdef CONFIG_KEYS | 545 | #ifdef CONFIG_KEYS |
546 | struct key *session_keyring; /* keyring inherited over fork */ | 546 | struct key *session_keyring; /* keyring inherited over fork */ |
547 | struct key *process_keyring; /* keyring private to this process */ | 547 | struct key *process_keyring; /* keyring private to this process */ |
548 | #endif | 548 | #endif |
549 | #ifdef CONFIG_BSD_PROCESS_ACCT | 549 | #ifdef CONFIG_BSD_PROCESS_ACCT |
550 | struct pacct_struct pacct; /* per-process accounting information */ | 550 | struct pacct_struct pacct; /* per-process accounting information */ |
551 | #endif | 551 | #endif |
552 | #ifdef CONFIG_TASKSTATS | 552 | #ifdef CONFIG_TASKSTATS |
553 | struct taskstats *stats; | 553 | struct taskstats *stats; |
554 | #endif | 554 | #endif |
555 | #ifdef CONFIG_AUDIT | 555 | #ifdef CONFIG_AUDIT |
556 | unsigned audit_tty; | 556 | unsigned audit_tty; |
557 | struct tty_audit_buf *tty_audit_buf; | 557 | struct tty_audit_buf *tty_audit_buf; |
558 | #endif | 558 | #endif |
559 | }; | 559 | }; |
560 | 560 | ||
561 | /* Context switch must be unlocked if interrupts are to be enabled */ | 561 | /* Context switch must be unlocked if interrupts are to be enabled */ |
562 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 562 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
563 | # define __ARCH_WANT_UNLOCKED_CTXSW | 563 | # define __ARCH_WANT_UNLOCKED_CTXSW |
564 | #endif | 564 | #endif |
565 | 565 | ||
566 | /* | 566 | /* |
567 | * Bits in flags field of signal_struct. | 567 | * Bits in flags field of signal_struct. |
568 | */ | 568 | */ |
569 | #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ | 569 | #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ |
570 | #define SIGNAL_STOP_DEQUEUED 0x00000002 /* stop signal dequeued */ | 570 | #define SIGNAL_STOP_DEQUEUED 0x00000002 /* stop signal dequeued */ |
571 | #define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */ | 571 | #define SIGNAL_STOP_CONTINUED 0x00000004 /* SIGCONT since WCONTINUED reap */ |
572 | #define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */ | 572 | #define SIGNAL_GROUP_EXIT 0x00000008 /* group exit in progress */ |
573 | 573 | ||
574 | /* | 574 | /* |
575 | * Some day this will be a full-fledged user tracking system.. | 575 | * Some day this will be a full-fledged user tracking system.. |
576 | */ | 576 | */ |
577 | struct user_struct { | 577 | struct user_struct { |
578 | atomic_t __count; /* reference count */ | 578 | atomic_t __count; /* reference count */ |
579 | atomic_t processes; /* How many processes does this user have? */ | 579 | atomic_t processes; /* How many processes does this user have? */ |
580 | atomic_t files; /* How many open files does this user have? */ | 580 | atomic_t files; /* How many open files does this user have? */ |
581 | atomic_t sigpending; /* How many pending signals does this user have? */ | 581 | atomic_t sigpending; /* How many pending signals does this user have? */ |
582 | #ifdef CONFIG_INOTIFY_USER | 582 | #ifdef CONFIG_INOTIFY_USER |
583 | atomic_t inotify_watches; /* How many inotify watches does this user have? */ | 583 | atomic_t inotify_watches; /* How many inotify watches does this user have? */ |
584 | atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ | 584 | atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ |
585 | #endif | 585 | #endif |
586 | /* protected by mq_lock */ | 586 | /* protected by mq_lock */ |
587 | unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ | 587 | unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ |
588 | unsigned long locked_shm; /* How many pages of mlocked shm ? */ | 588 | unsigned long locked_shm; /* How many pages of mlocked shm ? */ |
589 | 589 | ||
590 | #ifdef CONFIG_KEYS | 590 | #ifdef CONFIG_KEYS |
591 | struct key *uid_keyring; /* UID specific keyring */ | 591 | struct key *uid_keyring; /* UID specific keyring */ |
592 | struct key *session_keyring; /* UID's default session keyring */ | 592 | struct key *session_keyring; /* UID's default session keyring */ |
593 | #endif | 593 | #endif |
594 | 594 | ||
595 | /* Hash table maintenance information */ | 595 | /* Hash table maintenance information */ |
596 | struct list_head uidhash_list; | 596 | struct list_head uidhash_list; |
597 | uid_t uid; | 597 | uid_t uid; |
598 | }; | 598 | }; |
599 | 599 | ||
600 | extern struct user_struct *find_user(uid_t); | 600 | extern struct user_struct *find_user(uid_t); |
601 | 601 | ||
602 | extern struct user_struct root_user; | 602 | extern struct user_struct root_user; |
603 | #define INIT_USER (&root_user) | 603 | #define INIT_USER (&root_user) |
604 | 604 | ||
605 | struct backing_dev_info; | 605 | struct backing_dev_info; |
606 | struct reclaim_state; | 606 | struct reclaim_state; |
607 | 607 | ||
608 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 608 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
609 | struct sched_info { | 609 | struct sched_info { |
610 | /* cumulative counters */ | 610 | /* cumulative counters */ |
611 | unsigned long pcnt; /* # of times run on this cpu */ | 611 | unsigned long pcnt; /* # of times run on this cpu */ |
612 | unsigned long long cpu_time, /* time spent on the cpu */ | 612 | unsigned long long cpu_time, /* time spent on the cpu */ |
613 | run_delay; /* time spent waiting on a runqueue */ | 613 | run_delay; /* time spent waiting on a runqueue */ |
614 | 614 | ||
615 | /* timestamps */ | 615 | /* timestamps */ |
616 | unsigned long long last_arrival,/* when we last ran on a cpu */ | 616 | unsigned long long last_arrival,/* when we last ran on a cpu */ |
617 | last_queued; /* when we were last queued to run */ | 617 | last_queued; /* when we were last queued to run */ |
618 | }; | 618 | }; |
619 | #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ | 619 | #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ |
620 | 620 | ||
621 | #ifdef CONFIG_SCHEDSTATS | 621 | #ifdef CONFIG_SCHEDSTATS |
622 | extern const struct file_operations proc_schedstat_operations; | 622 | extern const struct file_operations proc_schedstat_operations; |
623 | #endif /* CONFIG_SCHEDSTATS */ | 623 | #endif /* CONFIG_SCHEDSTATS */ |
624 | 624 | ||
625 | #ifdef CONFIG_TASK_DELAY_ACCT | 625 | #ifdef CONFIG_TASK_DELAY_ACCT |
626 | struct task_delay_info { | 626 | struct task_delay_info { |
627 | spinlock_t lock; | 627 | spinlock_t lock; |
628 | unsigned int flags; /* Private per-task flags */ | 628 | unsigned int flags; /* Private per-task flags */ |
629 | 629 | ||
630 | /* For each stat XXX, add following, aligned appropriately | 630 | /* For each stat XXX, add following, aligned appropriately |
631 | * | 631 | * |
632 | * struct timespec XXX_start, XXX_end; | 632 | * struct timespec XXX_start, XXX_end; |
633 | * u64 XXX_delay; | 633 | * u64 XXX_delay; |
634 | * u32 XXX_count; | 634 | * u32 XXX_count; |
635 | * | 635 | * |
636 | * Atomicity of updates to XXX_delay, XXX_count protected by | 636 | * Atomicity of updates to XXX_delay, XXX_count protected by |
637 | * single lock above (split into XXX_lock if contention is an issue). | 637 | * single lock above (split into XXX_lock if contention is an issue). |
638 | */ | 638 | */ |
639 | 639 | ||
640 | /* | 640 | /* |
641 | * XXX_count is incremented on every XXX operation, the delay | 641 | * XXX_count is incremented on every XXX operation, the delay |
642 | * associated with the operation is added to XXX_delay. | 642 | * associated with the operation is added to XXX_delay. |
643 | * XXX_delay contains the accumulated delay time in nanoseconds. | 643 | * XXX_delay contains the accumulated delay time in nanoseconds. |
644 | */ | 644 | */ |
645 | struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */ | 645 | struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */ |
646 | u64 blkio_delay; /* wait for sync block io completion */ | 646 | u64 blkio_delay; /* wait for sync block io completion */ |
647 | u64 swapin_delay; /* wait for swapin block io completion */ | 647 | u64 swapin_delay; /* wait for swapin block io completion */ |
648 | u32 blkio_count; /* total count of the number of sync block */ | 648 | u32 blkio_count; /* total count of the number of sync block */ |
649 | /* io operations performed */ | 649 | /* io operations performed */ |
650 | u32 swapin_count; /* total count of the number of swapin block */ | 650 | u32 swapin_count; /* total count of the number of swapin block */ |
651 | /* io operations performed */ | 651 | /* io operations performed */ |
652 | }; | 652 | }; |
653 | #endif /* CONFIG_TASK_DELAY_ACCT */ | 653 | #endif /* CONFIG_TASK_DELAY_ACCT */ |
654 | 654 | ||
655 | static inline int sched_info_on(void) | 655 | static inline int sched_info_on(void) |
656 | { | 656 | { |
657 | #ifdef CONFIG_SCHEDSTATS | 657 | #ifdef CONFIG_SCHEDSTATS |
658 | return 1; | 658 | return 1; |
659 | #elif defined(CONFIG_TASK_DELAY_ACCT) | 659 | #elif defined(CONFIG_TASK_DELAY_ACCT) |
660 | extern int delayacct_on; | 660 | extern int delayacct_on; |
661 | return delayacct_on; | 661 | return delayacct_on; |
662 | #else | 662 | #else |
663 | return 0; | 663 | return 0; |
664 | #endif | 664 | #endif |
665 | } | 665 | } |
666 | 666 | ||
667 | enum cpu_idle_type { | 667 | enum cpu_idle_type { |
668 | CPU_IDLE, | 668 | CPU_IDLE, |
669 | CPU_NOT_IDLE, | 669 | CPU_NOT_IDLE, |
670 | CPU_NEWLY_IDLE, | 670 | CPU_NEWLY_IDLE, |
671 | CPU_MAX_IDLE_TYPES | 671 | CPU_MAX_IDLE_TYPES |
672 | }; | 672 | }; |
673 | 673 | ||
674 | /* | 674 | /* |
675 | * sched-domains (multiprocessor balancing) declarations: | 675 | * sched-domains (multiprocessor balancing) declarations: |
676 | */ | 676 | */ |
677 | 677 | ||
678 | /* | 678 | /* |
679 | * Increase resolution of nice-level calculations: | 679 | * Increase resolution of nice-level calculations: |
680 | */ | 680 | */ |
681 | #define SCHED_LOAD_SHIFT 10 | 681 | #define SCHED_LOAD_SHIFT 10 |
682 | #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) | 682 | #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) |
683 | 683 | ||
684 | #define SCHED_LOAD_SCALE_FUZZ (SCHED_LOAD_SCALE >> 1) | 684 | #define SCHED_LOAD_SCALE_FUZZ (SCHED_LOAD_SCALE >> 1) |
685 | 685 | ||
686 | #ifdef CONFIG_SMP | 686 | #ifdef CONFIG_SMP |
687 | #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ | 687 | #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ |
688 | #define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ | 688 | #define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ |
689 | #define SD_BALANCE_EXEC 4 /* Balance on exec */ | 689 | #define SD_BALANCE_EXEC 4 /* Balance on exec */ |
690 | #define SD_BALANCE_FORK 8 /* Balance on fork, clone */ | 690 | #define SD_BALANCE_FORK 8 /* Balance on fork, clone */ |
691 | #define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */ | 691 | #define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */ |
692 | #define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */ | 692 | #define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */ |
693 | #define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */ | 693 | #define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */ |
694 | #define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */ | 694 | #define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */ |
695 | #define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */ | 695 | #define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */ |
696 | #define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */ | 696 | #define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */ |
697 | #define SD_SERIALIZE 1024 /* Only a single load balancing instance */ | 697 | #define SD_SERIALIZE 1024 /* Only a single load balancing instance */ |
698 | 698 | ||
699 | #define BALANCE_FOR_MC_POWER \ | 699 | #define BALANCE_FOR_MC_POWER \ |
700 | (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0) | 700 | (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0) |
701 | 701 | ||
702 | #define BALANCE_FOR_PKG_POWER \ | 702 | #define BALANCE_FOR_PKG_POWER \ |
703 | ((sched_mc_power_savings || sched_smt_power_savings) ? \ | 703 | ((sched_mc_power_savings || sched_smt_power_savings) ? \ |
704 | SD_POWERSAVINGS_BALANCE : 0) | 704 | SD_POWERSAVINGS_BALANCE : 0) |
705 | 705 | ||
706 | #define test_sd_parent(sd, flag) ((sd->parent && \ | 706 | #define test_sd_parent(sd, flag) ((sd->parent && \ |
707 | (sd->parent->flags & flag)) ? 1 : 0) | 707 | (sd->parent->flags & flag)) ? 1 : 0) |
708 | 708 | ||
709 | 709 | ||
710 | struct sched_group { | 710 | struct sched_group { |
711 | struct sched_group *next; /* Must be a circular list */ | 711 | struct sched_group *next; /* Must be a circular list */ |
712 | cpumask_t cpumask; | 712 | cpumask_t cpumask; |
713 | 713 | ||
714 | /* | 714 | /* |
715 | * CPU power of this group, SCHED_LOAD_SCALE being max power for a | 715 | * CPU power of this group, SCHED_LOAD_SCALE being max power for a |
716 | * single CPU. This is read only (except for setup, hotplug CPU). | 716 | * single CPU. This is read only (except for setup, hotplug CPU). |
717 | * Note: Never change cpu_power without recomputing its reciprocal | 717 | * Note: Never change cpu_power without recomputing its reciprocal |
718 | */ | 718 | */ |
719 | unsigned int __cpu_power; | 719 | unsigned int __cpu_power; |
720 | /* | 720 | /* |
721 | * reciprocal value of cpu_power to avoid expensive divides | 721 | * reciprocal value of cpu_power to avoid expensive divides |
722 | * (see include/linux/reciprocal_div.h) | 722 | * (see include/linux/reciprocal_div.h) |
723 | */ | 723 | */ |
724 | u32 reciprocal_cpu_power; | 724 | u32 reciprocal_cpu_power; |
725 | }; | 725 | }; |
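For context on the reciprocal_cpu_power field above: the reciprocal trick replaces a division by __cpu_power with one multiply and one shift. A minimal standalone sketch of the idea (the kernel's real helpers live in include/linux/reciprocal_div.h and may differ in detail):

#include <stdint.h>
#include <stdio.h>

/* Precompute once, i.e. every time the divisor (__cpu_power) changes --
 * the rule the comment above insists on. Roughly ceil(2^32 / d). */
static uint32_t reciprocal_value(uint32_t d)
{
        return (uint32_t)(0xffffffffULL / d) + 1;
}

/* a / d performed as one multiply and one shift - no divide instruction. */
static uint32_t reciprocal_divide(uint32_t a, uint32_t r)
{
        return (uint32_t)(((uint64_t)a * r) >> 32);
}

int main(void)
{
        uint32_t power = 1178;                  /* an arbitrary cpu power value */
        uint32_t recip = reciprocal_value(power);

        printf("%u\n", reciprocal_divide(4096, recip)); /* prints 3 (= 4096/1178) */
        return 0;
}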
726 | 726 | ||
727 | struct sched_domain { | 727 | struct sched_domain { |
728 | /* These fields must be setup */ | 728 | /* These fields must be setup */ |
729 | struct sched_domain *parent; /* top domain must be null terminated */ | 729 | struct sched_domain *parent; /* top domain must be null terminated */ |
730 | struct sched_domain *child; /* bottom domain must be null terminated */ | 730 | struct sched_domain *child; /* bottom domain must be null terminated */ |
731 | struct sched_group *groups; /* the balancing groups of the domain */ | 731 | struct sched_group *groups; /* the balancing groups of the domain */ |
732 | cpumask_t span; /* span of all CPUs in this domain */ | 732 | cpumask_t span; /* span of all CPUs in this domain */ |
733 | unsigned long min_interval; /* Minimum balance interval ms */ | 733 | unsigned long min_interval; /* Minimum balance interval ms */ |
734 | unsigned long max_interval; /* Maximum balance interval ms */ | 734 | unsigned long max_interval; /* Maximum balance interval ms */ |
735 | unsigned int busy_factor; /* less balancing by factor if busy */ | 735 | unsigned int busy_factor; /* less balancing by factor if busy */ |
736 | unsigned int imbalance_pct; /* No balance until over watermark */ | 736 | unsigned int imbalance_pct; /* No balance until over watermark */ |
737 | unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ | 737 | unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ |
738 | unsigned int busy_idx; | 738 | unsigned int busy_idx; |
739 | unsigned int idle_idx; | 739 | unsigned int idle_idx; |
740 | unsigned int newidle_idx; | 740 | unsigned int newidle_idx; |
741 | unsigned int wake_idx; | 741 | unsigned int wake_idx; |
742 | unsigned int forkexec_idx; | 742 | unsigned int forkexec_idx; |
743 | int flags; /* See SD_* */ | 743 | int flags; /* See SD_* */ |
744 | 744 | ||
745 | /* Runtime fields. */ | 745 | /* Runtime fields. */ |
746 | unsigned long last_balance; /* init to jiffies. units in jiffies */ | 746 | unsigned long last_balance; /* init to jiffies. units in jiffies */ |
747 | unsigned int balance_interval; /* initialise to 1. units in ms. */ | 747 | unsigned int balance_interval; /* initialise to 1. units in ms. */ |
748 | unsigned int nr_balance_failed; /* initialise to 0 */ | 748 | unsigned int nr_balance_failed; /* initialise to 0 */ |
749 | 749 | ||
750 | #ifdef CONFIG_SCHEDSTATS | 750 | #ifdef CONFIG_SCHEDSTATS |
751 | /* load_balance() stats */ | 751 | /* load_balance() stats */ |
752 | unsigned long lb_cnt[CPU_MAX_IDLE_TYPES]; | 752 | unsigned long lb_cnt[CPU_MAX_IDLE_TYPES]; |
753 | unsigned long lb_failed[CPU_MAX_IDLE_TYPES]; | 753 | unsigned long lb_failed[CPU_MAX_IDLE_TYPES]; |
754 | unsigned long lb_balanced[CPU_MAX_IDLE_TYPES]; | 754 | unsigned long lb_balanced[CPU_MAX_IDLE_TYPES]; |
755 | unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES]; | 755 | unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES]; |
756 | unsigned long lb_gained[CPU_MAX_IDLE_TYPES]; | 756 | unsigned long lb_gained[CPU_MAX_IDLE_TYPES]; |
757 | unsigned long lb_hot_gained[CPU_MAX_IDLE_TYPES]; | 757 | unsigned long lb_hot_gained[CPU_MAX_IDLE_TYPES]; |
758 | unsigned long lb_nobusyg[CPU_MAX_IDLE_TYPES]; | 758 | unsigned long lb_nobusyg[CPU_MAX_IDLE_TYPES]; |
759 | unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES]; | 759 | unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES]; |
760 | 760 | ||
761 | /* Active load balancing */ | 761 | /* Active load balancing */ |
762 | unsigned long alb_cnt; | 762 | unsigned long alb_cnt; |
763 | unsigned long alb_failed; | 763 | unsigned long alb_failed; |
764 | unsigned long alb_pushed; | 764 | unsigned long alb_pushed; |
765 | 765 | ||
766 | /* SD_BALANCE_EXEC stats */ | 766 | /* SD_BALANCE_EXEC stats */ |
767 | unsigned long sbe_cnt; | 767 | unsigned long sbe_cnt; |
768 | unsigned long sbe_balanced; | 768 | unsigned long sbe_balanced; |
769 | unsigned long sbe_pushed; | 769 | unsigned long sbe_pushed; |
770 | 770 | ||
771 | /* SD_BALANCE_FORK stats */ | 771 | /* SD_BALANCE_FORK stats */ |
772 | unsigned long sbf_cnt; | 772 | unsigned long sbf_cnt; |
773 | unsigned long sbf_balanced; | 773 | unsigned long sbf_balanced; |
774 | unsigned long sbf_pushed; | 774 | unsigned long sbf_pushed; |
775 | 775 | ||
776 | /* try_to_wake_up() stats */ | 776 | /* try_to_wake_up() stats */ |
777 | unsigned long ttwu_wake_remote; | 777 | unsigned long ttwu_wake_remote; |
778 | unsigned long ttwu_move_affine; | 778 | unsigned long ttwu_move_affine; |
779 | unsigned long ttwu_move_balance; | 779 | unsigned long ttwu_move_balance; |
780 | #endif | 780 | #endif |
781 | }; | 781 | }; |
782 | 782 | ||
783 | extern int partition_sched_domains(cpumask_t *partition1, | 783 | extern int partition_sched_domains(cpumask_t *partition1, |
784 | cpumask_t *partition2); | 784 | cpumask_t *partition2); |
785 | 785 | ||
786 | #endif /* CONFIG_SMP */ | 786 | #endif /* CONFIG_SMP */ |
787 | 787 | ||
788 | /* | 788 | /* |
789 | * A runqueue laden with a single nice 0 task scores a weighted_cpuload of | 789 | * A runqueue laden with a single nice 0 task scores a weighted_cpuload of |
790 | * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a | 790 | * SCHED_LOAD_SCALE. This function returns 1 if any cpu is laden with a |
791 | * task of nice 0 or enough lower-priority tasks to bring up the | 791 | * task of nice 0 or enough lower-priority tasks to bring up the |
792 | * weighted_cpuload. | 792 | * weighted_cpuload. |
793 | */ | 793 | */ |
794 | static inline int above_background_load(void) | 794 | static inline int above_background_load(void) |
795 | { | 795 | { |
796 | unsigned long cpu; | 796 | unsigned long cpu; |
797 | 797 | ||
798 | for_each_online_cpu(cpu) { | 798 | for_each_online_cpu(cpu) { |
799 | if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE) | 799 | if (weighted_cpuload(cpu) >= SCHED_LOAD_SCALE) |
800 | return 1; | 800 | return 1; |
801 | } | 801 | } |
802 | return 0; | 802 | return 0; |
803 | } | 803 | } |
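Concrete numbers for the check above (a trivial standalone sketch; the values are copied from the SCHED_LOAD_SHIFT/SCHED_LOAD_SCALE defines earlier in this header): a single nice-0 task contributes a weight of exactly SCHED_LOAD_SCALE, so one such task on any online CPU is enough for above_background_load() to return 1.

#include <assert.h>

#define SCHED_LOAD_SHIFT        10
#define SCHED_LOAD_SCALE        (1L << SCHED_LOAD_SHIFT)

int main(void)
{
        assert(SCHED_LOAD_SCALE == 1024);       /* the threshold used above */
        assert((SCHED_LOAD_SCALE >> 1) == 512); /* SCHED_LOAD_SCALE_FUZZ */
        return 0;
}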
804 | 804 | ||
805 | struct io_context; /* See blkdev.h */ | 805 | struct io_context; /* See blkdev.h */ |
806 | struct cpuset; | 806 | struct cpuset; |
807 | 807 | ||
808 | #define NGROUPS_SMALL 32 | 808 | #define NGROUPS_SMALL 32 |
809 | #define NGROUPS_PER_BLOCK ((int)(PAGE_SIZE / sizeof(gid_t))) | 809 | #define NGROUPS_PER_BLOCK ((int)(PAGE_SIZE / sizeof(gid_t))) |
810 | struct group_info { | 810 | struct group_info { |
811 | int ngroups; | 811 | int ngroups; |
812 | atomic_t usage; | 812 | atomic_t usage; |
813 | gid_t small_block[NGROUPS_SMALL]; | 813 | gid_t small_block[NGROUPS_SMALL]; |
814 | int nblocks; | 814 | int nblocks; |
815 | gid_t *blocks[0]; | 815 | gid_t *blocks[0]; |
816 | }; | 816 | }; |
817 | 817 | ||
818 | /* | 818 | /* |
819 | * get_group_info() must be called with the owning task locked (via task_lock()) | 819 | * get_group_info() must be called with the owning task locked (via task_lock()) |
820 | * when task != current. The reason being that the vast majority of callers are | 820 | * when task != current. The reason being that the vast majority of callers are |
821 | * looking at current->group_info, which can not be changed except by the | 821 | * looking at current->group_info, which can not be changed except by the |
822 | * current task. Changing current->group_info requires the task lock, too. | 822 | * current task. Changing current->group_info requires the task lock, too. |
823 | */ | 823 | */ |
824 | #define get_group_info(group_info) do { \ | 824 | #define get_group_info(group_info) do { \ |
825 | atomic_inc(&(group_info)->usage); \ | 825 | atomic_inc(&(group_info)->usage); \ |
826 | } while (0) | 826 | } while (0) |
827 | 827 | ||
828 | #define put_group_info(group_info) do { \ | 828 | #define put_group_info(group_info) do { \ |
829 | if (atomic_dec_and_test(&(group_info)->usage)) \ | 829 | if (atomic_dec_and_test(&(group_info)->usage)) \ |
830 | groups_free(group_info); \ | 830 | groups_free(group_info); \ |
831 | } while (0) | 831 | } while (0) |
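As a usage illustration of the locking rule described above, here is a hypothetical fragment (kernel context assumed; tsk, some_gid and do_something() are made-up names, error handling omitted) that pins another task's group_info before inspecting it:

struct group_info *gi;

task_lock(tsk);                 /* needed because tsk != current */
gi = tsk->group_info;
get_group_info(gi);             /* take a reference while the lock is held */
task_unlock(tsk);

if (groups_search(gi, some_gid))        /* or walk it via GROUP_AT(gi, i) */
        do_something();

put_group_info(gi);             /* may call groups_free() on the last reference */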
832 | 832 | ||
833 | extern struct group_info *groups_alloc(int gidsetsize); | 833 | extern struct group_info *groups_alloc(int gidsetsize); |
834 | extern void groups_free(struct group_info *group_info); | 834 | extern void groups_free(struct group_info *group_info); |
835 | extern int set_current_groups(struct group_info *group_info); | 835 | extern int set_current_groups(struct group_info *group_info); |
836 | extern int groups_search(struct group_info *group_info, gid_t grp); | 836 | extern int groups_search(struct group_info *group_info, gid_t grp); |
837 | /* access the groups "array" with this macro */ | 837 | /* access the groups "array" with this macro */ |
838 | #define GROUP_AT(gi, i) \ | 838 | #define GROUP_AT(gi, i) \ |
839 | ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK]) | 839 | ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK]) |
840 | 840 | ||
841 | #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK | 841 | #ifdef ARCH_HAS_PREFETCH_SWITCH_STACK |
842 | extern void prefetch_stack(struct task_struct *t); | 842 | extern void prefetch_stack(struct task_struct *t); |
843 | #else | 843 | #else |
844 | static inline void prefetch_stack(struct task_struct *t) { } | 844 | static inline void prefetch_stack(struct task_struct *t) { } |
845 | #endif | 845 | #endif |
846 | 846 | ||
847 | struct audit_context; /* See audit.c */ | 847 | struct audit_context; /* See audit.c */ |
848 | struct mempolicy; | 848 | struct mempolicy; |
849 | struct pipe_inode_info; | 849 | struct pipe_inode_info; |
850 | struct uts_namespace; | 850 | struct uts_namespace; |
851 | 851 | ||
852 | struct rq; | 852 | struct rq; |
853 | struct sched_domain; | 853 | struct sched_domain; |
854 | 854 | ||
855 | struct sched_class { | 855 | struct sched_class { |
856 | struct sched_class *next; | 856 | struct sched_class *next; |
857 | 857 | ||
858 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, | 858 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, |
859 | int wakeup, u64 now); | 859 | int wakeup, u64 now); |
860 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, | 860 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, |
861 | int sleep, u64 now); | 861 | int sleep, u64 now); |
862 | void (*yield_task) (struct rq *rq, struct task_struct *p); | 862 | void (*yield_task) (struct rq *rq, struct task_struct *p); |
863 | 863 | ||
864 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); | 864 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p); |
865 | 865 | ||
866 | struct task_struct * (*pick_next_task) (struct rq *rq, u64 now); | 866 | struct task_struct * (*pick_next_task) (struct rq *rq, u64 now); |
867 | void (*put_prev_task) (struct rq *rq, struct task_struct *p, u64 now); | 867 | void (*put_prev_task) (struct rq *rq, struct task_struct *p, u64 now); |
868 | 868 | ||
869 | int (*load_balance) (struct rq *this_rq, int this_cpu, | 869 | unsigned long (*load_balance) (struct rq *this_rq, int this_cpu, |
870 | struct rq *busiest, | 870 | struct rq *busiest, |
871 | unsigned long max_nr_move, unsigned long max_load_move, | 871 | unsigned long max_nr_move, unsigned long max_load_move, |
872 | struct sched_domain *sd, enum cpu_idle_type idle, | 872 | struct sched_domain *sd, enum cpu_idle_type idle, |
873 | int *all_pinned, unsigned long *total_load_moved); | 873 | int *all_pinned); |
874 | 874 | ||
875 | void (*set_curr_task) (struct rq *rq); | 875 | void (*set_curr_task) (struct rq *rq); |
876 | void (*task_tick) (struct rq *rq, struct task_struct *p); | 876 | void (*task_tick) (struct rq *rq, struct task_struct *p); |
877 | void (*task_new) (struct rq *rq, struct task_struct *p, u64 now); | 877 | void (*task_new) (struct rq *rq, struct task_struct *p, u64 now); |
878 | }; | 878 | }; |
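The change in this header's hunk is the load_balance() prototype above: instead of filling in a *total_load_moved out-parameter, each scheduling class now simply returns the amount of weighted load it moved. A standalone toy model of why that simplifies the caller (illustrative names only, not the kernel's move_tasks() in kernel/sched.c):

#include <stdio.h>

/* Toy stand-in for struct rq - just enough state for the illustration. */
struct rq { unsigned long queued_load; };

/* Models one sched_class.load_balance() implementation: pull up to
 * max_load_move of weighted load from busiest and report how much moved. */
static unsigned long toy_load_balance(struct rq *this_rq, struct rq *busiest,
                                      unsigned long max_load_move)
{
        unsigned long moved = busiest->queued_load < max_load_move ?
                                busiest->queued_load : max_load_move;

        busiest->queued_load -= moved;
        this_rq->queued_load += moved;
        return moved;           /* the new interface: load moved is the result */
}

/* Models the simplified move_tasks(): accumulate what each class reports
 * and return 1 if any load was moved, 0 otherwise. */
static int toy_move_tasks(struct rq *this_rq, struct rq *busiest,
                          unsigned long max_load_move)
{
        unsigned long total_moved = 0;

        /* the kernel loops over sched_class->next; one class is enough here */
        total_moved += toy_load_balance(this_rq, busiest,
                                        max_load_move - total_moved);

        return total_moved > 0;
}

int main(void)
{
        struct rq this_rq = { 0 }, busiest = { 2048 };

        printf("moved=%d this_rq=%lu busiest=%lu\n",
               toy_move_tasks(&this_rq, &busiest, 1024),
               this_rq.queued_load, busiest.queued_load);
        return 0;
}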
879 | 879 | ||
880 | struct load_weight { | 880 | struct load_weight { |
881 | unsigned long weight, inv_weight; | 881 | unsigned long weight, inv_weight; |
882 | }; | 882 | }; |
883 | 883 | ||
884 | /* | 884 | /* |
885 | * CFS stats for a schedulable entity (task, task-group etc) | 885 | * CFS stats for a schedulable entity (task, task-group etc) |
886 | * | 886 | * |
887 | * Current field usage histogram: | 887 | * Current field usage histogram: |
888 | * | 888 | * |
889 | * 4 se->block_start | 889 | * 4 se->block_start |
890 | * 4 se->run_node | 890 | * 4 se->run_node |
891 | * 4 se->sleep_start | 891 | * 4 se->sleep_start |
892 | * 4 se->sleep_start_fair | 892 | * 4 se->sleep_start_fair |
893 | * 6 se->load.weight | 893 | * 6 se->load.weight |
894 | * 7 se->delta_fair | 894 | * 7 se->delta_fair |
895 | * 15 se->wait_runtime | 895 | * 15 se->wait_runtime |
896 | */ | 896 | */ |
897 | struct sched_entity { | 897 | struct sched_entity { |
898 | long wait_runtime; | 898 | long wait_runtime; |
899 | unsigned long delta_fair_run; | 899 | unsigned long delta_fair_run; |
900 | unsigned long delta_fair_sleep; | 900 | unsigned long delta_fair_sleep; |
901 | unsigned long delta_exec; | 901 | unsigned long delta_exec; |
902 | s64 fair_key; | 902 | s64 fair_key; |
903 | struct load_weight load; /* for load-balancing */ | 903 | struct load_weight load; /* for load-balancing */ |
904 | struct rb_node run_node; | 904 | struct rb_node run_node; |
905 | unsigned int on_rq; | 905 | unsigned int on_rq; |
906 | 906 | ||
907 | u64 exec_start; | 907 | u64 exec_start; |
908 | u64 sum_exec_runtime; | 908 | u64 sum_exec_runtime; |
909 | u64 wait_start_fair; | 909 | u64 wait_start_fair; |
910 | u64 sleep_start_fair; | 910 | u64 sleep_start_fair; |
911 | 911 | ||
912 | #ifdef CONFIG_SCHEDSTATS | 912 | #ifdef CONFIG_SCHEDSTATS |
913 | u64 wait_start; | 913 | u64 wait_start; |
914 | u64 wait_max; | 914 | u64 wait_max; |
915 | s64 sum_wait_runtime; | 915 | s64 sum_wait_runtime; |
916 | 916 | ||
917 | u64 sleep_start; | 917 | u64 sleep_start; |
918 | u64 sleep_max; | 918 | u64 sleep_max; |
919 | s64 sum_sleep_runtime; | 919 | s64 sum_sleep_runtime; |
920 | 920 | ||
921 | u64 block_start; | 921 | u64 block_start; |
922 | u64 block_max; | 922 | u64 block_max; |
923 | u64 exec_max; | 923 | u64 exec_max; |
924 | 924 | ||
925 | unsigned long wait_runtime_overruns; | 925 | unsigned long wait_runtime_overruns; |
926 | unsigned long wait_runtime_underruns; | 926 | unsigned long wait_runtime_underruns; |
927 | #endif | 927 | #endif |
928 | 928 | ||
929 | #ifdef CONFIG_FAIR_GROUP_SCHED | 929 | #ifdef CONFIG_FAIR_GROUP_SCHED |
930 | struct sched_entity *parent; | 930 | struct sched_entity *parent; |
931 | /* rq on which this entity is (to be) queued: */ | 931 | /* rq on which this entity is (to be) queued: */ |
932 | struct cfs_rq *cfs_rq; | 932 | struct cfs_rq *cfs_rq; |
933 | /* rq "owned" by this entity/group: */ | 933 | /* rq "owned" by this entity/group: */ |
934 | struct cfs_rq *my_q; | 934 | struct cfs_rq *my_q; |
935 | #endif | 935 | #endif |
936 | }; | 936 | }; |
937 | 937 | ||
938 | struct task_struct { | 938 | struct task_struct { |
939 | volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ | 939 | volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ |
940 | void *stack; | 940 | void *stack; |
941 | atomic_t usage; | 941 | atomic_t usage; |
942 | unsigned int flags; /* per process flags, defined below */ | 942 | unsigned int flags; /* per process flags, defined below */ |
943 | unsigned int ptrace; | 943 | unsigned int ptrace; |
944 | 944 | ||
945 | int lock_depth; /* BKL lock depth */ | 945 | int lock_depth; /* BKL lock depth */ |
946 | 946 | ||
947 | #ifdef CONFIG_SMP | 947 | #ifdef CONFIG_SMP |
948 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | 948 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW |
949 | int oncpu; | 949 | int oncpu; |
950 | #endif | 950 | #endif |
951 | #endif | 951 | #endif |
952 | 952 | ||
953 | int prio, static_prio, normal_prio; | 953 | int prio, static_prio, normal_prio; |
954 | struct list_head run_list; | 954 | struct list_head run_list; |
955 | struct sched_class *sched_class; | 955 | struct sched_class *sched_class; |
956 | struct sched_entity se; | 956 | struct sched_entity se; |
957 | 957 | ||
958 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 958 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
959 | /* list of struct preempt_notifier: */ | 959 | /* list of struct preempt_notifier: */ |
960 | struct hlist_head preempt_notifiers; | 960 | struct hlist_head preempt_notifiers; |
961 | #endif | 961 | #endif |
962 | 962 | ||
963 | unsigned short ioprio; | 963 | unsigned short ioprio; |
964 | #ifdef CONFIG_BLK_DEV_IO_TRACE | 964 | #ifdef CONFIG_BLK_DEV_IO_TRACE |
965 | unsigned int btrace_seq; | 965 | unsigned int btrace_seq; |
966 | #endif | 966 | #endif |
967 | 967 | ||
968 | unsigned int policy; | 968 | unsigned int policy; |
969 | cpumask_t cpus_allowed; | 969 | cpumask_t cpus_allowed; |
970 | unsigned int time_slice; | 970 | unsigned int time_slice; |
971 | 971 | ||
972 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 972 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
973 | struct sched_info sched_info; | 973 | struct sched_info sched_info; |
974 | #endif | 974 | #endif |
975 | 975 | ||
976 | struct list_head tasks; | 976 | struct list_head tasks; |
977 | /* | 977 | /* |
978 | * ptrace_list/ptrace_children forms the list of my children | 978 | * ptrace_list/ptrace_children forms the list of my children |
979 | * that were stolen by a ptracer. | 979 | * that were stolen by a ptracer. |
980 | */ | 980 | */ |
981 | struct list_head ptrace_children; | 981 | struct list_head ptrace_children; |
982 | struct list_head ptrace_list; | 982 | struct list_head ptrace_list; |
983 | 983 | ||
984 | struct mm_struct *mm, *active_mm; | 984 | struct mm_struct *mm, *active_mm; |
985 | 985 | ||
986 | /* task state */ | 986 | /* task state */ |
987 | struct linux_binfmt *binfmt; | 987 | struct linux_binfmt *binfmt; |
988 | int exit_state; | 988 | int exit_state; |
989 | int exit_code, exit_signal; | 989 | int exit_code, exit_signal; |
990 | int pdeath_signal; /* The signal sent when the parent dies */ | 990 | int pdeath_signal; /* The signal sent when the parent dies */ |
991 | /* ??? */ | 991 | /* ??? */ |
992 | unsigned int personality; | 992 | unsigned int personality; |
993 | unsigned did_exec:1; | 993 | unsigned did_exec:1; |
994 | pid_t pid; | 994 | pid_t pid; |
995 | pid_t tgid; | 995 | pid_t tgid; |
996 | 996 | ||
997 | #ifdef CONFIG_CC_STACKPROTECTOR | 997 | #ifdef CONFIG_CC_STACKPROTECTOR |
998 | /* Canary value for the -fstack-protector gcc feature */ | 998 | /* Canary value for the -fstack-protector gcc feature */ |
999 | unsigned long stack_canary; | 999 | unsigned long stack_canary; |
1000 | #endif | 1000 | #endif |
1001 | /* | 1001 | /* |
1002 | * pointers to (original) parent process, youngest child, younger sibling, | 1002 | * pointers to (original) parent process, youngest child, younger sibling, |
1003 | * older sibling, respectively. (p->father can be replaced with | 1003 | * older sibling, respectively. (p->father can be replaced with |
1004 | * p->parent->pid) | 1004 | * p->parent->pid) |
1005 | */ | 1005 | */ |
1006 | struct task_struct *real_parent; /* real parent process (when being debugged) */ | 1006 | struct task_struct *real_parent; /* real parent process (when being debugged) */ |
1007 | struct task_struct *parent; /* parent process */ | 1007 | struct task_struct *parent; /* parent process */ |
1008 | /* | 1008 | /* |
1009 | * children/sibling forms the list of my children plus the | 1009 | * children/sibling forms the list of my children plus the |
1010 | * tasks I'm ptracing. | 1010 | * tasks I'm ptracing. |
1011 | */ | 1011 | */ |
1012 | struct list_head children; /* list of my children */ | 1012 | struct list_head children; /* list of my children */ |
1013 | struct list_head sibling; /* linkage in my parent's children list */ | 1013 | struct list_head sibling; /* linkage in my parent's children list */ |
1014 | struct task_struct *group_leader; /* threadgroup leader */ | 1014 | struct task_struct *group_leader; /* threadgroup leader */ |
1015 | 1015 | ||
1016 | /* PID/PID hash table linkage. */ | 1016 | /* PID/PID hash table linkage. */ |
1017 | struct pid_link pids[PIDTYPE_MAX]; | 1017 | struct pid_link pids[PIDTYPE_MAX]; |
1018 | struct list_head thread_group; | 1018 | struct list_head thread_group; |
1019 | 1019 | ||
1020 | struct completion *vfork_done; /* for vfork() */ | 1020 | struct completion *vfork_done; /* for vfork() */ |
1021 | int __user *set_child_tid; /* CLONE_CHILD_SETTID */ | 1021 | int __user *set_child_tid; /* CLONE_CHILD_SETTID */ |
1022 | int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ | 1022 | int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */ |
1023 | 1023 | ||
1024 | unsigned int rt_priority; | 1024 | unsigned int rt_priority; |
1025 | cputime_t utime, stime; | 1025 | cputime_t utime, stime; |
1026 | unsigned long nvcsw, nivcsw; /* context switch counts */ | 1026 | unsigned long nvcsw, nivcsw; /* context switch counts */ |
1027 | struct timespec start_time; /* monotonic time */ | 1027 | struct timespec start_time; /* monotonic time */ |
1028 | struct timespec real_start_time; /* boot based time */ | 1028 | struct timespec real_start_time; /* boot based time */ |
1029 | /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ | 1029 | /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ |
1030 | unsigned long min_flt, maj_flt; | 1030 | unsigned long min_flt, maj_flt; |
1031 | 1031 | ||
1032 | cputime_t it_prof_expires, it_virt_expires; | 1032 | cputime_t it_prof_expires, it_virt_expires; |
1033 | unsigned long long it_sched_expires; | 1033 | unsigned long long it_sched_expires; |
1034 | struct list_head cpu_timers[3]; | 1034 | struct list_head cpu_timers[3]; |
1035 | 1035 | ||
1036 | /* process credentials */ | 1036 | /* process credentials */ |
1037 | uid_t uid,euid,suid,fsuid; | 1037 | uid_t uid,euid,suid,fsuid; |
1038 | gid_t gid,egid,sgid,fsgid; | 1038 | gid_t gid,egid,sgid,fsgid; |
1039 | struct group_info *group_info; | 1039 | struct group_info *group_info; |
1040 | kernel_cap_t cap_effective, cap_inheritable, cap_permitted; | 1040 | kernel_cap_t cap_effective, cap_inheritable, cap_permitted; |
1041 | unsigned keep_capabilities:1; | 1041 | unsigned keep_capabilities:1; |
1042 | struct user_struct *user; | 1042 | struct user_struct *user; |
1043 | #ifdef CONFIG_KEYS | 1043 | #ifdef CONFIG_KEYS |
1044 | struct key *request_key_auth; /* assumed request_key authority */ | 1044 | struct key *request_key_auth; /* assumed request_key authority */ |
1045 | struct key *thread_keyring; /* keyring private to this thread */ | 1045 | struct key *thread_keyring; /* keyring private to this thread */ |
1046 | unsigned char jit_keyring; /* default keyring to attach requested keys to */ | 1046 | unsigned char jit_keyring; /* default keyring to attach requested keys to */ |
1047 | #endif | 1047 | #endif |
1048 | /* | 1048 | /* |
1049 | * fpu_counter contains the number of consecutive context switches | 1049 | * fpu_counter contains the number of consecutive context switches |
1050 | * during which the FPU is used. If this is over a threshold, the lazy fpu | 1050 | * during which the FPU is used. If this is over a threshold, the lazy fpu |
1051 | * saving becomes unlazy to save the trap. This is an unsigned char | 1051 | * saving becomes unlazy to save the trap. This is an unsigned char |
1052 | * so that after 256 times the counter wraps and the behavior turns | 1052 | * so that after 256 times the counter wraps and the behavior turns |
1053 | * lazy again; this is to deal with bursty apps that only use the FPU for | 1053 | * lazy again; this is to deal with bursty apps that only use the FPU for |
1054 | * a short time. | 1054 | * a short time. |
1055 | */ | 1055 | */ |
1056 | unsigned char fpu_counter; | 1056 | unsigned char fpu_counter; |
1057 | int oomkilladj; /* OOM kill score adjustment (bit shift). */ | 1057 | int oomkilladj; /* OOM kill score adjustment (bit shift). */ |
1058 | char comm[TASK_COMM_LEN]; /* executable name excluding path | 1058 | char comm[TASK_COMM_LEN]; /* executable name excluding path |
1059 | - access with [gs]et_task_comm (which lock | 1059 | - access with [gs]et_task_comm (which lock |
1060 | it with task_lock()) | 1060 | it with task_lock()) |
1061 | - initialized normally by flush_old_exec */ | 1061 | - initialized normally by flush_old_exec */ |
1062 | /* file system info */ | 1062 | /* file system info */ |
1063 | int link_count, total_link_count; | 1063 | int link_count, total_link_count; |
1064 | #ifdef CONFIG_SYSVIPC | 1064 | #ifdef CONFIG_SYSVIPC |
1065 | /* ipc stuff */ | 1065 | /* ipc stuff */ |
1066 | struct sysv_sem sysvsem; | 1066 | struct sysv_sem sysvsem; |
1067 | #endif | 1067 | #endif |
1068 | /* CPU-specific state of this task */ | 1068 | /* CPU-specific state of this task */ |
1069 | struct thread_struct thread; | 1069 | struct thread_struct thread; |
1070 | /* filesystem information */ | 1070 | /* filesystem information */ |
1071 | struct fs_struct *fs; | 1071 | struct fs_struct *fs; |
1072 | /* open file information */ | 1072 | /* open file information */ |
1073 | struct files_struct *files; | 1073 | struct files_struct *files; |
1074 | /* namespaces */ | 1074 | /* namespaces */ |
1075 | struct nsproxy *nsproxy; | 1075 | struct nsproxy *nsproxy; |
1076 | /* signal handlers */ | 1076 | /* signal handlers */ |
1077 | struct signal_struct *signal; | 1077 | struct signal_struct *signal; |
1078 | struct sighand_struct *sighand; | 1078 | struct sighand_struct *sighand; |
1079 | 1079 | ||
1080 | sigset_t blocked, real_blocked; | 1080 | sigset_t blocked, real_blocked; |
1081 | sigset_t saved_sigmask; /* To be restored with TIF_RESTORE_SIGMASK */ | 1081 | sigset_t saved_sigmask; /* To be restored with TIF_RESTORE_SIGMASK */ |
1082 | struct sigpending pending; | 1082 | struct sigpending pending; |
1083 | 1083 | ||
1084 | unsigned long sas_ss_sp; | 1084 | unsigned long sas_ss_sp; |
1085 | size_t sas_ss_size; | 1085 | size_t sas_ss_size; |
1086 | int (*notifier)(void *priv); | 1086 | int (*notifier)(void *priv); |
1087 | void *notifier_data; | 1087 | void *notifier_data; |
1088 | sigset_t *notifier_mask; | 1088 | sigset_t *notifier_mask; |
1089 | 1089 | ||
1090 | void *security; | 1090 | void *security; |
1091 | struct audit_context *audit_context; | 1091 | struct audit_context *audit_context; |
1092 | seccomp_t seccomp; | 1092 | seccomp_t seccomp; |
1093 | 1093 | ||
1094 | /* Thread group tracking */ | 1094 | /* Thread group tracking */ |
1095 | u32 parent_exec_id; | 1095 | u32 parent_exec_id; |
1096 | u32 self_exec_id; | 1096 | u32 self_exec_id; |
1097 | /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */ | 1097 | /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */ |
1098 | spinlock_t alloc_lock; | 1098 | spinlock_t alloc_lock; |
1099 | 1099 | ||
1100 | /* Protection of the PI data structures: */ | 1100 | /* Protection of the PI data structures: */ |
1101 | spinlock_t pi_lock; | 1101 | spinlock_t pi_lock; |
1102 | 1102 | ||
1103 | #ifdef CONFIG_RT_MUTEXES | 1103 | #ifdef CONFIG_RT_MUTEXES |
1104 | /* PI waiters blocked on a rt_mutex held by this task */ | 1104 | /* PI waiters blocked on a rt_mutex held by this task */ |
1105 | struct plist_head pi_waiters; | 1105 | struct plist_head pi_waiters; |
1106 | /* Deadlock detection and priority inheritance handling */ | 1106 | /* Deadlock detection and priority inheritance handling */ |
1107 | struct rt_mutex_waiter *pi_blocked_on; | 1107 | struct rt_mutex_waiter *pi_blocked_on; |
1108 | #endif | 1108 | #endif |
1109 | 1109 | ||
1110 | #ifdef CONFIG_DEBUG_MUTEXES | 1110 | #ifdef CONFIG_DEBUG_MUTEXES |
1111 | /* mutex deadlock detection */ | 1111 | /* mutex deadlock detection */ |
1112 | struct mutex_waiter *blocked_on; | 1112 | struct mutex_waiter *blocked_on; |
1113 | #endif | 1113 | #endif |
1114 | #ifdef CONFIG_TRACE_IRQFLAGS | 1114 | #ifdef CONFIG_TRACE_IRQFLAGS |
1115 | unsigned int irq_events; | 1115 | unsigned int irq_events; |
1116 | int hardirqs_enabled; | 1116 | int hardirqs_enabled; |
1117 | unsigned long hardirq_enable_ip; | 1117 | unsigned long hardirq_enable_ip; |
1118 | unsigned int hardirq_enable_event; | 1118 | unsigned int hardirq_enable_event; |
1119 | unsigned long hardirq_disable_ip; | 1119 | unsigned long hardirq_disable_ip; |
1120 | unsigned int hardirq_disable_event; | 1120 | unsigned int hardirq_disable_event; |
1121 | int softirqs_enabled; | 1121 | int softirqs_enabled; |
1122 | unsigned long softirq_disable_ip; | 1122 | unsigned long softirq_disable_ip; |
1123 | unsigned int softirq_disable_event; | 1123 | unsigned int softirq_disable_event; |
1124 | unsigned long softirq_enable_ip; | 1124 | unsigned long softirq_enable_ip; |
1125 | unsigned int softirq_enable_event; | 1125 | unsigned int softirq_enable_event; |
1126 | int hardirq_context; | 1126 | int hardirq_context; |
1127 | int softirq_context; | 1127 | int softirq_context; |
1128 | #endif | 1128 | #endif |
1129 | #ifdef CONFIG_LOCKDEP | 1129 | #ifdef CONFIG_LOCKDEP |
1130 | # define MAX_LOCK_DEPTH 30UL | 1130 | # define MAX_LOCK_DEPTH 30UL |
1131 | u64 curr_chain_key; | 1131 | u64 curr_chain_key; |
1132 | int lockdep_depth; | 1132 | int lockdep_depth; |
1133 | struct held_lock held_locks[MAX_LOCK_DEPTH]; | 1133 | struct held_lock held_locks[MAX_LOCK_DEPTH]; |
1134 | unsigned int lockdep_recursion; | 1134 | unsigned int lockdep_recursion; |
1135 | #endif | 1135 | #endif |
1136 | 1136 | ||
1137 | /* journalling filesystem info */ | 1137 | /* journalling filesystem info */ |
1138 | void *journal_info; | 1138 | void *journal_info; |
1139 | 1139 | ||
1140 | /* stacked block device info */ | 1140 | /* stacked block device info */ |
1141 | struct bio *bio_list, **bio_tail; | 1141 | struct bio *bio_list, **bio_tail; |
1142 | 1142 | ||
1143 | /* VM state */ | 1143 | /* VM state */ |
1144 | struct reclaim_state *reclaim_state; | 1144 | struct reclaim_state *reclaim_state; |
1145 | 1145 | ||
1146 | struct backing_dev_info *backing_dev_info; | 1146 | struct backing_dev_info *backing_dev_info; |
1147 | 1147 | ||
1148 | struct io_context *io_context; | 1148 | struct io_context *io_context; |
1149 | 1149 | ||
1150 | unsigned long ptrace_message; | 1150 | unsigned long ptrace_message; |
1151 | siginfo_t *last_siginfo; /* For ptrace use. */ | 1151 | siginfo_t *last_siginfo; /* For ptrace use. */ |
1152 | /* | 1152 | /* |
1153 | * current io wait handle: wait queue entry to use for io waits | 1153 | * current io wait handle: wait queue entry to use for io waits |
1154 | * If this thread is processing aio, this points at the waitqueue | 1154 | * If this thread is processing aio, this points at the waitqueue |
1155 | * inside the currently handled kiocb. It may be NULL (i.e. default | 1155 | * inside the currently handled kiocb. It may be NULL (i.e. default |
1156 | * to a stack-based synchronous wait) if it's doing sync IO. | 1156 | * to a stack-based synchronous wait) if it's doing sync IO. |
1157 | */ | 1157 | */ |
1158 | wait_queue_t *io_wait; | 1158 | wait_queue_t *io_wait; |
1159 | #ifdef CONFIG_TASK_XACCT | 1159 | #ifdef CONFIG_TASK_XACCT |
1160 | /* i/o counters (bytes read/written, #syscalls) */ | 1160 | /* i/o counters (bytes read/written, #syscalls) */ |
1161 | u64 rchar, wchar, syscr, syscw; | 1161 | u64 rchar, wchar, syscr, syscw; |
1162 | #endif | 1162 | #endif |
1163 | struct task_io_accounting ioac; | 1163 | struct task_io_accounting ioac; |
1164 | #if defined(CONFIG_TASK_XACCT) | 1164 | #if defined(CONFIG_TASK_XACCT) |
1165 | u64 acct_rss_mem1; /* accumulated rss usage */ | 1165 | u64 acct_rss_mem1; /* accumulated rss usage */ |
1166 | u64 acct_vm_mem1; /* accumulated virtual memory usage */ | 1166 | u64 acct_vm_mem1; /* accumulated virtual memory usage */ |
1167 | cputime_t acct_stimexpd;/* stime since last update */ | 1167 | cputime_t acct_stimexpd;/* stime since last update */ |
1168 | #endif | 1168 | #endif |
1169 | #ifdef CONFIG_NUMA | 1169 | #ifdef CONFIG_NUMA |
1170 | struct mempolicy *mempolicy; | 1170 | struct mempolicy *mempolicy; |
1171 | short il_next; | 1171 | short il_next; |
1172 | #endif | 1172 | #endif |
1173 | #ifdef CONFIG_CPUSETS | 1173 | #ifdef CONFIG_CPUSETS |
1174 | struct cpuset *cpuset; | 1174 | struct cpuset *cpuset; |
1175 | nodemask_t mems_allowed; | 1175 | nodemask_t mems_allowed; |
1176 | int cpuset_mems_generation; | 1176 | int cpuset_mems_generation; |
1177 | int cpuset_mem_spread_rotor; | 1177 | int cpuset_mem_spread_rotor; |
1178 | #endif | 1178 | #endif |
1179 | struct robust_list_head __user *robust_list; | 1179 | struct robust_list_head __user *robust_list; |
1180 | #ifdef CONFIG_COMPAT | 1180 | #ifdef CONFIG_COMPAT |
1181 | struct compat_robust_list_head __user *compat_robust_list; | 1181 | struct compat_robust_list_head __user *compat_robust_list; |
1182 | #endif | 1182 | #endif |
1183 | struct list_head pi_state_list; | 1183 | struct list_head pi_state_list; |
1184 | struct futex_pi_state *pi_state_cache; | 1184 | struct futex_pi_state *pi_state_cache; |
1185 | 1185 | ||
1186 | atomic_t fs_excl; /* holding fs exclusive resources */ | 1186 | atomic_t fs_excl; /* holding fs exclusive resources */ |
1187 | struct rcu_head rcu; | 1187 | struct rcu_head rcu; |
1188 | 1188 | ||
1189 | /* | 1189 | /* |
1190 | * cache last used pipe for splice | 1190 | * cache last used pipe for splice |
1191 | */ | 1191 | */ |
1192 | struct pipe_inode_info *splice_pipe; | 1192 | struct pipe_inode_info *splice_pipe; |
1193 | #ifdef CONFIG_TASK_DELAY_ACCT | 1193 | #ifdef CONFIG_TASK_DELAY_ACCT |
1194 | struct task_delay_info *delays; | 1194 | struct task_delay_info *delays; |
1195 | #endif | 1195 | #endif |
1196 | #ifdef CONFIG_FAULT_INJECTION | 1196 | #ifdef CONFIG_FAULT_INJECTION |
1197 | int make_it_fail; | 1197 | int make_it_fail; |
1198 | #endif | 1198 | #endif |
1199 | }; | 1199 | }; |
1200 | 1200 | ||
1201 | /* | 1201 | /* |
1202 | * Priority of a process goes from 0..MAX_PRIO-1, valid RT | 1202 | * Priority of a process goes from 0..MAX_PRIO-1, valid RT |
1203 | * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH | 1203 | * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH |
1204 | * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority | 1204 | * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority |
1205 | * values are inverted: lower p->prio value means higher priority. | 1205 | * values are inverted: lower p->prio value means higher priority. |
1206 | * | 1206 | * |
1207 | * The MAX_USER_RT_PRIO value allows the actual maximum | 1207 | * The MAX_USER_RT_PRIO value allows the actual maximum |
1208 | * RT priority to be separate from the value exported to | 1208 | * RT priority to be separate from the value exported to |
1209 | * user-space. This allows kernel threads to set their | 1209 | * user-space. This allows kernel threads to set their |
1210 | * priority to a value higher than any user task. Note: | 1210 | * priority to a value higher than any user task. Note: |
1211 | * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. | 1211 | * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. |
1212 | */ | 1212 | */ |
1213 | 1213 | ||
1214 | #define MAX_USER_RT_PRIO 100 | 1214 | #define MAX_USER_RT_PRIO 100 |
1215 | #define MAX_RT_PRIO MAX_USER_RT_PRIO | 1215 | #define MAX_RT_PRIO MAX_USER_RT_PRIO |
1216 | 1216 | ||
1217 | #define MAX_PRIO (MAX_RT_PRIO + 40) | 1217 | #define MAX_PRIO (MAX_RT_PRIO + 40) |
1218 | #define DEFAULT_PRIO (MAX_RT_PRIO + 20) | 1218 | #define DEFAULT_PRIO (MAX_RT_PRIO + 20) |
1219 | 1219 | ||
1220 | static inline int rt_prio(int prio) | 1220 | static inline int rt_prio(int prio) |
1221 | { | 1221 | { |
1222 | if (unlikely(prio < MAX_RT_PRIO)) | 1222 | if (unlikely(prio < MAX_RT_PRIO)) |
1223 | return 1; | 1223 | return 1; |
1224 | return 0; | 1224 | return 0; |
1225 | } | 1225 | } |
1226 | 1226 | ||
1227 | static inline int rt_task(struct task_struct *p) | 1227 | static inline int rt_task(struct task_struct *p) |
1228 | { | 1228 | { |
1229 | return rt_prio(p->prio); | 1229 | return rt_prio(p->prio); |
1230 | } | 1230 | } |
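To make the priority numbering described in the comment above concrete, a small standalone check (the constants and rt_prio() are copied from the definitions above; the unlikely() hint is dropped because it is a kernel-only annotation):

#include <assert.h>

#define MAX_USER_RT_PRIO        100
#define MAX_RT_PRIO             MAX_USER_RT_PRIO
#define MAX_PRIO                (MAX_RT_PRIO + 40)
#define DEFAULT_PRIO            (MAX_RT_PRIO + 20)

static int rt_prio(int prio)
{
        return prio < MAX_RT_PRIO;
}

int main(void)
{
        assert(MAX_PRIO == 140);                /* nice levels occupy 100..139 */
        assert(DEFAULT_PRIO == 120);            /* a nice-0 SCHED_NORMAL task */
        assert(!rt_prio(DEFAULT_PRIO));         /* ...is not real-time */
        assert(rt_prio(42));                    /* any prio below 100 is real-time */
        return 0;
}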
1231 | 1231 | ||
1232 | static inline pid_t process_group(struct task_struct *tsk) | 1232 | static inline pid_t process_group(struct task_struct *tsk) |
1233 | { | 1233 | { |
1234 | return tsk->signal->pgrp; | 1234 | return tsk->signal->pgrp; |
1235 | } | 1235 | } |
1236 | 1236 | ||
1237 | static inline pid_t signal_session(struct signal_struct *sig) | 1237 | static inline pid_t signal_session(struct signal_struct *sig) |
1238 | { | 1238 | { |
1239 | return sig->__session; | 1239 | return sig->__session; |
1240 | } | 1240 | } |
1241 | 1241 | ||
1242 | static inline pid_t process_session(struct task_struct *tsk) | 1242 | static inline pid_t process_session(struct task_struct *tsk) |
1243 | { | 1243 | { |
1244 | return signal_session(tsk->signal); | 1244 | return signal_session(tsk->signal); |
1245 | } | 1245 | } |
1246 | 1246 | ||
1247 | static inline void set_signal_session(struct signal_struct *sig, pid_t session) | 1247 | static inline void set_signal_session(struct signal_struct *sig, pid_t session) |
1248 | { | 1248 | { |
1249 | sig->__session = session; | 1249 | sig->__session = session; |
1250 | } | 1250 | } |
1251 | 1251 | ||
1252 | static inline struct pid *task_pid(struct task_struct *task) | 1252 | static inline struct pid *task_pid(struct task_struct *task) |
1253 | { | 1253 | { |
1254 | return task->pids[PIDTYPE_PID].pid; | 1254 | return task->pids[PIDTYPE_PID].pid; |
1255 | } | 1255 | } |
1256 | 1256 | ||
1257 | static inline struct pid *task_tgid(struct task_struct *task) | 1257 | static inline struct pid *task_tgid(struct task_struct *task) |
1258 | { | 1258 | { |
1259 | return task->group_leader->pids[PIDTYPE_PID].pid; | 1259 | return task->group_leader->pids[PIDTYPE_PID].pid; |
1260 | } | 1260 | } |
1261 | 1261 | ||
1262 | static inline struct pid *task_pgrp(struct task_struct *task) | 1262 | static inline struct pid *task_pgrp(struct task_struct *task) |
1263 | { | 1263 | { |
1264 | return task->group_leader->pids[PIDTYPE_PGID].pid; | 1264 | return task->group_leader->pids[PIDTYPE_PGID].pid; |
1265 | } | 1265 | } |
1266 | 1266 | ||
1267 | static inline struct pid *task_session(struct task_struct *task) | 1267 | static inline struct pid *task_session(struct task_struct *task) |
1268 | { | 1268 | { |
1269 | return task->group_leader->pids[PIDTYPE_SID].pid; | 1269 | return task->group_leader->pids[PIDTYPE_SID].pid; |
1270 | } | 1270 | } |
1271 | 1271 | ||
1272 | /** | 1272 | /** |
1273 | * pid_alive - check that a task structure is not stale | 1273 | * pid_alive - check that a task structure is not stale |
1274 | * @p: Task structure to be checked. | 1274 | * @p: Task structure to be checked. |
1275 | * | 1275 | * |
1276 | * Test if a process is not yet dead (at most zombie state) | 1276 | * Test if a process is not yet dead (at most zombie state) |
1277 | * If pid_alive fails, then pointers within the task structure | 1277 | * If pid_alive fails, then pointers within the task structure |
1278 | * can be stale and must not be dereferenced. | 1278 | * can be stale and must not be dereferenced. |
1279 | */ | 1279 | */ |
1280 | static inline int pid_alive(struct task_struct *p) | 1280 | static inline int pid_alive(struct task_struct *p) |
1281 | { | 1281 | { |
1282 | return p->pids[PIDTYPE_PID].pid != NULL; | 1282 | return p->pids[PIDTYPE_PID].pid != NULL; |
1283 | } | 1283 | } |
1284 | 1284 | ||
1285 | /** | 1285 | /** |
1286 | * is_init - check if a task structure is init | 1286 | * is_init - check if a task structure is init |
1287 | * @tsk: Task structure to be checked. | 1287 | * @tsk: Task structure to be checked. |
1288 | * | 1288 | * |
1289 | * Check if a task structure is the first user space task the kernel created. | 1289 | * Check if a task structure is the first user space task the kernel created. |
1290 | */ | 1290 | */ |
1291 | static inline int is_init(struct task_struct *tsk) | 1291 | static inline int is_init(struct task_struct *tsk) |
1292 | { | 1292 | { |
1293 | return tsk->pid == 1; | 1293 | return tsk->pid == 1; |
1294 | } | 1294 | } |
1295 | 1295 | ||
1296 | extern struct pid *cad_pid; | 1296 | extern struct pid *cad_pid; |
1297 | 1297 | ||
1298 | extern void free_task(struct task_struct *tsk); | 1298 | extern void free_task(struct task_struct *tsk); |
1299 | #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) | 1299 | #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) |
1300 | 1300 | ||
1301 | extern void __put_task_struct(struct task_struct *t); | 1301 | extern void __put_task_struct(struct task_struct *t); |
1302 | 1302 | ||
1303 | static inline void put_task_struct(struct task_struct *t) | 1303 | static inline void put_task_struct(struct task_struct *t) |
1304 | { | 1304 | { |
1305 | if (atomic_dec_and_test(&t->usage)) | 1305 | if (atomic_dec_and_test(&t->usage)) |
1306 | __put_task_struct(t); | 1306 | __put_task_struct(t); |
1307 | } | 1307 | } |
1308 | 1308 | ||
1309 | /* | 1309 | /* |
1310 | * Per process flags | 1310 | * Per process flags |
1311 | */ | 1311 | */ |
1312 | #define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */ | 1312 | #define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */ |
1313 | /* Not implemented yet, only for 486 */ | 1313 | /* Not implemented yet, only for 486 */ |
1314 | #define PF_STARTING 0x00000002 /* being created */ | 1314 | #define PF_STARTING 0x00000002 /* being created */ |
1315 | #define PF_EXITING 0x00000004 /* getting shut down */ | 1315 | #define PF_EXITING 0x00000004 /* getting shut down */ |
1316 | #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ | 1316 | #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ |
1317 | #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ | 1317 | #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ |
1318 | #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ | 1318 | #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ |
1319 | #define PF_DUMPCORE 0x00000200 /* dumped core */ | 1319 | #define PF_DUMPCORE 0x00000200 /* dumped core */ |
1320 | #define PF_SIGNALED 0x00000400 /* killed by a signal */ | 1320 | #define PF_SIGNALED 0x00000400 /* killed by a signal */ |
1321 | #define PF_MEMALLOC 0x00000800 /* Allocating memory */ | 1321 | #define PF_MEMALLOC 0x00000800 /* Allocating memory */ |
1322 | #define PF_FLUSHER 0x00001000 /* responsible for disk writeback */ | 1322 | #define PF_FLUSHER 0x00001000 /* responsible for disk writeback */ |
1323 | #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ | 1323 | #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ |
1324 | #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ | 1324 | #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ |
1325 | #define PF_FROZEN 0x00010000 /* frozen for system suspend */ | 1325 | #define PF_FROZEN 0x00010000 /* frozen for system suspend */ |
1326 | #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ | 1326 | #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ |
1327 | #define PF_KSWAPD 0x00040000 /* I am kswapd */ | 1327 | #define PF_KSWAPD 0x00040000 /* I am kswapd */ |
1328 | #define PF_SWAPOFF 0x00080000 /* I am in swapoff */ | 1328 | #define PF_SWAPOFF 0x00080000 /* I am in swapoff */ |
1329 | #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ | 1329 | #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ |
1330 | #define PF_BORROWED_MM 0x00200000 /* I am a kthread doing use_mm */ | 1330 | #define PF_BORROWED_MM 0x00200000 /* I am a kthread doing use_mm */ |
1331 | #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ | 1331 | #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ |
1332 | #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ | 1332 | #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ |
1333 | #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ | 1333 | #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ |
1334 | #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ | 1334 | #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ |
1335 | #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ | 1335 | #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ |
1336 | #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ | 1336 | #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ |
1337 | #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ | 1337 | #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezeable */ |
1338 | 1338 | ||
1339 | /* | 1339 | /* |
1340 | * Only the _current_ task can read/write to tsk->flags, but other | 1340 | * Only the _current_ task can read/write to tsk->flags, but other |
1341 | * tasks can access tsk->flags in readonly mode for example | 1341 | * tasks can access tsk->flags in readonly mode for example |
1342 | * with tsk_used_math (like during threaded core dumping). | 1342 | * with tsk_used_math (like during threaded core dumping). |
1343 | * There is however an exception to this rule during ptrace | 1343 | * There is however an exception to this rule during ptrace |
1344 | * or during fork: the ptracer task is allowed to write to the | 1344 | * or during fork: the ptracer task is allowed to write to the |
1345 | * child->flags of its traced child (same goes for fork, the parent | 1345 | * child->flags of its traced child (same goes for fork, the parent |
1346 | * can write to the child->flags), because we're guaranteed the | 1346 | * can write to the child->flags), because we're guaranteed the |
1347 | * child is not running and in turn not changing child->flags | 1347 | * child is not running and in turn not changing child->flags |
1348 | * at the same time the parent does it. | 1348 | * at the same time the parent does it. |
1349 | */ | 1349 | */ |
1350 | #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) | 1350 | #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) |
1351 | #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) | 1351 | #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) |
1352 | #define clear_used_math() clear_stopped_child_used_math(current) | 1352 | #define clear_used_math() clear_stopped_child_used_math(current) |
1353 | #define set_used_math() set_stopped_child_used_math(current) | 1353 | #define set_used_math() set_stopped_child_used_math(current) |
1354 | #define conditional_stopped_child_used_math(condition, child) \ | 1354 | #define conditional_stopped_child_used_math(condition, child) \ |
1355 | do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) | 1355 | do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) |
1356 | #define conditional_used_math(condition) \ | 1356 | #define conditional_used_math(condition) \ |
1357 | conditional_stopped_child_used_math(condition, current) | 1357 | conditional_stopped_child_used_math(condition, current) |
1358 | #define copy_to_stopped_child_used_math(child) \ | 1358 | #define copy_to_stopped_child_used_math(child) \ |
1359 | do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) | 1359 | do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) |
1360 | /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ | 1360 | /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ |
1361 | #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) | 1361 | #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) |
1362 | #define used_math() tsk_used_math(current) | 1362 | #define used_math() tsk_used_math(current) |
1363 | 1363 | ||
1364 | #ifdef CONFIG_SMP | 1364 | #ifdef CONFIG_SMP |
1365 | extern int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask); | 1365 | extern int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask); |
1366 | #else | 1366 | #else |
1367 | static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | 1367 | static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) |
1368 | { | 1368 | { |
1369 | if (!cpu_isset(0, new_mask)) | 1369 | if (!cpu_isset(0, new_mask)) |
1370 | return -EINVAL; | 1370 | return -EINVAL; |
1371 | return 0; | 1371 | return 0; |
1372 | } | 1372 | } |
1373 | #endif | 1373 | #endif |
1374 | 1374 | ||
1375 | extern unsigned long long sched_clock(void); | 1375 | extern unsigned long long sched_clock(void); |
1376 | 1376 | ||
1377 | /* | 1377 | /* |
1378 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | 1378 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu |
1379 | * clock constructed from sched_clock(): | 1379 | * clock constructed from sched_clock(): |
1380 | */ | 1380 | */ |
1381 | extern unsigned long long cpu_clock(int cpu); | 1381 | extern unsigned long long cpu_clock(int cpu); |
1382 | 1382 | ||
1383 | extern unsigned long long | 1383 | extern unsigned long long |
1384 | task_sched_runtime(struct task_struct *task); | 1384 | task_sched_runtime(struct task_struct *task); |
1385 | 1385 | ||
1386 | /* sched_exec is called by processes performing an exec */ | 1386 | /* sched_exec is called by processes performing an exec */ |
1387 | #ifdef CONFIG_SMP | 1387 | #ifdef CONFIG_SMP |
1388 | extern void sched_exec(void); | 1388 | extern void sched_exec(void); |
1389 | #else | 1389 | #else |
1390 | #define sched_exec() {} | 1390 | #define sched_exec() {} |
1391 | #endif | 1391 | #endif |
1392 | 1392 | ||
1393 | extern void sched_clock_unstable_event(void); | 1393 | extern void sched_clock_unstable_event(void); |
1394 | 1394 | ||
1395 | #ifdef CONFIG_HOTPLUG_CPU | 1395 | #ifdef CONFIG_HOTPLUG_CPU |
1396 | extern void idle_task_exit(void); | 1396 | extern void idle_task_exit(void); |
1397 | #else | 1397 | #else |
1398 | static inline void idle_task_exit(void) {} | 1398 | static inline void idle_task_exit(void) {} |
1399 | #endif | 1399 | #endif |
1400 | 1400 | ||
1401 | extern void sched_idle_next(void); | 1401 | extern void sched_idle_next(void); |
1402 | 1402 | ||
1403 | extern unsigned int sysctl_sched_granularity; | 1403 | extern unsigned int sysctl_sched_granularity; |
1404 | extern unsigned int sysctl_sched_wakeup_granularity; | 1404 | extern unsigned int sysctl_sched_wakeup_granularity; |
1405 | extern unsigned int sysctl_sched_batch_wakeup_granularity; | 1405 | extern unsigned int sysctl_sched_batch_wakeup_granularity; |
1406 | extern unsigned int sysctl_sched_stat_granularity; | 1406 | extern unsigned int sysctl_sched_stat_granularity; |
1407 | extern unsigned int sysctl_sched_runtime_limit; | 1407 | extern unsigned int sysctl_sched_runtime_limit; |
1408 | extern unsigned int sysctl_sched_child_runs_first; | 1408 | extern unsigned int sysctl_sched_child_runs_first; |
1409 | extern unsigned int sysctl_sched_features; | 1409 | extern unsigned int sysctl_sched_features; |
1410 | 1410 | ||
1411 | #ifdef CONFIG_RT_MUTEXES | 1411 | #ifdef CONFIG_RT_MUTEXES |
1412 | extern int rt_mutex_getprio(struct task_struct *p); | 1412 | extern int rt_mutex_getprio(struct task_struct *p); |
1413 | extern void rt_mutex_setprio(struct task_struct *p, int prio); | 1413 | extern void rt_mutex_setprio(struct task_struct *p, int prio); |
1414 | extern void rt_mutex_adjust_pi(struct task_struct *p); | 1414 | extern void rt_mutex_adjust_pi(struct task_struct *p); |
1415 | #else | 1415 | #else |
1416 | static inline int rt_mutex_getprio(struct task_struct *p) | 1416 | static inline int rt_mutex_getprio(struct task_struct *p) |
1417 | { | 1417 | { |
1418 | return p->normal_prio; | 1418 | return p->normal_prio; |
1419 | } | 1419 | } |
1420 | # define rt_mutex_adjust_pi(p) do { } while (0) | 1420 | # define rt_mutex_adjust_pi(p) do { } while (0) |
1421 | #endif | 1421 | #endif |
1422 | 1422 | ||
1423 | extern void set_user_nice(struct task_struct *p, long nice); | 1423 | extern void set_user_nice(struct task_struct *p, long nice); |
1424 | extern int task_prio(const struct task_struct *p); | 1424 | extern int task_prio(const struct task_struct *p); |
1425 | extern int task_nice(const struct task_struct *p); | 1425 | extern int task_nice(const struct task_struct *p); |
1426 | extern int can_nice(const struct task_struct *p, const int nice); | 1426 | extern int can_nice(const struct task_struct *p, const int nice); |
1427 | extern int task_curr(const struct task_struct *p); | 1427 | extern int task_curr(const struct task_struct *p); |
1428 | extern int idle_cpu(int cpu); | 1428 | extern int idle_cpu(int cpu); |
1429 | extern int sched_setscheduler(struct task_struct *, int, struct sched_param *); | 1429 | extern int sched_setscheduler(struct task_struct *, int, struct sched_param *); |
1430 | extern struct task_struct *idle_task(int cpu); | 1430 | extern struct task_struct *idle_task(int cpu); |
1431 | extern struct task_struct *curr_task(int cpu); | 1431 | extern struct task_struct *curr_task(int cpu); |
1432 | extern void set_curr_task(int cpu, struct task_struct *p); | 1432 | extern void set_curr_task(int cpu, struct task_struct *p); |
1433 | 1433 | ||
1434 | void yield(void); | 1434 | void yield(void); |
1435 | 1435 | ||
1436 | /* | 1436 | /* |
1437 | * The default (Linux) execution domain. | 1437 | * The default (Linux) execution domain. |
1438 | */ | 1438 | */ |
1439 | extern struct exec_domain default_exec_domain; | 1439 | extern struct exec_domain default_exec_domain; |
1440 | 1440 | ||
1441 | union thread_union { | 1441 | union thread_union { |
1442 | struct thread_info thread_info; | 1442 | struct thread_info thread_info; |
1443 | unsigned long stack[THREAD_SIZE/sizeof(long)]; | 1443 | unsigned long stack[THREAD_SIZE/sizeof(long)]; |
1444 | }; | 1444 | }; |
1445 | 1445 | ||
1446 | #ifndef __HAVE_ARCH_KSTACK_END | 1446 | #ifndef __HAVE_ARCH_KSTACK_END |
1447 | static inline int kstack_end(void *addr) | 1447 | static inline int kstack_end(void *addr) |
1448 | { | 1448 | { |
1449 | /* Reliable end of stack detection: | 1449 | /* Reliable end of stack detection: |
1450 | * Some APM bios versions misalign the stack | 1450 | * Some APM bios versions misalign the stack |
1451 | */ | 1451 | */ |
1452 | return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); | 1452 | return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*))); |
1453 | } | 1453 | } |
1454 | #endif | 1454 | #endif |
1455 | 1455 | ||
1456 | extern union thread_union init_thread_union; | 1456 | extern union thread_union init_thread_union; |
1457 | extern struct task_struct init_task; | 1457 | extern struct task_struct init_task; |
1458 | 1458 | ||
1459 | extern struct mm_struct init_mm; | 1459 | extern struct mm_struct init_mm; |
1460 | 1460 | ||
1461 | #define find_task_by_pid(nr) find_task_by_pid_type(PIDTYPE_PID, nr) | 1461 | #define find_task_by_pid(nr) find_task_by_pid_type(PIDTYPE_PID, nr) |
1462 | extern struct task_struct *find_task_by_pid_type(int type, int pid); | 1462 | extern struct task_struct *find_task_by_pid_type(int type, int pid); |
1463 | extern void __set_special_pids(pid_t session, pid_t pgrp); | 1463 | extern void __set_special_pids(pid_t session, pid_t pgrp); |
1464 | 1464 | ||
1465 | /* per-UID process charging. */ | 1465 | /* per-UID process charging. */ |
1466 | extern struct user_struct * alloc_uid(struct user_namespace *, uid_t); | 1466 | extern struct user_struct * alloc_uid(struct user_namespace *, uid_t); |
1467 | static inline struct user_struct *get_uid(struct user_struct *u) | 1467 | static inline struct user_struct *get_uid(struct user_struct *u) |
1468 | { | 1468 | { |
1469 | atomic_inc(&u->__count); | 1469 | atomic_inc(&u->__count); |
1470 | return u; | 1470 | return u; |
1471 | } | 1471 | } |
1472 | extern void free_uid(struct user_struct *); | 1472 | extern void free_uid(struct user_struct *); |
1473 | extern void switch_uid(struct user_struct *); | 1473 | extern void switch_uid(struct user_struct *); |
1474 | 1474 | ||
1475 | #include <asm/current.h> | 1475 | #include <asm/current.h> |
1476 | 1476 | ||
1477 | extern void do_timer(unsigned long ticks); | 1477 | extern void do_timer(unsigned long ticks); |
1478 | 1478 | ||
1479 | extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state)); | 1479 | extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state)); |
1480 | extern int FASTCALL(wake_up_process(struct task_struct * tsk)); | 1480 | extern int FASTCALL(wake_up_process(struct task_struct * tsk)); |
1481 | extern void FASTCALL(wake_up_new_task(struct task_struct * tsk, | 1481 | extern void FASTCALL(wake_up_new_task(struct task_struct * tsk, |
1482 | unsigned long clone_flags)); | 1482 | unsigned long clone_flags)); |
1483 | #ifdef CONFIG_SMP | 1483 | #ifdef CONFIG_SMP |
1484 | extern void kick_process(struct task_struct *tsk); | 1484 | extern void kick_process(struct task_struct *tsk); |
1485 | #else | 1485 | #else |
1486 | static inline void kick_process(struct task_struct *tsk) { } | 1486 | static inline void kick_process(struct task_struct *tsk) { } |
1487 | #endif | 1487 | #endif |
1488 | extern void sched_fork(struct task_struct *p, int clone_flags); | 1488 | extern void sched_fork(struct task_struct *p, int clone_flags); |
1489 | extern void sched_dead(struct task_struct *p); | 1489 | extern void sched_dead(struct task_struct *p); |
1490 | 1490 | ||
1491 | extern int in_group_p(gid_t); | 1491 | extern int in_group_p(gid_t); |
1492 | extern int in_egroup_p(gid_t); | 1492 | extern int in_egroup_p(gid_t); |
1493 | 1493 | ||
1494 | extern void proc_caches_init(void); | 1494 | extern void proc_caches_init(void); |
1495 | extern void flush_signals(struct task_struct *); | 1495 | extern void flush_signals(struct task_struct *); |
1496 | extern void ignore_signals(struct task_struct *); | 1496 | extern void ignore_signals(struct task_struct *); |
1497 | extern void flush_signal_handlers(struct task_struct *, int force_default); | 1497 | extern void flush_signal_handlers(struct task_struct *, int force_default); |
1498 | extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); | 1498 | extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); |
1499 | 1499 | ||
1500 | static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) | 1500 | static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) |
1501 | { | 1501 | { |
1502 | unsigned long flags; | 1502 | unsigned long flags; |
1503 | int ret; | 1503 | int ret; |
1504 | 1504 | ||
1505 | spin_lock_irqsave(&tsk->sighand->siglock, flags); | 1505 | spin_lock_irqsave(&tsk->sighand->siglock, flags); |
1506 | ret = dequeue_signal(tsk, mask, info); | 1506 | ret = dequeue_signal(tsk, mask, info); |
1507 | spin_unlock_irqrestore(&tsk->sighand->siglock, flags); | 1507 | spin_unlock_irqrestore(&tsk->sighand->siglock, flags); |
1508 | 1508 | ||
1509 | return ret; | 1509 | return ret; |
1510 | } | 1510 | } |
1511 | 1511 | ||
1512 | extern void block_all_signals(int (*notifier)(void *priv), void *priv, | 1512 | extern void block_all_signals(int (*notifier)(void *priv), void *priv, |
1513 | sigset_t *mask); | 1513 | sigset_t *mask); |
1514 | extern void unblock_all_signals(void); | 1514 | extern void unblock_all_signals(void); |
1515 | extern void release_task(struct task_struct * p); | 1515 | extern void release_task(struct task_struct * p); |
1516 | extern int send_sig_info(int, struct siginfo *, struct task_struct *); | 1516 | extern int send_sig_info(int, struct siginfo *, struct task_struct *); |
1517 | extern int send_group_sig_info(int, struct siginfo *, struct task_struct *); | 1517 | extern int send_group_sig_info(int, struct siginfo *, struct task_struct *); |
1518 | extern int force_sigsegv(int, struct task_struct *); | 1518 | extern int force_sigsegv(int, struct task_struct *); |
1519 | extern int force_sig_info(int, struct siginfo *, struct task_struct *); | 1519 | extern int force_sig_info(int, struct siginfo *, struct task_struct *); |
1520 | extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); | 1520 | extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); |
1521 | extern int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); | 1521 | extern int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); |
1522 | extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid); | 1522 | extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid); |
1523 | extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_t, u32); | 1523 | extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_t, u32); |
1524 | extern int kill_pgrp(struct pid *pid, int sig, int priv); | 1524 | extern int kill_pgrp(struct pid *pid, int sig, int priv); |
1525 | extern int kill_pid(struct pid *pid, int sig, int priv); | 1525 | extern int kill_pid(struct pid *pid, int sig, int priv); |
1526 | extern int kill_proc_info(int, struct siginfo *, pid_t); | 1526 | extern int kill_proc_info(int, struct siginfo *, pid_t); |
1527 | extern void do_notify_parent(struct task_struct *, int); | 1527 | extern void do_notify_parent(struct task_struct *, int); |
1528 | extern void force_sig(int, struct task_struct *); | 1528 | extern void force_sig(int, struct task_struct *); |
1529 | extern void force_sig_specific(int, struct task_struct *); | 1529 | extern void force_sig_specific(int, struct task_struct *); |
1530 | extern int send_sig(int, struct task_struct *, int); | 1530 | extern int send_sig(int, struct task_struct *, int); |
1531 | extern void zap_other_threads(struct task_struct *p); | 1531 | extern void zap_other_threads(struct task_struct *p); |
1532 | extern int kill_proc(pid_t, int, int); | 1532 | extern int kill_proc(pid_t, int, int); |
1533 | extern struct sigqueue *sigqueue_alloc(void); | 1533 | extern struct sigqueue *sigqueue_alloc(void); |
1534 | extern void sigqueue_free(struct sigqueue *); | 1534 | extern void sigqueue_free(struct sigqueue *); |
1535 | extern int send_sigqueue(int, struct sigqueue *, struct task_struct *); | 1535 | extern int send_sigqueue(int, struct sigqueue *, struct task_struct *); |
1536 | extern int send_group_sigqueue(int, struct sigqueue *, struct task_struct *); | 1536 | extern int send_group_sigqueue(int, struct sigqueue *, struct task_struct *); |
1537 | extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); | 1537 | extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); |
1538 | extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long); | 1538 | extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long); |
1539 | 1539 | ||
1540 | static inline int kill_cad_pid(int sig, int priv) | 1540 | static inline int kill_cad_pid(int sig, int priv) |
1541 | { | 1541 | { |
1542 | return kill_pid(cad_pid, sig, priv); | 1542 | return kill_pid(cad_pid, sig, priv); |
1543 | } | 1543 | } |
1544 | 1544 | ||
1545 | /* These can be the second arg to send_sig_info/send_group_sig_info. */ | 1545 | /* These can be the second arg to send_sig_info/send_group_sig_info. */ |
1546 | #define SEND_SIG_NOINFO ((struct siginfo *) 0) | 1546 | #define SEND_SIG_NOINFO ((struct siginfo *) 0) |
1547 | #define SEND_SIG_PRIV ((struct siginfo *) 1) | 1547 | #define SEND_SIG_PRIV ((struct siginfo *) 1) |
1548 | #define SEND_SIG_FORCED ((struct siginfo *) 2) | 1548 | #define SEND_SIG_FORCED ((struct siginfo *) 2) |
1549 | 1549 | ||
1550 | static inline int is_si_special(const struct siginfo *info) | 1550 | static inline int is_si_special(const struct siginfo *info) |
1551 | { | 1551 | { |
1552 | return info <= SEND_SIG_FORCED; | 1552 | return info <= SEND_SIG_FORCED; |
1553 | } | 1553 | } |
1554 | 1554 | ||
1555 | /* True if we are on the alternate signal stack. */ | 1555 | /* True if we are on the alternate signal stack. */ |
1556 | 1556 | ||
1557 | static inline int on_sig_stack(unsigned long sp) | 1557 | static inline int on_sig_stack(unsigned long sp) |
1558 | { | 1558 | { |
1559 | return (sp - current->sas_ss_sp < current->sas_ss_size); | 1559 | return (sp - current->sas_ss_sp < current->sas_ss_size); |
1560 | } | 1560 | } |
1561 | 1561 | ||
1562 | static inline int sas_ss_flags(unsigned long sp) | 1562 | static inline int sas_ss_flags(unsigned long sp) |
1563 | { | 1563 | { |
1564 | return (current->sas_ss_size == 0 ? SS_DISABLE | 1564 | return (current->sas_ss_size == 0 ? SS_DISABLE |
1565 | : on_sig_stack(sp) ? SS_ONSTACK : 0); | 1565 | : on_sig_stack(sp) ? SS_ONSTACK : 0); |
1566 | } | 1566 | } |
1567 | 1567 | ||
1568 | /* | 1568 | /* |
1569 | * Routines for handling mm_structs | 1569 | * Routines for handling mm_structs |
1570 | */ | 1570 | */ |
1571 | extern struct mm_struct * mm_alloc(void); | 1571 | extern struct mm_struct * mm_alloc(void); |
1572 | 1572 | ||
1573 | /* mmdrop drops the mm and the page tables */ | 1573 | /* mmdrop drops the mm and the page tables */ |
1574 | extern void FASTCALL(__mmdrop(struct mm_struct *)); | 1574 | extern void FASTCALL(__mmdrop(struct mm_struct *)); |
1575 | static inline void mmdrop(struct mm_struct * mm) | 1575 | static inline void mmdrop(struct mm_struct * mm) |
1576 | { | 1576 | { |
1577 | if (unlikely(atomic_dec_and_test(&mm->mm_count))) | 1577 | if (unlikely(atomic_dec_and_test(&mm->mm_count))) |
1578 | __mmdrop(mm); | 1578 | __mmdrop(mm); |
1579 | } | 1579 | } |
1580 | 1580 | ||
1581 | /* mmput gets rid of the mappings and all user-space */ | 1581 | /* mmput gets rid of the mappings and all user-space */ |
1582 | extern void mmput(struct mm_struct *); | 1582 | extern void mmput(struct mm_struct *); |
1583 | /* Grab a reference to a task's mm, if it is not already going away */ | 1583 | /* Grab a reference to a task's mm, if it is not already going away */ |
1584 | extern struct mm_struct *get_task_mm(struct task_struct *task); | 1584 | extern struct mm_struct *get_task_mm(struct task_struct *task); |
1585 | /* Remove the current task's stale references to the old mm_struct */ | 1585 | /* Remove the current task's stale references to the old mm_struct */ |
1586 | extern void mm_release(struct task_struct *, struct mm_struct *); | 1586 | extern void mm_release(struct task_struct *, struct mm_struct *); |
1587 | 1587 | ||
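As the two comments above indicate, the mm helpers manage two distinct reference counts: mmdrop() releases an mm_count reference, which keeps the struct mm_struct and its page tables allocated, while mmput() releases an mm_users reference, which keeps the user-space mappings alive. A kernel thread that only needs the struct itself (e.g. to context-switch away from it) typically holds mm_count via atomic_inc(&mm->mm_count), whereas code that actually touches user memory needs an mm_users reference such as the one get_task_mm() takes.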
1588 | extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); | 1588 | extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); |
1589 | extern void flush_thread(void); | 1589 | extern void flush_thread(void); |
1590 | extern void exit_thread(void); | 1590 | extern void exit_thread(void); |
1591 | 1591 | ||
1592 | extern void exit_files(struct task_struct *); | 1592 | extern void exit_files(struct task_struct *); |
1593 | extern void __cleanup_signal(struct signal_struct *); | 1593 | extern void __cleanup_signal(struct signal_struct *); |
1594 | extern void __cleanup_sighand(struct sighand_struct *); | 1594 | extern void __cleanup_sighand(struct sighand_struct *); |
1595 | extern void exit_itimers(struct signal_struct *); | 1595 | extern void exit_itimers(struct signal_struct *); |
1596 | 1596 | ||
1597 | extern NORET_TYPE void do_group_exit(int); | 1597 | extern NORET_TYPE void do_group_exit(int); |
1598 | 1598 | ||
1599 | extern void daemonize(const char *, ...); | 1599 | extern void daemonize(const char *, ...); |
1600 | extern int allow_signal(int); | 1600 | extern int allow_signal(int); |
1601 | extern int disallow_signal(int); | 1601 | extern int disallow_signal(int); |
1602 | 1602 | ||
1603 | extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *); | 1603 | extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *); |
1604 | extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); | 1604 | extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); |
1605 | struct task_struct *fork_idle(int); | 1605 | struct task_struct *fork_idle(int); |
1606 | 1606 | ||
1607 | extern void set_task_comm(struct task_struct *tsk, char *from); | 1607 | extern void set_task_comm(struct task_struct *tsk, char *from); |
1608 | extern void get_task_comm(char *to, struct task_struct *tsk); | 1608 | extern void get_task_comm(char *to, struct task_struct *tsk); |
1609 | 1609 | ||
1610 | #ifdef CONFIG_SMP | 1610 | #ifdef CONFIG_SMP |
1611 | extern void wait_task_inactive(struct task_struct * p); | 1611 | extern void wait_task_inactive(struct task_struct * p); |
1612 | #else | 1612 | #else |
1613 | #define wait_task_inactive(p) do { } while (0) | 1613 | #define wait_task_inactive(p) do { } while (0) |
1614 | #endif | 1614 | #endif |
1615 | 1615 | ||
1616 | #define remove_parent(p) list_del_init(&(p)->sibling) | 1616 | #define remove_parent(p) list_del_init(&(p)->sibling) |
1617 | #define add_parent(p) list_add_tail(&(p)->sibling,&(p)->parent->children) | 1617 | #define add_parent(p) list_add_tail(&(p)->sibling,&(p)->parent->children) |
1618 | 1618 | ||
1619 | #define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks) | 1619 | #define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks) |
1620 | 1620 | ||
1621 | #define for_each_process(p) \ | 1621 | #define for_each_process(p) \ |
1622 | for (p = &init_task ; (p = next_task(p)) != &init_task ; ) | 1622 | for (p = &init_task ; (p = next_task(p)) != &init_task ; ) |
1623 | 1623 | ||
1624 | /* | 1624 | /* |
1625 | * Careful: do_each_thread/while_each_thread is a double loop so | 1625 | * Careful: do_each_thread/while_each_thread is a double loop so |
1626 | * 'break' will not work as expected - use goto instead. | 1626 | * 'break' will not work as expected - use goto instead. |
1627 | */ | 1627 | */ |
1628 | #define do_each_thread(g, t) \ | 1628 | #define do_each_thread(g, t) \ |
1629 | for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do | 1629 | for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do |
1630 | 1630 | ||
1631 | #define while_each_thread(g, t) \ | 1631 | #define while_each_thread(g, t) \ |
1632 | while ((t = next_thread(t)) != g) | 1632 | while ((t = next_thread(t)) != g) |
1633 | 1633 | ||
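Because do_each_thread()/while_each_thread() expand to a nested double loop, a plain 'break' only leaves the inner thread loop, as the comment above warns. A minimal sketch of the recommended goto pattern (the helper name and the match() predicate are hypothetical; tasklist_lock protection is assumed):

static struct task_struct *find_first_match(int (*match)(struct task_struct *))
{
	struct task_struct *g, *t;
	struct task_struct *found = NULL;

	read_lock(&tasklist_lock);
	do_each_thread(g, t) {
		if (match(t)) {
			found = t;
			get_task_struct(found);	/* keep it valid after the lock is dropped */
			goto out;		/* 'break' would only exit the inner loop */
		}
	} while_each_thread(g, t);
out:
	read_unlock(&tasklist_lock);
	return found;
}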
1634 | /* de_thread depends on thread_group_leader not being a pid based check */ | 1634 | /* de_thread depends on thread_group_leader not being a pid based check */ |
1635 | #define thread_group_leader(p) (p == p->group_leader) | 1635 | #define thread_group_leader(p) (p == p->group_leader) |
1636 | 1636 | ||
1637 | /* Due to the insanities of de_thread it is possible for a process | 1637 | /* Due to the insanities of de_thread it is possible for a process |
1638 | * to have the pid of the thread group leader without actually being | 1638 | * to have the pid of the thread group leader without actually being |
1639 | * the thread group leader. For iteration through the pids in proc | 1639 | * the thread group leader. For iteration through the pids in proc |
1640 | * all we care about is that we have a task with the appropriate | 1640 | * all we care about is that we have a task with the appropriate |
1641 | * pid, we don't actually care if we have the right task. | 1641 | * pid, we don't actually care if we have the right task. |
1642 | */ | 1642 | */ |
1643 | static inline int has_group_leader_pid(struct task_struct *p) | 1643 | static inline int has_group_leader_pid(struct task_struct *p) |
1644 | { | 1644 | { |
1645 | return p->pid == p->tgid; | 1645 | return p->pid == p->tgid; |
1646 | } | 1646 | } |
1647 | 1647 | ||
1648 | static inline struct task_struct *next_thread(const struct task_struct *p) | 1648 | static inline struct task_struct *next_thread(const struct task_struct *p) |
1649 | { | 1649 | { |
1650 | return list_entry(rcu_dereference(p->thread_group.next), | 1650 | return list_entry(rcu_dereference(p->thread_group.next), |
1651 | struct task_struct, thread_group); | 1651 | struct task_struct, thread_group); |
1652 | } | 1652 | } |
1653 | 1653 | ||
1654 | static inline int thread_group_empty(struct task_struct *p) | 1654 | static inline int thread_group_empty(struct task_struct *p) |
1655 | { | 1655 | { |
1656 | return list_empty(&p->thread_group); | 1656 | return list_empty(&p->thread_group); |
1657 | } | 1657 | } |
1658 | 1658 | ||
1659 | #define delay_group_leader(p) \ | 1659 | #define delay_group_leader(p) \ |
1660 | (thread_group_leader(p) && !thread_group_empty(p)) | 1660 | (thread_group_leader(p) && !thread_group_empty(p)) |
1661 | 1661 | ||
1662 | /* | 1662 | /* |
1663 | * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring | 1663 | * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring |
1664 | * subscriptions and synchronises with wait4(). Also used in procfs. Also | 1664 | * subscriptions and synchronises with wait4(). Also used in procfs. Also |
1665 | * pins the final release of task.io_context. Also protects ->cpuset. | 1665 | * pins the final release of task.io_context. Also protects ->cpuset. |
1666 | * | 1666 | * |
1667 | * Nests both inside and outside of read_lock(&tasklist_lock). | 1667 | * Nests both inside and outside of read_lock(&tasklist_lock). |
1668 | * It must not be nested with write_lock_irq(&tasklist_lock), | 1668 | * It must not be nested with write_lock_irq(&tasklist_lock), |
1669 | * neither inside nor outside. | 1669 | * neither inside nor outside. |
1670 | */ | 1670 | */ |
1671 | static inline void task_lock(struct task_struct *p) | 1671 | static inline void task_lock(struct task_struct *p) |
1672 | { | 1672 | { |
1673 | spin_lock(&p->alloc_lock); | 1673 | spin_lock(&p->alloc_lock); |
1674 | } | 1674 | } |
1675 | 1675 | ||
1676 | static inline void task_unlock(struct task_struct *p) | 1676 | static inline void task_unlock(struct task_struct *p) |
1677 | { | 1677 | { |
1678 | spin_unlock(&p->alloc_lock); | 1678 | spin_unlock(&p->alloc_lock); |
1679 | } | 1679 | } |
1680 | 1680 | ||
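As a usage illustration of task_lock()/task_unlock(), here is a simplified sketch in the spirit of the get_task_mm() helper declared earlier (the function name is hypothetical and the real helper performs additional checks, e.g. for kernel threads that have merely borrowed an mm):

static struct mm_struct *get_task_mm_sketch(struct task_struct *task)
{
	struct mm_struct *mm;

	task_lock(task);	/* stabilizes task->mm against concurrent exec/exit */
	mm = task->mm;
	if (mm)
		atomic_inc(&mm->mm_users);	/* take a user reference while locked */
	task_unlock(task);

	return mm;
}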
1681 | extern struct sighand_struct *lock_task_sighand(struct task_struct *tsk, | 1681 | extern struct sighand_struct *lock_task_sighand(struct task_struct *tsk, |
1682 | unsigned long *flags); | 1682 | unsigned long *flags); |
1683 | 1683 | ||
1684 | static inline void unlock_task_sighand(struct task_struct *tsk, | 1684 | static inline void unlock_task_sighand(struct task_struct *tsk, |
1685 | unsigned long *flags) | 1685 | unsigned long *flags) |
1686 | { | 1686 | { |
1687 | spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); | 1687 | spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); |
1688 | } | 1688 | } |
1689 | 1689 | ||
1690 | #ifndef __HAVE_THREAD_FUNCTIONS | 1690 | #ifndef __HAVE_THREAD_FUNCTIONS |
1691 | 1691 | ||
1692 | #define task_thread_info(task) ((struct thread_info *)(task)->stack) | 1692 | #define task_thread_info(task) ((struct thread_info *)(task)->stack) |
1693 | #define task_stack_page(task) ((task)->stack) | 1693 | #define task_stack_page(task) ((task)->stack) |
1694 | 1694 | ||
1695 | static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) | 1695 | static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) |
1696 | { | 1696 | { |
1697 | *task_thread_info(p) = *task_thread_info(org); | 1697 | *task_thread_info(p) = *task_thread_info(org); |
1698 | task_thread_info(p)->task = p; | 1698 | task_thread_info(p)->task = p; |
1699 | } | 1699 | } |
1700 | 1700 | ||
1701 | static inline unsigned long *end_of_stack(struct task_struct *p) | 1701 | static inline unsigned long *end_of_stack(struct task_struct *p) |
1702 | { | 1702 | { |
1703 | return (unsigned long *)(task_thread_info(p) + 1); | 1703 | return (unsigned long *)(task_thread_info(p) + 1); |
1704 | } | 1704 | } |
1705 | 1705 | ||
1706 | #endif | 1706 | #endif |
1707 | 1707 | ||
1708 | /* set thread flags in other task's structures | 1708 | /* set thread flags in other task's structures |
1709 | * - see asm/thread_info.h for TIF_xxxx flags available | 1709 | * - see asm/thread_info.h for TIF_xxxx flags available |
1710 | */ | 1710 | */ |
1711 | static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) | 1711 | static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) |
1712 | { | 1712 | { |
1713 | set_ti_thread_flag(task_thread_info(tsk), flag); | 1713 | set_ti_thread_flag(task_thread_info(tsk), flag); |
1714 | } | 1714 | } |
1715 | 1715 | ||
1716 | static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) | 1716 | static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) |
1717 | { | 1717 | { |
1718 | clear_ti_thread_flag(task_thread_info(tsk), flag); | 1718 | clear_ti_thread_flag(task_thread_info(tsk), flag); |
1719 | } | 1719 | } |
1720 | 1720 | ||
1721 | static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) | 1721 | static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) |
1722 | { | 1722 | { |
1723 | return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); | 1723 | return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); |
1724 | } | 1724 | } |
1725 | 1725 | ||
1726 | static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) | 1726 | static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) |
1727 | { | 1727 | { |
1728 | return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); | 1728 | return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); |
1729 | } | 1729 | } |
1730 | 1730 | ||
1731 | static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) | 1731 | static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) |
1732 | { | 1732 | { |
1733 | return test_ti_thread_flag(task_thread_info(tsk), flag); | 1733 | return test_ti_thread_flag(task_thread_info(tsk), flag); |
1734 | } | 1734 | } |
1735 | 1735 | ||
1736 | static inline void set_tsk_need_resched(struct task_struct *tsk) | 1736 | static inline void set_tsk_need_resched(struct task_struct *tsk) |
1737 | { | 1737 | { |
1738 | set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); | 1738 | set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); |
1739 | } | 1739 | } |
1740 | 1740 | ||
1741 | static inline void clear_tsk_need_resched(struct task_struct *tsk) | 1741 | static inline void clear_tsk_need_resched(struct task_struct *tsk) |
1742 | { | 1742 | { |
1743 | clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); | 1743 | clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); |
1744 | } | 1744 | } |
1745 | 1745 | ||
1746 | static inline int signal_pending(struct task_struct *p) | 1746 | static inline int signal_pending(struct task_struct *p) |
1747 | { | 1747 | { |
1748 | return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); | 1748 | return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); |
1749 | } | 1749 | } |
1750 | 1750 | ||
1751 | static inline int need_resched(void) | 1751 | static inline int need_resched(void) |
1752 | { | 1752 | { |
1753 | return unlikely(test_thread_flag(TIF_NEED_RESCHED)); | 1753 | return unlikely(test_thread_flag(TIF_NEED_RESCHED)); |
1754 | } | 1754 | } |
1755 | 1755 | ||
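A hedged illustration of how the thread-flag wrappers above compose (the helper is hypothetical; the scheduler's own resched path does this under the runqueue lock and with the appropriate memory barriers, so this only shows the accessors, not a safe replacement for it):

static void request_resched(struct task_struct *p)
{
	if (!test_tsk_thread_flag(p, TIF_NEED_RESCHED)) {
		set_tsk_need_resched(p);
		kick_process(p);	/* on SMP, prod the CPU currently running 'p' */
	}
}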
1756 | /* | 1756 | /* |
1757 | * cond_resched() and cond_resched_lock(): latency reduction via | 1757 | * cond_resched() and cond_resched_lock(): latency reduction via |
1758 | * explicit rescheduling in places that are safe. The return | 1758 | * explicit rescheduling in places that are safe. The return |
1759 | * value indicates whether a reschedule was done in fact. | 1759 | * value indicates whether a reschedule was done in fact. |
1760 | * cond_resched_lock() will drop the spinlock before scheduling, | 1760 | * cond_resched_lock() will drop the spinlock before scheduling, |
1761 | * cond_resched_softirq() will enable bhs before scheduling. | 1761 | * cond_resched_softirq() will enable bhs before scheduling. |
1762 | */ | 1762 | */ |
1763 | extern int cond_resched(void); | 1763 | extern int cond_resched(void); |
1764 | extern int cond_resched_lock(spinlock_t * lock); | 1764 | extern int cond_resched_lock(spinlock_t * lock); |
1765 | extern int cond_resched_softirq(void); | 1765 | extern int cond_resched_softirq(void); |
1766 | 1766 | ||
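A minimal sketch of how cond_resched_lock() is typically used from a long-running loop that holds a spinlock (the helper and its arguments are hypothetical):

static void zero_table(unsigned long *table, unsigned long entries, spinlock_t *lock)
{
	unsigned long i;

	spin_lock(lock);
	for (i = 0; i < entries; i++) {
		table[i] = 0;
		/*
		 * If another task is spinning on 'lock' or a reschedule is due,
		 * drop the lock, schedule, and re-acquire it before continuing.
		 */
		cond_resched_lock(lock);
	}
	spin_unlock(lock);
}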
1767 | /* | 1767 | /* |
1768 | * Does a critical section need to be broken due to another | 1768 | * Does a critical section need to be broken due to another |
1769 | * task waiting?: | 1769 | * task waiting?: |
1770 | */ | 1770 | */ |
1771 | #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) | 1771 | #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP) |
1772 | # define need_lockbreak(lock) ((lock)->break_lock) | 1772 | # define need_lockbreak(lock) ((lock)->break_lock) |
1773 | #else | 1773 | #else |
1774 | # define need_lockbreak(lock) 0 | 1774 | # define need_lockbreak(lock) 0 |
1775 | #endif | 1775 | #endif |
1776 | 1776 | ||
1777 | /* | 1777 | /* |
1778 | * Does a critical section need to be broken due to another | 1778 | * Does a critical section need to be broken due to another |
1779 | * task waiting or preemption being signalled: | 1779 | * task waiting or preemption being signalled: |
1780 | */ | 1780 | */ |
1781 | static inline int lock_need_resched(spinlock_t *lock) | 1781 | static inline int lock_need_resched(spinlock_t *lock) |
1782 | { | 1782 | { |
1783 | if (need_lockbreak(lock) || need_resched()) | 1783 | if (need_lockbreak(lock) || need_resched()) |
1784 | return 1; | 1784 | return 1; |
1785 | return 0; | 1785 | return 0; |
1786 | } | 1786 | } |
1787 | 1787 | ||
1788 | /* | 1788 | /* |
1789 | * Reevaluate whether the task has signals pending delivery. | 1789 | * Reevaluate whether the task has signals pending delivery. |
1790 | * Wake the task if so. | 1790 | * Wake the task if so. |
1791 | * This is required every time the blocked sigset_t changes. | 1791 | * This is required every time the blocked sigset_t changes. |
1792 | * callers must hold sighand->siglock. | 1792 | * callers must hold sighand->siglock. |
1793 | */ | 1793 | */ |
1794 | extern void recalc_sigpending_and_wake(struct task_struct *t); | 1794 | extern void recalc_sigpending_and_wake(struct task_struct *t); |
1795 | extern void recalc_sigpending(void); | 1795 | extern void recalc_sigpending(void); |
1796 | 1796 | ||
1797 | extern void signal_wake_up(struct task_struct *t, int resume_stopped); | 1797 | extern void signal_wake_up(struct task_struct *t, int resume_stopped); |
1798 | 1798 | ||
1799 | /* | 1799 | /* |
1800 | * Wrappers for p->thread_info->cpu access. No-op on UP. | 1800 | * Wrappers for p->thread_info->cpu access. No-op on UP. |
1801 | */ | 1801 | */ |
1802 | #ifdef CONFIG_SMP | 1802 | #ifdef CONFIG_SMP |
1803 | 1803 | ||
1804 | static inline unsigned int task_cpu(const struct task_struct *p) | 1804 | static inline unsigned int task_cpu(const struct task_struct *p) |
1805 | { | 1805 | { |
1806 | return task_thread_info(p)->cpu; | 1806 | return task_thread_info(p)->cpu; |
1807 | } | 1807 | } |
1808 | 1808 | ||
1809 | extern void set_task_cpu(struct task_struct *p, unsigned int cpu); | 1809 | extern void set_task_cpu(struct task_struct *p, unsigned int cpu); |
1810 | 1810 | ||
1811 | #else | 1811 | #else |
1812 | 1812 | ||
1813 | static inline unsigned int task_cpu(const struct task_struct *p) | 1813 | static inline unsigned int task_cpu(const struct task_struct *p) |
1814 | { | 1814 | { |
1815 | return 0; | 1815 | return 0; |
1816 | } | 1816 | } |
1817 | 1817 | ||
1818 | static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) | 1818 | static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) |
1819 | { | 1819 | { |
1820 | } | 1820 | } |
1821 | 1821 | ||
1822 | #endif /* CONFIG_SMP */ | 1822 | #endif /* CONFIG_SMP */ |
1823 | 1823 | ||
1824 | #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT | 1824 | #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT |
1825 | extern void arch_pick_mmap_layout(struct mm_struct *mm); | 1825 | extern void arch_pick_mmap_layout(struct mm_struct *mm); |
1826 | #else | 1826 | #else |
1827 | static inline void arch_pick_mmap_layout(struct mm_struct *mm) | 1827 | static inline void arch_pick_mmap_layout(struct mm_struct *mm) |
1828 | { | 1828 | { |
1829 | mm->mmap_base = TASK_UNMAPPED_BASE; | 1829 | mm->mmap_base = TASK_UNMAPPED_BASE; |
1830 | mm->get_unmapped_area = arch_get_unmapped_area; | 1830 | mm->get_unmapped_area = arch_get_unmapped_area; |
1831 | mm->unmap_area = arch_unmap_area; | 1831 | mm->unmap_area = arch_unmap_area; |
1832 | } | 1832 | } |
1833 | #endif | 1833 | #endif |
1834 | 1834 | ||
1835 | extern long sched_setaffinity(pid_t pid, cpumask_t new_mask); | 1835 | extern long sched_setaffinity(pid_t pid, cpumask_t new_mask); |
1836 | extern long sched_getaffinity(pid_t pid, cpumask_t *mask); | 1836 | extern long sched_getaffinity(pid_t pid, cpumask_t *mask); |
1837 | 1837 | ||
1838 | extern int sched_mc_power_savings, sched_smt_power_savings; | 1838 | extern int sched_mc_power_savings, sched_smt_power_savings; |
1839 | 1839 | ||
1840 | extern void normalize_rt_tasks(void); | 1840 | extern void normalize_rt_tasks(void); |
1841 | 1841 | ||
1842 | #ifdef CONFIG_TASK_XACCT | 1842 | #ifdef CONFIG_TASK_XACCT |
1843 | static inline void add_rchar(struct task_struct *tsk, ssize_t amt) | 1843 | static inline void add_rchar(struct task_struct *tsk, ssize_t amt) |
1844 | { | 1844 | { |
1845 | tsk->rchar += amt; | 1845 | tsk->rchar += amt; |
1846 | } | 1846 | } |
1847 | 1847 | ||
1848 | static inline void add_wchar(struct task_struct *tsk, ssize_t amt) | 1848 | static inline void add_wchar(struct task_struct *tsk, ssize_t amt) |
1849 | { | 1849 | { |
1850 | tsk->wchar += amt; | 1850 | tsk->wchar += amt; |
1851 | } | 1851 | } |
1852 | 1852 | ||
1853 | static inline void inc_syscr(struct task_struct *tsk) | 1853 | static inline void inc_syscr(struct task_struct *tsk) |
1854 | { | 1854 | { |
1855 | tsk->syscr++; | 1855 | tsk->syscr++; |
1856 | } | 1856 | } |
1857 | 1857 | ||
1858 | static inline void inc_syscw(struct task_struct *tsk) | 1858 | static inline void inc_syscw(struct task_struct *tsk) |
1859 | { | 1859 | { |
1860 | tsk->syscw++; | 1860 | tsk->syscw++; |
1861 | } | 1861 | } |
1862 | #else | 1862 | #else |
1863 | static inline void add_rchar(struct task_struct *tsk, ssize_t amt) | 1863 | static inline void add_rchar(struct task_struct *tsk, ssize_t amt) |
1864 | { | 1864 | { |
1865 | } | 1865 | } |
1866 | 1866 | ||
1867 | static inline void add_wchar(struct task_struct *tsk, ssize_t amt) | 1867 | static inline void add_wchar(struct task_struct *tsk, ssize_t amt) |
1868 | { | 1868 | { |
1869 | } | 1869 | } |
1870 | 1870 | ||
1871 | static inline void inc_syscr(struct task_struct *tsk) | 1871 | static inline void inc_syscr(struct task_struct *tsk) |
1872 | { | 1872 | { |
1873 | } | 1873 | } |
1874 | 1874 | ||
1875 | static inline void inc_syscw(struct task_struct *tsk) | 1875 | static inline void inc_syscw(struct task_struct *tsk) |
1876 | { | 1876 | { |
1877 | } | 1877 | } |
1878 | #endif | 1878 | #endif |
1879 | 1879 | ||
1880 | #endif /* __KERNEL__ */ | 1880 | #endif /* __KERNEL__ */ |
1881 | 1881 | ||
1882 | #endif | 1882 | #endif |
1883 | 1883 |
kernel/sched.c
1 | /* | 1 | /* |
2 | * kernel/sched.c | 2 | * kernel/sched.c |
3 | * | 3 | * |
4 | * Kernel scheduler and related syscalls | 4 | * Kernel scheduler and related syscalls |
5 | * | 5 | * |
6 | * Copyright (C) 1991-2002 Linus Torvalds | 6 | * Copyright (C) 1991-2002 Linus Torvalds |
7 | * | 7 | * |
8 | * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and | 8 | * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and |
9 | * make semaphores SMP safe | 9 | * make semaphores SMP safe |
10 | * 1998-11-19 Implemented schedule_timeout() and related stuff | 10 | * 1998-11-19 Implemented schedule_timeout() and related stuff |
11 | * by Andrea Arcangeli | 11 | * by Andrea Arcangeli |
12 | * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: | 12 | * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: |
13 | * hybrid priority-list and round-robin design with | 13 | * hybrid priority-list and round-robin design with |
14 | * an array-switch method of distributing timeslices | 14 | * an array-switch method of distributing timeslices |
15 | * and per-CPU runqueues. Cleanups and useful suggestions | 15 | * and per-CPU runqueues. Cleanups and useful suggestions |
16 | * by Davide Libenzi, preemptible kernel bits by Robert Love. | 16 | * by Davide Libenzi, preemptible kernel bits by Robert Love. |
17 | * 2003-09-03 Interactivity tuning by Con Kolivas. | 17 | * 2003-09-03 Interactivity tuning by Con Kolivas. |
18 | * 2004-04-02 Scheduler domains code by Nick Piggin | 18 | * 2004-04-02 Scheduler domains code by Nick Piggin |
19 | * 2007-04-15 Work begun on replacing all interactivity tuning with a | 19 | * 2007-04-15 Work begun on replacing all interactivity tuning with a |
20 | * fair scheduling design by Con Kolivas. | 20 | * fair scheduling design by Con Kolivas. |
21 | * 2007-05-05 Load balancing (smp-nice) and other improvements | 21 | * 2007-05-05 Load balancing (smp-nice) and other improvements |
22 | * by Peter Williams | 22 | * by Peter Williams |
23 | * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith | 23 | * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith |
24 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri | 24 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri |
25 | */ | 25 | */ |
26 | 26 | ||
27 | #include <linux/mm.h> | 27 | #include <linux/mm.h> |
28 | #include <linux/module.h> | 28 | #include <linux/module.h> |
29 | #include <linux/nmi.h> | 29 | #include <linux/nmi.h> |
30 | #include <linux/init.h> | 30 | #include <linux/init.h> |
31 | #include <linux/uaccess.h> | 31 | #include <linux/uaccess.h> |
32 | #include <linux/highmem.h> | 32 | #include <linux/highmem.h> |
33 | #include <linux/smp_lock.h> | 33 | #include <linux/smp_lock.h> |
34 | #include <asm/mmu_context.h> | 34 | #include <asm/mmu_context.h> |
35 | #include <linux/interrupt.h> | 35 | #include <linux/interrupt.h> |
36 | #include <linux/capability.h> | 36 | #include <linux/capability.h> |
37 | #include <linux/completion.h> | 37 | #include <linux/completion.h> |
38 | #include <linux/kernel_stat.h> | 38 | #include <linux/kernel_stat.h> |
39 | #include <linux/debug_locks.h> | 39 | #include <linux/debug_locks.h> |
40 | #include <linux/security.h> | 40 | #include <linux/security.h> |
41 | #include <linux/notifier.h> | 41 | #include <linux/notifier.h> |
42 | #include <linux/profile.h> | 42 | #include <linux/profile.h> |
43 | #include <linux/freezer.h> | 43 | #include <linux/freezer.h> |
44 | #include <linux/vmalloc.h> | 44 | #include <linux/vmalloc.h> |
45 | #include <linux/blkdev.h> | 45 | #include <linux/blkdev.h> |
46 | #include <linux/delay.h> | 46 | #include <linux/delay.h> |
47 | #include <linux/smp.h> | 47 | #include <linux/smp.h> |
48 | #include <linux/threads.h> | 48 | #include <linux/threads.h> |
49 | #include <linux/timer.h> | 49 | #include <linux/timer.h> |
50 | #include <linux/rcupdate.h> | 50 | #include <linux/rcupdate.h> |
51 | #include <linux/cpu.h> | 51 | #include <linux/cpu.h> |
52 | #include <linux/cpuset.h> | 52 | #include <linux/cpuset.h> |
53 | #include <linux/percpu.h> | 53 | #include <linux/percpu.h> |
54 | #include <linux/kthread.h> | 54 | #include <linux/kthread.h> |
55 | #include <linux/seq_file.h> | 55 | #include <linux/seq_file.h> |
56 | #include <linux/sysctl.h> | 56 | #include <linux/sysctl.h> |
57 | #include <linux/syscalls.h> | 57 | #include <linux/syscalls.h> |
58 | #include <linux/times.h> | 58 | #include <linux/times.h> |
59 | #include <linux/tsacct_kern.h> | 59 | #include <linux/tsacct_kern.h> |
60 | #include <linux/kprobes.h> | 60 | #include <linux/kprobes.h> |
61 | #include <linux/delayacct.h> | 61 | #include <linux/delayacct.h> |
62 | #include <linux/reciprocal_div.h> | 62 | #include <linux/reciprocal_div.h> |
63 | #include <linux/unistd.h> | 63 | #include <linux/unistd.h> |
64 | 64 | ||
65 | #include <asm/tlb.h> | 65 | #include <asm/tlb.h> |
66 | 66 | ||
67 | /* | 67 | /* |
68 | * Scheduler clock - returns current time in nanosec units. | 68 | * Scheduler clock - returns current time in nanosec units. |
69 | * This is the default implementation. | 69 | * This is the default implementation. |
70 | * Architectures and sub-architectures can override this. | 70 | * Architectures and sub-architectures can override this. |
71 | */ | 71 | */ |
72 | unsigned long long __attribute__((weak)) sched_clock(void) | 72 | unsigned long long __attribute__((weak)) sched_clock(void) |
73 | { | 73 | { |
74 | return (unsigned long long)jiffies * (1000000000 / HZ); | 74 | return (unsigned long long)jiffies * (1000000000 / HZ); |
75 | } | 75 | } |
76 | 76 | ||
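For example, with HZ = 1000 this weak default advances in steps of 1000000000 / 1000 = 1,000,000 ns per jiffy, so the scheduler clock only has tick resolution until an architecture overrides sched_clock() with something finer grained.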
77 | /* | 77 | /* |
78 | * Convert user-nice values [ -20 ... 0 ... 19 ] | 78 | * Convert user-nice values [ -20 ... 0 ... 19 ] |
79 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | 79 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], |
80 | * and back. | 80 | * and back. |
81 | */ | 81 | */ |
82 | #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) | 82 | #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) |
83 | #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) | 83 | #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) |
84 | #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) | 84 | #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) |
85 | 85 | ||
86 | /* | 86 | /* |
87 | * 'User priority' is the nice value converted to something we | 87 | * 'User priority' is the nice value converted to something we |
88 | * can work with better when scaling various scheduler parameters, | 88 | * can work with better when scaling various scheduler parameters, |
89 | * it's a [ 0 ... 39 ] range. | 89 | * it's a [ 0 ... 39 ] range. |
90 | */ | 90 | */ |
91 | #define USER_PRIO(p) ((p)-MAX_RT_PRIO) | 91 | #define USER_PRIO(p) ((p)-MAX_RT_PRIO) |
92 | #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) | 92 | #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) |
93 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | 93 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) |
94 | 94 | ||
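Worked example, assuming the usual MAX_RT_PRIO of 100 and MAX_PRIO of 140: NICE_TO_PRIO(0) = 100 + 0 + 20 = 120, NICE_TO_PRIO(-20) = 100 and NICE_TO_PRIO(19) = 139, so static priorities 100..139 map through USER_PRIO() to the user-priority range 0..39 and MAX_USER_PRIO = USER_PRIO(140) = 40.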
95 | /* | 95 | /* |
96 | * Some helpers for converting nanosecond timing to jiffy resolution | 96 | * Some helpers for converting nanosecond timing to jiffy resolution |
97 | */ | 97 | */ |
98 | #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) | 98 | #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) |
99 | #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) | 99 | #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) |
100 | 100 | ||
101 | #define NICE_0_LOAD SCHED_LOAD_SCALE | 101 | #define NICE_0_LOAD SCHED_LOAD_SCALE |
102 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT | 102 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT |
103 | 103 | ||
104 | /* | 104 | /* |
105 | * These are the 'tuning knobs' of the scheduler: | 105 | * These are the 'tuning knobs' of the scheduler: |
106 | * | 106 | * |
107 | * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), | 107 | * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), |
108 | * default timeslice is 100 msecs, maximum timeslice is 800 msecs. | 108 | * default timeslice is 100 msecs, maximum timeslice is 800 msecs. |
109 | * Timeslices get refilled after they expire. | 109 | * Timeslices get refilled after they expire. |
110 | */ | 110 | */ |
111 | #define MIN_TIMESLICE max(5 * HZ / 1000, 1) | 111 | #define MIN_TIMESLICE max(5 * HZ / 1000, 1) |
112 | #define DEF_TIMESLICE (100 * HZ / 1000) | 112 | #define DEF_TIMESLICE (100 * HZ / 1000) |
113 | 113 | ||
114 | #ifdef CONFIG_SMP | 114 | #ifdef CONFIG_SMP |
115 | /* | 115 | /* |
116 | * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) | 116 | * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) |
117 | * Since cpu_power is a 'constant', we can use a reciprocal divide. | 117 | * Since cpu_power is a 'constant', we can use a reciprocal divide. |
118 | */ | 118 | */ |
119 | static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) | 119 | static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) |
120 | { | 120 | { |
121 | return reciprocal_divide(load, sg->reciprocal_cpu_power); | 121 | return reciprocal_divide(load, sg->reciprocal_cpu_power); |
122 | } | 122 | } |
123 | 123 | ||
124 | /* | 124 | /* |
125 | * Each time a sched group cpu_power is changed, | 125 | * Each time a sched group cpu_power is changed, |
126 | * we must compute its reciprocal value | 126 | * we must compute its reciprocal value |
127 | */ | 127 | */ |
128 | static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) | 128 | static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) |
129 | { | 129 | { |
130 | sg->__cpu_power += val; | 130 | sg->__cpu_power += val; |
131 | sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); | 131 | sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); |
132 | } | 132 | } |
133 | #endif | 133 | #endif |
134 | 134 | ||
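The reciprocal divide used above works by precomputing an approximation of 2^32 / cpu_power once (that is what reciprocal_value() stores in sg->reciprocal_cpu_power), so that each later "division" is just a multiply and a shift. A hedged sketch of the idea, not the kernel's exact reciprocal_div implementation:

/* R should be precomputed as roughly 2^32 / k for the fixed divisor k. */
static inline u32 approx_div_by_k(u32 x, u32 R)
{
	return (u32)(((u64)x * R) >> 32);	/* ~ x / k, without a divide instruction */
}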
135 | #define SCALE_PRIO(x, prio) \ | 135 | #define SCALE_PRIO(x, prio) \ |
136 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) | 136 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) |
137 | 137 | ||
138 | /* | 138 | /* |
139 | * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] | 139 | * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] |
140 | * to time slice values: [800ms ... 100ms ... 5ms] | 140 | * to time slice values: [800ms ... 100ms ... 5ms] |
141 | */ | 141 | */ |
142 | static unsigned int static_prio_timeslice(int static_prio) | 142 | static unsigned int static_prio_timeslice(int static_prio) |
143 | { | 143 | { |
144 | if (static_prio == NICE_TO_PRIO(19)) | 144 | if (static_prio == NICE_TO_PRIO(19)) |
145 | return 1; | 145 | return 1; |
146 | 146 | ||
147 | if (static_prio < NICE_TO_PRIO(0)) | 147 | if (static_prio < NICE_TO_PRIO(0)) |
148 | return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); | 148 | return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); |
149 | else | 149 | else |
150 | return SCALE_PRIO(DEF_TIMESLICE, static_prio); | 150 | return SCALE_PRIO(DEF_TIMESLICE, static_prio); |
151 | } | 151 | } |
152 | 152 | ||
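Plugging in the defaults: a nice 0 task (static_prio 120) gets SCALE_PRIO(DEF_TIMESLICE, 120) = 100ms * (140 - 120) / 20 = 100ms, while a nice -20 task (static_prio 100) gets SCALE_PRIO(DEF_TIMESLICE * 4, 100) = 400ms * (140 - 100) / 20 = 800ms, matching the [800ms ... 100ms ... 5ms] range described in the comment (nice 19 is special-cased to a single jiffy).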
153 | static inline int rt_policy(int policy) | 153 | static inline int rt_policy(int policy) |
154 | { | 154 | { |
155 | if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) | 155 | if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) |
156 | return 1; | 156 | return 1; |
157 | return 0; | 157 | return 0; |
158 | } | 158 | } |
159 | 159 | ||
160 | static inline int task_has_rt_policy(struct task_struct *p) | 160 | static inline int task_has_rt_policy(struct task_struct *p) |
161 | { | 161 | { |
162 | return rt_policy(p->policy); | 162 | return rt_policy(p->policy); |
163 | } | 163 | } |
164 | 164 | ||
165 | /* | 165 | /* |
166 | * This is the priority-queue data structure of the RT scheduling class: | 166 | * This is the priority-queue data structure of the RT scheduling class: |
167 | */ | 167 | */ |
168 | struct rt_prio_array { | 168 | struct rt_prio_array { |
169 | DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ | 169 | DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ |
170 | struct list_head queue[MAX_RT_PRIO]; | 170 | struct list_head queue[MAX_RT_PRIO]; |
171 | }; | 171 | }; |
172 | 172 | ||
173 | struct load_stat { | 173 | struct load_stat { |
174 | struct load_weight load; | 174 | struct load_weight load; |
175 | u64 load_update_start, load_update_last; | 175 | u64 load_update_start, load_update_last; |
176 | unsigned long delta_fair, delta_exec, delta_stat; | 176 | unsigned long delta_fair, delta_exec, delta_stat; |
177 | }; | 177 | }; |
178 | 178 | ||
179 | /* CFS-related fields in a runqueue */ | 179 | /* CFS-related fields in a runqueue */ |
180 | struct cfs_rq { | 180 | struct cfs_rq { |
181 | struct load_weight load; | 181 | struct load_weight load; |
182 | unsigned long nr_running; | 182 | unsigned long nr_running; |
183 | 183 | ||
184 | s64 fair_clock; | 184 | s64 fair_clock; |
185 | u64 exec_clock; | 185 | u64 exec_clock; |
186 | s64 wait_runtime; | 186 | s64 wait_runtime; |
187 | u64 sleeper_bonus; | 187 | u64 sleeper_bonus; |
188 | unsigned long wait_runtime_overruns, wait_runtime_underruns; | 188 | unsigned long wait_runtime_overruns, wait_runtime_underruns; |
189 | 189 | ||
190 | struct rb_root tasks_timeline; | 190 | struct rb_root tasks_timeline; |
191 | struct rb_node *rb_leftmost; | 191 | struct rb_node *rb_leftmost; |
192 | struct rb_node *rb_load_balance_curr; | 192 | struct rb_node *rb_load_balance_curr; |
193 | #ifdef CONFIG_FAIR_GROUP_SCHED | 193 | #ifdef CONFIG_FAIR_GROUP_SCHED |
194 | /* 'curr' points to currently running entity on this cfs_rq. | 194 | /* 'curr' points to currently running entity on this cfs_rq. |
195 | * It is set to NULL otherwise (i.e when none are currently running). | 195 | * It is set to NULL otherwise (i.e when none are currently running). |
196 | */ | 196 | */ |
197 | struct sched_entity *curr; | 197 | struct sched_entity *curr; |
198 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | 198 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ |
199 | 199 | ||
200 | /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | 200 | /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in |
201 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities | 201 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities |
202 | * (like users, containers etc.) | 202 | * (like users, containers etc.) |
203 | * | 203 | * |
204 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | 204 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This |
205 | * list is used during load balance. | 205 | * list is used during load balance. |
206 | */ | 206 | */ |
207 | struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ | 207 | struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ |
208 | #endif | 208 | #endif |
209 | }; | 209 | }; |
210 | 210 | ||
211 | /* Real-Time classes' related field in a runqueue: */ | 211 | /* Real-Time classes' related field in a runqueue: */ |
212 | struct rt_rq { | 212 | struct rt_rq { |
213 | struct rt_prio_array active; | 213 | struct rt_prio_array active; |
214 | int rt_load_balance_idx; | 214 | int rt_load_balance_idx; |
215 | struct list_head *rt_load_balance_head, *rt_load_balance_curr; | 215 | struct list_head *rt_load_balance_head, *rt_load_balance_curr; |
216 | }; | 216 | }; |
217 | 217 | ||
218 | /* | 218 | /* |
219 | * This is the main, per-CPU runqueue data structure. | 219 | * This is the main, per-CPU runqueue data structure. |
220 | * | 220 | * |
221 | * Locking rule: those places that want to lock multiple runqueues | 221 | * Locking rule: those places that want to lock multiple runqueues |
222 | * (such as the load balancing or the thread migration code), lock | 222 | * (such as the load balancing or the thread migration code), lock |
223 | * acquire operations must be ordered by ascending &runqueue. | 223 | * acquire operations must be ordered by ascending &runqueue. |
224 | */ | 224 | */ |
225 | struct rq { | 225 | struct rq { |
226 | spinlock_t lock; /* runqueue lock */ | 226 | spinlock_t lock; /* runqueue lock */ |
227 | 227 | ||
228 | /* | 228 | /* |
229 | * nr_running and cpu_load should be in the same cacheline because | 229 | * nr_running and cpu_load should be in the same cacheline because |
230 | * remote CPUs use both these fields when doing load calculation. | 230 | * remote CPUs use both these fields when doing load calculation. |
231 | */ | 231 | */ |
232 | unsigned long nr_running; | 232 | unsigned long nr_running; |
233 | #define CPU_LOAD_IDX_MAX 5 | 233 | #define CPU_LOAD_IDX_MAX 5 |
234 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 234 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
235 | unsigned char idle_at_tick; | 235 | unsigned char idle_at_tick; |
236 | #ifdef CONFIG_NO_HZ | 236 | #ifdef CONFIG_NO_HZ |
237 | unsigned char in_nohz_recently; | 237 | unsigned char in_nohz_recently; |
238 | #endif | 238 | #endif |
239 | struct load_stat ls; /* capture load from *all* tasks on this cpu */ | 239 | struct load_stat ls; /* capture load from *all* tasks on this cpu */ |
240 | unsigned long nr_load_updates; | 240 | unsigned long nr_load_updates; |
241 | u64 nr_switches; | 241 | u64 nr_switches; |
242 | 242 | ||
243 | struct cfs_rq cfs; | 243 | struct cfs_rq cfs; |
244 | #ifdef CONFIG_FAIR_GROUP_SCHED | 244 | #ifdef CONFIG_FAIR_GROUP_SCHED |
245 | struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */ | 245 | struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */ |
246 | #endif | 246 | #endif |
247 | struct rt_rq rt; | 247 | struct rt_rq rt; |
248 | 248 | ||
249 | /* | 249 | /* |
250 | * This is part of a global counter where only the total sum | 250 | * This is part of a global counter where only the total sum |
251 | * over all CPUs matters. A task can increase this counter on | 251 | * over all CPUs matters. A task can increase this counter on |
252 | * one CPU and if it got migrated afterwards it may decrease | 252 | * one CPU and if it got migrated afterwards it may decrease |
253 | * it on another CPU. Always updated under the runqueue lock: | 253 | * it on another CPU. Always updated under the runqueue lock: |
254 | */ | 254 | */ |
255 | unsigned long nr_uninterruptible; | 255 | unsigned long nr_uninterruptible; |
256 | 256 | ||
257 | struct task_struct *curr, *idle; | 257 | struct task_struct *curr, *idle; |
258 | unsigned long next_balance; | 258 | unsigned long next_balance; |
259 | struct mm_struct *prev_mm; | 259 | struct mm_struct *prev_mm; |
260 | 260 | ||
261 | u64 clock, prev_clock_raw; | 261 | u64 clock, prev_clock_raw; |
262 | s64 clock_max_delta; | 262 | s64 clock_max_delta; |
263 | 263 | ||
264 | unsigned int clock_warps, clock_overflows; | 264 | unsigned int clock_warps, clock_overflows; |
265 | unsigned int clock_unstable_events; | 265 | unsigned int clock_unstable_events; |
266 | 266 | ||
267 | atomic_t nr_iowait; | 267 | atomic_t nr_iowait; |
268 | 268 | ||
269 | #ifdef CONFIG_SMP | 269 | #ifdef CONFIG_SMP |
270 | struct sched_domain *sd; | 270 | struct sched_domain *sd; |
271 | 271 | ||
272 | /* For active balancing */ | 272 | /* For active balancing */ |
273 | int active_balance; | 273 | int active_balance; |
274 | int push_cpu; | 274 | int push_cpu; |
275 | int cpu; /* cpu of this runqueue */ | 275 | int cpu; /* cpu of this runqueue */ |
276 | 276 | ||
277 | struct task_struct *migration_thread; | 277 | struct task_struct *migration_thread; |
278 | struct list_head migration_queue; | 278 | struct list_head migration_queue; |
279 | #endif | 279 | #endif |
280 | 280 | ||
281 | #ifdef CONFIG_SCHEDSTATS | 281 | #ifdef CONFIG_SCHEDSTATS |
282 | /* latency stats */ | 282 | /* latency stats */ |
283 | struct sched_info rq_sched_info; | 283 | struct sched_info rq_sched_info; |
284 | 284 | ||
285 | /* sys_sched_yield() stats */ | 285 | /* sys_sched_yield() stats */ |
286 | unsigned long yld_exp_empty; | 286 | unsigned long yld_exp_empty; |
287 | unsigned long yld_act_empty; | 287 | unsigned long yld_act_empty; |
288 | unsigned long yld_both_empty; | 288 | unsigned long yld_both_empty; |
289 | unsigned long yld_cnt; | 289 | unsigned long yld_cnt; |
290 | 290 | ||
291 | /* schedule() stats */ | 291 | /* schedule() stats */ |
292 | unsigned long sched_switch; | 292 | unsigned long sched_switch; |
293 | unsigned long sched_cnt; | 293 | unsigned long sched_cnt; |
294 | unsigned long sched_goidle; | 294 | unsigned long sched_goidle; |
295 | 295 | ||
296 | /* try_to_wake_up() stats */ | 296 | /* try_to_wake_up() stats */ |
297 | unsigned long ttwu_cnt; | 297 | unsigned long ttwu_cnt; |
298 | unsigned long ttwu_local; | 298 | unsigned long ttwu_local; |
299 | #endif | 299 | #endif |
300 | struct lock_class_key rq_lock_key; | 300 | struct lock_class_key rq_lock_key; |
301 | }; | 301 | }; |
302 | 302 | ||
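The locking rule documented above struct rq (multiple runqueue locks must be acquired in ascending &runqueue order) is what helpers such as double_rq_lock() implement. A hedged sketch of the ordering idea only, ignoring the lockdep nesting annotations the real code needs:

static void lock_two_runqueues(struct rq *rq1, struct rq *rq2)
{
	if (rq1 == rq2) {
		spin_lock(&rq1->lock);
	} else if (rq1 < rq2) {		/* always take the lower address first */
		spin_lock(&rq1->lock);
		spin_lock(&rq2->lock);
	} else {
		spin_lock(&rq2->lock);
		spin_lock(&rq1->lock);
	}
}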
303 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 303 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
304 | static DEFINE_MUTEX(sched_hotcpu_mutex); | 304 | static DEFINE_MUTEX(sched_hotcpu_mutex); |
305 | 305 | ||
306 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) | 306 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) |
307 | { | 307 | { |
308 | rq->curr->sched_class->check_preempt_curr(rq, p); | 308 | rq->curr->sched_class->check_preempt_curr(rq, p); |
309 | } | 309 | } |
310 | 310 | ||
311 | static inline int cpu_of(struct rq *rq) | 311 | static inline int cpu_of(struct rq *rq) |
312 | { | 312 | { |
313 | #ifdef CONFIG_SMP | 313 | #ifdef CONFIG_SMP |
314 | return rq->cpu; | 314 | return rq->cpu; |
315 | #else | 315 | #else |
316 | return 0; | 316 | return 0; |
317 | #endif | 317 | #endif |
318 | } | 318 | } |
319 | 319 | ||
320 | /* | 320 | /* |
321 | * Per-runqueue clock, as finegrained as the platform can give us: | 321 | * Per-runqueue clock, as finegrained as the platform can give us: |
322 | */ | 322 | */ |
323 | static unsigned long long __rq_clock(struct rq *rq) | 323 | static unsigned long long __rq_clock(struct rq *rq) |
324 | { | 324 | { |
325 | u64 prev_raw = rq->prev_clock_raw; | 325 | u64 prev_raw = rq->prev_clock_raw; |
326 | u64 now = sched_clock(); | 326 | u64 now = sched_clock(); |
327 | s64 delta = now - prev_raw; | 327 | s64 delta = now - prev_raw; |
328 | u64 clock = rq->clock; | 328 | u64 clock = rq->clock; |
329 | 329 | ||
330 | /* | 330 | /* |
331 | * Protect against sched_clock() occasionally going backwards: | 331 | * Protect against sched_clock() occasionally going backwards: |
332 | */ | 332 | */ |
333 | if (unlikely(delta < 0)) { | 333 | if (unlikely(delta < 0)) { |
334 | clock++; | 334 | clock++; |
335 | rq->clock_warps++; | 335 | rq->clock_warps++; |
336 | } else { | 336 | } else { |
337 | /* | 337 | /* |
338 | * Catch too large forward jumps too: | 338 | * Catch too large forward jumps too: |
339 | */ | 339 | */ |
340 | if (unlikely(delta > 2*TICK_NSEC)) { | 340 | if (unlikely(delta > 2*TICK_NSEC)) { |
341 | clock++; | 341 | clock++; |
342 | rq->clock_overflows++; | 342 | rq->clock_overflows++; |
343 | } else { | 343 | } else { |
344 | if (unlikely(delta > rq->clock_max_delta)) | 344 | if (unlikely(delta > rq->clock_max_delta)) |
345 | rq->clock_max_delta = delta; | 345 | rq->clock_max_delta = delta; |
346 | clock += delta; | 346 | clock += delta; |
347 | } | 347 | } |
348 | } | 348 | } |
349 | 349 | ||
350 | rq->prev_clock_raw = now; | 350 | rq->prev_clock_raw = now; |
351 | rq->clock = clock; | 351 | rq->clock = clock; |
352 | 352 | ||
353 | return clock; | 353 | return clock; |
354 | } | 354 | } |
355 | 355 | ||
356 | static inline unsigned long long rq_clock(struct rq *rq) | 356 | static inline unsigned long long rq_clock(struct rq *rq) |
357 | { | 357 | { |
358 | int this_cpu = smp_processor_id(); | 358 | int this_cpu = smp_processor_id(); |
359 | 359 | ||
360 | if (this_cpu == cpu_of(rq)) | 360 | if (this_cpu == cpu_of(rq)) |
361 | return __rq_clock(rq); | 361 | return __rq_clock(rq); |
362 | 362 | ||
363 | return rq->clock; | 363 | return rq->clock; |
364 | } | 364 | } |
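
The clamp logic in __rq_clock() above is easier to see in isolation. The following is a minimal user-space sketch of the same idea, not kernel code: fake_rq, the mock sample values and the 1ms TICK_NSEC are all invented for the illustration.

/* Standalone illustration of the warp/overflow clamping in __rq_clock(). */
#include <stdio.h>
#include <stdint.h>

#define TICK_NSEC 1000000ULL		/* assumed 1ms tick, for the example only */

struct fake_rq {
	uint64_t prev_clock_raw;
	uint64_t clock;
	unsigned int clock_warps, clock_overflows;
};

static uint64_t fake_rq_clock(struct fake_rq *rq, uint64_t now)
{
	int64_t delta = (int64_t)now - (int64_t)rq->prev_clock_raw;

	if (delta < 0) {
		rq->clock++;			/* backwards jump: advance minimally */
		rq->clock_warps++;
	} else if (delta > 2 * TICK_NSEC) {
		rq->clock++;			/* huge forward jump: advance minimally */
		rq->clock_overflows++;
	} else {
		rq->clock += delta;		/* normal case: follow the raw clock */
	}
	rq->prev_clock_raw = now;
	return rq->clock;
}

int main(void)
{
	struct fake_rq rq = { 0, 0, 0, 0 };
	uint64_t samples[] = { 100, 50, 200, 5000000, 5000100 };

	for (int i = 0; i < 5; i++)
		printf("raw=%7llu clock=%llu\n", (unsigned long long)samples[i],
		       (unsigned long long)fake_rq_clock(&rq, samples[i]));
	return 0;
}

The second sample goes backwards and the fourth jumps far ahead; in both cases the resulting clock only advances by one, which is exactly the behaviour the comments above describe.
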
365 | 365 | ||
366 | /* | 366 | /* |
367 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | 367 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. |
368 | * See detach_destroy_domains: synchronize_sched for details. | 368 | * See detach_destroy_domains: synchronize_sched for details. |
369 | * | 369 | * |
370 | * The domain tree of any CPU may only be accessed from within | 370 | * The domain tree of any CPU may only be accessed from within |
371 | * preempt-disabled sections. | 371 | * preempt-disabled sections. |
372 | */ | 372 | */ |
373 | #define for_each_domain(cpu, __sd) \ | 373 | #define for_each_domain(cpu, __sd) \ |
374 | for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) | 374 | for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) |
375 | 375 | ||
376 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | 376 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) |
377 | #define this_rq() (&__get_cpu_var(runqueues)) | 377 | #define this_rq() (&__get_cpu_var(runqueues)) |
378 | #define task_rq(p) cpu_rq(task_cpu(p)) | 378 | #define task_rq(p) cpu_rq(task_cpu(p)) |
379 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 379 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
380 | 380 | ||
381 | /* | 381 | /* |
382 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | 382 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu |
383 | * clock constructed from sched_clock(): | 383 | * clock constructed from sched_clock(): |
384 | */ | 384 | */ |
385 | unsigned long long cpu_clock(int cpu) | 385 | unsigned long long cpu_clock(int cpu) |
386 | { | 386 | { |
387 | unsigned long long now; | 387 | unsigned long long now; |
388 | unsigned long flags; | 388 | unsigned long flags; |
389 | 389 | ||
390 | local_irq_save(flags); | 390 | local_irq_save(flags); |
391 | now = rq_clock(cpu_rq(cpu)); | 391 | now = rq_clock(cpu_rq(cpu)); |
392 | local_irq_restore(flags); | 392 | local_irq_restore(flags); |
393 | 393 | ||
394 | return now; | 394 | return now; |
395 | } | 395 | } |
396 | 396 | ||
397 | #ifdef CONFIG_FAIR_GROUP_SCHED | 397 | #ifdef CONFIG_FAIR_GROUP_SCHED |
398 | /* Change a task's ->cfs_rq if it moves across CPUs */ | 398 | /* Change a task's ->cfs_rq if it moves across CPUs */ |
399 | static inline void set_task_cfs_rq(struct task_struct *p) | 399 | static inline void set_task_cfs_rq(struct task_struct *p) |
400 | { | 400 | { |
401 | p->se.cfs_rq = &task_rq(p)->cfs; | 401 | p->se.cfs_rq = &task_rq(p)->cfs; |
402 | } | 402 | } |
403 | #else | 403 | #else |
404 | static inline void set_task_cfs_rq(struct task_struct *p) | 404 | static inline void set_task_cfs_rq(struct task_struct *p) |
405 | { | 405 | { |
406 | } | 406 | } |
407 | #endif | 407 | #endif |
408 | 408 | ||
409 | #ifndef prepare_arch_switch | 409 | #ifndef prepare_arch_switch |
410 | # define prepare_arch_switch(next) do { } while (0) | 410 | # define prepare_arch_switch(next) do { } while (0) |
411 | #endif | 411 | #endif |
412 | #ifndef finish_arch_switch | 412 | #ifndef finish_arch_switch |
413 | # define finish_arch_switch(prev) do { } while (0) | 413 | # define finish_arch_switch(prev) do { } while (0) |
414 | #endif | 414 | #endif |
415 | 415 | ||
416 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | 416 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW |
417 | static inline int task_running(struct rq *rq, struct task_struct *p) | 417 | static inline int task_running(struct rq *rq, struct task_struct *p) |
418 | { | 418 | { |
419 | return rq->curr == p; | 419 | return rq->curr == p; |
420 | } | 420 | } |
421 | 421 | ||
422 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 422 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
423 | { | 423 | { |
424 | } | 424 | } |
425 | 425 | ||
426 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | 426 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
427 | { | 427 | { |
428 | #ifdef CONFIG_DEBUG_SPINLOCK | 428 | #ifdef CONFIG_DEBUG_SPINLOCK |
429 | /* this is a valid case when another task releases the spinlock */ | 429 | /* this is a valid case when another task releases the spinlock */ |
430 | rq->lock.owner = current; | 430 | rq->lock.owner = current; |
431 | #endif | 431 | #endif |
432 | /* | 432 | /* |
433 | * If we are tracking spinlock dependencies then we have to | 433 | * If we are tracking spinlock dependencies then we have to |
434 | * fix up the runqueue lock - which gets 'carried over' from | 434 | * fix up the runqueue lock - which gets 'carried over' from |
435 | * prev into current: | 435 | * prev into current: |
436 | */ | 436 | */ |
437 | spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); | 437 | spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); |
438 | 438 | ||
439 | spin_unlock_irq(&rq->lock); | 439 | spin_unlock_irq(&rq->lock); |
440 | } | 440 | } |
441 | 441 | ||
442 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | 442 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ |
443 | static inline int task_running(struct rq *rq, struct task_struct *p) | 443 | static inline int task_running(struct rq *rq, struct task_struct *p) |
444 | { | 444 | { |
445 | #ifdef CONFIG_SMP | 445 | #ifdef CONFIG_SMP |
446 | return p->oncpu; | 446 | return p->oncpu; |
447 | #else | 447 | #else |
448 | return rq->curr == p; | 448 | return rq->curr == p; |
449 | #endif | 449 | #endif |
450 | } | 450 | } |
451 | 451 | ||
452 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 452 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
453 | { | 453 | { |
454 | #ifdef CONFIG_SMP | 454 | #ifdef CONFIG_SMP |
455 | /* | 455 | /* |
456 | * We can optimise this out completely for !SMP, because the | 456 | * We can optimise this out completely for !SMP, because the |
457 | * SMP rebalancing from interrupt is the only thing that cares | 457 | * SMP rebalancing from interrupt is the only thing that cares |
458 | * here. | 458 | * here. |
459 | */ | 459 | */ |
460 | next->oncpu = 1; | 460 | next->oncpu = 1; |
461 | #endif | 461 | #endif |
462 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 462 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
463 | spin_unlock_irq(&rq->lock); | 463 | spin_unlock_irq(&rq->lock); |
464 | #else | 464 | #else |
465 | spin_unlock(&rq->lock); | 465 | spin_unlock(&rq->lock); |
466 | #endif | 466 | #endif |
467 | } | 467 | } |
468 | 468 | ||
469 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | 469 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
470 | { | 470 | { |
471 | #ifdef CONFIG_SMP | 471 | #ifdef CONFIG_SMP |
472 | /* | 472 | /* |
473 | * After ->oncpu is cleared, the task can be moved to a different CPU. | 473 | * After ->oncpu is cleared, the task can be moved to a different CPU. |
474 | * We must ensure this doesn't happen until the switch is completely | 474 | * We must ensure this doesn't happen until the switch is completely |
475 | * finished. | 475 | * finished. |
476 | */ | 476 | */ |
477 | smp_wmb(); | 477 | smp_wmb(); |
478 | prev->oncpu = 0; | 478 | prev->oncpu = 0; |
479 | #endif | 479 | #endif |
480 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 480 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
481 | local_irq_enable(); | 481 | local_irq_enable(); |
482 | #endif | 482 | #endif |
483 | } | 483 | } |
484 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 484 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
485 | 485 | ||
486 | /* | 486 | /* |
487 | * __task_rq_lock - lock the runqueue a given task resides on. | 487 | * __task_rq_lock - lock the runqueue a given task resides on. |
488 | * Must be called with interrupts disabled. | 488 | * Must be called with interrupts disabled. |
489 | */ | 489 | */ |
490 | static inline struct rq *__task_rq_lock(struct task_struct *p) | 490 | static inline struct rq *__task_rq_lock(struct task_struct *p) |
491 | __acquires(rq->lock) | 491 | __acquires(rq->lock) |
492 | { | 492 | { |
493 | struct rq *rq; | 493 | struct rq *rq; |
494 | 494 | ||
495 | repeat_lock_task: | 495 | repeat_lock_task: |
496 | rq = task_rq(p); | 496 | rq = task_rq(p); |
497 | spin_lock(&rq->lock); | 497 | spin_lock(&rq->lock); |
498 | if (unlikely(rq != task_rq(p))) { | 498 | if (unlikely(rq != task_rq(p))) { |
499 | spin_unlock(&rq->lock); | 499 | spin_unlock(&rq->lock); |
500 | goto repeat_lock_task; | 500 | goto repeat_lock_task; |
501 | } | 501 | } |
502 | return rq; | 502 | return rq; |
503 | } | 503 | } |
504 | 504 | ||
505 | /* | 505 | /* |
506 | * task_rq_lock - lock the runqueue a given task resides on and disable | 506 | * task_rq_lock - lock the runqueue a given task resides on and disable |
507 | * interrupts. Note the ordering: we can safely lookup the task_rq without | 507 | * interrupts. Note the ordering: we can safely lookup the task_rq without |
508 | * explicitly disabling preemption. | 508 | * explicitly disabling preemption. |
509 | */ | 509 | */ |
510 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | 510 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) |
511 | __acquires(rq->lock) | 511 | __acquires(rq->lock) |
512 | { | 512 | { |
513 | struct rq *rq; | 513 | struct rq *rq; |
514 | 514 | ||
515 | repeat_lock_task: | 515 | repeat_lock_task: |
516 | local_irq_save(*flags); | 516 | local_irq_save(*flags); |
517 | rq = task_rq(p); | 517 | rq = task_rq(p); |
518 | spin_lock(&rq->lock); | 518 | spin_lock(&rq->lock); |
519 | if (unlikely(rq != task_rq(p))) { | 519 | if (unlikely(rq != task_rq(p))) { |
520 | spin_unlock_irqrestore(&rq->lock, *flags); | 520 | spin_unlock_irqrestore(&rq->lock, *flags); |
521 | goto repeat_lock_task; | 521 | goto repeat_lock_task; |
522 | } | 522 | } |
523 | return rq; | 523 | return rq; |
524 | } | 524 | } |
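
The retry in __task_rq_lock()/task_rq_lock() exists because the task can migrate between reading task_rq(p) and acquiring that runqueue's lock. Below is a small user-space sketch of the same lock-then-recheck idiom; struct queue, struct item and item_lock() are invented stand-ins, not kernel interfaces, and a truly concurrent version would also need the unlocked reads of it->home to be properly ordered.

/* Lock-then-recheck idiom, user-space version (single-threaded demo). */
#include <pthread.h>
#include <stdio.h>

struct queue {
	pthread_mutex_t lock;
};

struct item {
	struct queue *home;	/* by convention, only changed under home->lock */
};

static struct queue *item_lock(struct item *it)
{
	struct queue *q;

	for (;;) {
		q = it->home;				/* unlocked peek, may be stale */
		pthread_mutex_lock(&q->lock);
		if (q == it->home)
			return q;			/* still its queue: we hold the right lock */
		pthread_mutex_unlock(&q->lock);		/* item moved meanwhile: retry */
	}
}

int main(void)
{
	static struct queue a = { PTHREAD_MUTEX_INITIALIZER };
	struct item it = { &a };
	struct queue *q = item_lock(&it);

	printf("locked %p for item %p\n", (void *)q, (void *)&it);
	pthread_mutex_unlock(&q->lock);
	return 0;
}
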
525 | 525 | ||
526 | static inline void __task_rq_unlock(struct rq *rq) | 526 | static inline void __task_rq_unlock(struct rq *rq) |
527 | __releases(rq->lock) | 527 | __releases(rq->lock) |
528 | { | 528 | { |
529 | spin_unlock(&rq->lock); | 529 | spin_unlock(&rq->lock); |
530 | } | 530 | } |
531 | 531 | ||
532 | static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | 532 | static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) |
533 | __releases(rq->lock) | 533 | __releases(rq->lock) |
534 | { | 534 | { |
535 | spin_unlock_irqrestore(&rq->lock, *flags); | 535 | spin_unlock_irqrestore(&rq->lock, *flags); |
536 | } | 536 | } |
537 | 537 | ||
538 | /* | 538 | /* |
539 | * this_rq_lock - lock this runqueue and disable interrupts. | 539 | * this_rq_lock - lock this runqueue and disable interrupts. |
540 | */ | 540 | */ |
541 | static inline struct rq *this_rq_lock(void) | 541 | static inline struct rq *this_rq_lock(void) |
542 | __acquires(rq->lock) | 542 | __acquires(rq->lock) |
543 | { | 543 | { |
544 | struct rq *rq; | 544 | struct rq *rq; |
545 | 545 | ||
546 | local_irq_disable(); | 546 | local_irq_disable(); |
547 | rq = this_rq(); | 547 | rq = this_rq(); |
548 | spin_lock(&rq->lock); | 548 | spin_lock(&rq->lock); |
549 | 549 | ||
550 | return rq; | 550 | return rq; |
551 | } | 551 | } |
552 | 552 | ||
553 | /* | 553 | /* |
554 | * CPU frequency is/was unstable - start anew by setting prev_clock_raw: | 554 | * CPU frequency is/was unstable - start anew by setting prev_clock_raw: |
555 | */ | 555 | */ |
556 | void sched_clock_unstable_event(void) | 556 | void sched_clock_unstable_event(void) |
557 | { | 557 | { |
558 | unsigned long flags; | 558 | unsigned long flags; |
559 | struct rq *rq; | 559 | struct rq *rq; |
560 | 560 | ||
561 | rq = task_rq_lock(current, &flags); | 561 | rq = task_rq_lock(current, &flags); |
562 | rq->prev_clock_raw = sched_clock(); | 562 | rq->prev_clock_raw = sched_clock(); |
563 | rq->clock_unstable_events++; | 563 | rq->clock_unstable_events++; |
564 | task_rq_unlock(rq, &flags); | 564 | task_rq_unlock(rq, &flags); |
565 | } | 565 | } |
566 | 566 | ||
567 | /* | 567 | /* |
568 | * resched_task - mark a task 'to be rescheduled now'. | 568 | * resched_task - mark a task 'to be rescheduled now'. |
569 | * | 569 | * |
570 | * On UP this means the setting of the need_resched flag, on SMP it | 570 | * On UP this means the setting of the need_resched flag, on SMP it |
571 | * might also involve a cross-CPU call to trigger the scheduler on | 571 | * might also involve a cross-CPU call to trigger the scheduler on |
572 | * the target CPU. | 572 | * the target CPU. |
573 | */ | 573 | */ |
574 | #ifdef CONFIG_SMP | 574 | #ifdef CONFIG_SMP |
575 | 575 | ||
576 | #ifndef tsk_is_polling | 576 | #ifndef tsk_is_polling |
577 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) | 577 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) |
578 | #endif | 578 | #endif |
579 | 579 | ||
580 | static void resched_task(struct task_struct *p) | 580 | static void resched_task(struct task_struct *p) |
581 | { | 581 | { |
582 | int cpu; | 582 | int cpu; |
583 | 583 | ||
584 | assert_spin_locked(&task_rq(p)->lock); | 584 | assert_spin_locked(&task_rq(p)->lock); |
585 | 585 | ||
586 | if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) | 586 | if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) |
587 | return; | 587 | return; |
588 | 588 | ||
589 | set_tsk_thread_flag(p, TIF_NEED_RESCHED); | 589 | set_tsk_thread_flag(p, TIF_NEED_RESCHED); |
590 | 590 | ||
591 | cpu = task_cpu(p); | 591 | cpu = task_cpu(p); |
592 | if (cpu == smp_processor_id()) | 592 | if (cpu == smp_processor_id()) |
593 | return; | 593 | return; |
594 | 594 | ||
595 | /* NEED_RESCHED must be visible before we test polling */ | 595 | /* NEED_RESCHED must be visible before we test polling */ |
596 | smp_mb(); | 596 | smp_mb(); |
597 | if (!tsk_is_polling(p)) | 597 | if (!tsk_is_polling(p)) |
598 | smp_send_reschedule(cpu); | 598 | smp_send_reschedule(cpu); |
599 | } | 599 | } |
600 | 600 | ||
601 | static void resched_cpu(int cpu) | 601 | static void resched_cpu(int cpu) |
602 | { | 602 | { |
603 | struct rq *rq = cpu_rq(cpu); | 603 | struct rq *rq = cpu_rq(cpu); |
604 | unsigned long flags; | 604 | unsigned long flags; |
605 | 605 | ||
606 | if (!spin_trylock_irqsave(&rq->lock, flags)) | 606 | if (!spin_trylock_irqsave(&rq->lock, flags)) |
607 | return; | 607 | return; |
608 | resched_task(cpu_curr(cpu)); | 608 | resched_task(cpu_curr(cpu)); |
609 | spin_unlock_irqrestore(&rq->lock, flags); | 609 | spin_unlock_irqrestore(&rq->lock, flags); |
610 | } | 610 | } |
611 | #else | 611 | #else |
612 | static inline void resched_task(struct task_struct *p) | 612 | static inline void resched_task(struct task_struct *p) |
613 | { | 613 | { |
614 | assert_spin_locked(&task_rq(p)->lock); | 614 | assert_spin_locked(&task_rq(p)->lock); |
615 | set_tsk_need_resched(p); | 615 | set_tsk_need_resched(p); |
616 | } | 616 | } |
617 | #endif | 617 | #endif |
618 | 618 | ||
619 | static u64 div64_likely32(u64 divident, unsigned long divisor) | 619 | static u64 div64_likely32(u64 divident, unsigned long divisor) |
620 | { | 620 | { |
621 | #if BITS_PER_LONG == 32 | 621 | #if BITS_PER_LONG == 32 |
622 | if (likely(divident <= 0xffffffffULL)) | 622 | if (likely(divident <= 0xffffffffULL)) |
623 | return (u32)divident / divisor; | 623 | return (u32)divident / divisor; |
624 | do_div(divident, divisor); | 624 | do_div(divident, divisor); |
625 | 625 | ||
626 | return divident; | 626 | return divident; |
627 | #else | 627 | #else |
628 | return divident / divisor; | 628 | return divident / divisor; |
629 | #endif | 629 | #endif |
630 | } | 630 | } |
631 | 631 | ||
632 | #if BITS_PER_LONG == 32 | 632 | #if BITS_PER_LONG == 32 |
633 | # define WMULT_CONST (~0UL) | 633 | # define WMULT_CONST (~0UL) |
634 | #else | 634 | #else |
635 | # define WMULT_CONST (1UL << 32) | 635 | # define WMULT_CONST (1UL << 32) |
636 | #endif | 636 | #endif |
637 | 637 | ||
638 | #define WMULT_SHIFT 32 | 638 | #define WMULT_SHIFT 32 |
639 | 639 | ||
640 | static unsigned long | 640 | static unsigned long |
641 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, | 641 | calc_delta_mine(unsigned long delta_exec, unsigned long weight, |
642 | struct load_weight *lw) | 642 | struct load_weight *lw) |
643 | { | 643 | { |
644 | u64 tmp; | 644 | u64 tmp; |
645 | 645 | ||
646 | if (unlikely(!lw->inv_weight)) | 646 | if (unlikely(!lw->inv_weight)) |
647 | lw->inv_weight = WMULT_CONST / lw->weight; | 647 | lw->inv_weight = WMULT_CONST / lw->weight; |
648 | 648 | ||
649 | tmp = (u64)delta_exec * weight; | 649 | tmp = (u64)delta_exec * weight; |
650 | /* | 650 | /* |
651 | * Check whether we'd overflow the 64-bit multiplication: | 651 | * Check whether we'd overflow the 64-bit multiplication: |
652 | */ | 652 | */ |
653 | if (unlikely(tmp > WMULT_CONST)) { | 653 | if (unlikely(tmp > WMULT_CONST)) { |
654 | tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight) | 654 | tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight) |
655 | >> (WMULT_SHIFT/2); | 655 | >> (WMULT_SHIFT/2); |
656 | } else { | 656 | } else { |
657 | tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT; | 657 | tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT; |
658 | } | 658 | } |
659 | 659 | ||
660 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); | 660 | return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); |
661 | } | 661 | } |
662 | 662 | ||
663 | static inline unsigned long | 663 | static inline unsigned long |
664 | calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) | 664 | calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) |
665 | { | 665 | { |
666 | return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); | 666 | return calc_delta_mine(delta_exec, NICE_0_LOAD, lw); |
667 | } | 667 | } |
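
calc_delta_mine() is effectively computing delta_exec * weight / lw->weight, but replaces the division with a multiply by the cached 2^32/weight inverse and a 32-bit shift. The self-contained check below spot-checks that approximation; the 6ms delta and the queue weight of 3072 (three nice-0 tasks) are just example inputs, and the overflow splitting done above for very large intermediate products is omitted here.

/* Multiply-by-inverse approximation used by calc_delta_mine(), spot-checked. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t delta_exec = 6000000;			/* 6 ms in ns */
	uint64_t weight = 1024;				/* NICE_0_LOAD */
	uint64_t lw_weight = 3072;			/* e.g. three nice-0 tasks queued */
	uint64_t inv_weight = (1ULL << 32) / lw_weight;	/* what lw->inv_weight caches */

	uint64_t approx = (delta_exec * weight * inv_weight) >> 32;
	uint64_t exact  = delta_exec * weight / lw_weight;

	printf("approx=%llu exact=%llu\n",
	       (unsigned long long)approx, (unsigned long long)exact);
	return 0;
}

With these inputs the two results differ by a single nanosecond, which is why trading the division for a multiply and shift is worthwhile on this hot path.
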
668 | 668 | ||
669 | static void update_load_add(struct load_weight *lw, unsigned long inc) | 669 | static void update_load_add(struct load_weight *lw, unsigned long inc) |
670 | { | 670 | { |
671 | lw->weight += inc; | 671 | lw->weight += inc; |
672 | lw->inv_weight = 0; | 672 | lw->inv_weight = 0; |
673 | } | 673 | } |
674 | 674 | ||
675 | static void update_load_sub(struct load_weight *lw, unsigned long dec) | 675 | static void update_load_sub(struct load_weight *lw, unsigned long dec) |
676 | { | 676 | { |
677 | lw->weight -= dec; | 677 | lw->weight -= dec; |
678 | lw->inv_weight = 0; | 678 | lw->inv_weight = 0; |
679 | } | 679 | } |
680 | 680 | ||
681 | /* | 681 | /* |
682 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | 682 | * To aid in avoiding the subversion of "niceness" due to uneven distribution |
683 | * of tasks with abnormal "nice" values across CPUs, the contribution that | 683 | * of tasks with abnormal "nice" values across CPUs, the contribution that |
684 | * each task makes to its run queue's load is weighted according to its | 684 | * each task makes to its run queue's load is weighted according to its |
685 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a | 685 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a |
686 | * scaled version of the new time slice allocation that they receive on time | 686 | * scaled version of the new time slice allocation that they receive on time |
687 | * slice expiry etc. | 687 | * slice expiry etc. |
688 | */ | 688 | */ |
689 | 689 | ||
690 | #define WEIGHT_IDLEPRIO 2 | 690 | #define WEIGHT_IDLEPRIO 2 |
691 | #define WMULT_IDLEPRIO (1 << 31) | 691 | #define WMULT_IDLEPRIO (1 << 31) |
692 | 692 | ||
693 | /* | 693 | /* |
694 | * Nice levels are multiplicative, with a gentle 10% change for every | 694 | * Nice levels are multiplicative, with a gentle 10% change for every |
695 | * nice level changed. I.e. when a CPU-bound task goes from nice 0 to | 695 | * nice level changed. I.e. when a CPU-bound task goes from nice 0 to |
696 | * nice 1, it will get ~10% less CPU time than another CPU-bound task | 696 | * nice 1, it will get ~10% less CPU time than another CPU-bound task |
697 | * that remained on nice 0. | 697 | * that remained on nice 0. |
698 | * | 698 | * |
699 | * The "10% effect" is relative and cumulative: from _any_ nice level, | 699 | * The "10% effect" is relative and cumulative: from _any_ nice level, |
700 | * if you go up 1 level, it's -10% CPU usage, if you go down 1 level | 700 | * if you go up 1 level, it's -10% CPU usage, if you go down 1 level |
701 | * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. | 701 | * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. |
702 | * If a task goes up by ~10% and another task goes down by ~10% then | 702 | * If a task goes up by ~10% and another task goes down by ~10% then |
703 | * the relative distance between them is ~25%.) | 703 | * the relative distance between them is ~25%.) |
704 | */ | 704 | */ |
705 | static const int prio_to_weight[40] = { | 705 | static const int prio_to_weight[40] = { |
706 | /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921, | 706 | /* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921, |
707 | /* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280, | 707 | /* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280, |
708 | /* 0 */ NICE_0_LOAD /* 1024 */, | 708 | /* 0 */ NICE_0_LOAD /* 1024 */, |
709 | /* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137, | 709 | /* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137, |
710 | /* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15, | 710 | /* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15, |
711 | }; | 711 | }; |
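
A quick way to see the "10% effect" described above is to compute CPU shares for two CPU-bound tasks directly from the table; the weights below are copied from prio_to_weight[], and the share formula (weight over sum of weights) is the usual proportional-share reading of these values, used here only as an illustration.

/* CPU shares implied by prio_to_weight[] for two always-runnable tasks. */
#include <stdio.h>

int main(void)
{
	int w0 = 1024;	/* nice  0 (NICE_0_LOAD) */
	int w1 = 819;	/* nice +1 */
	int w5 = 336;	/* nice +5 */

	printf("nice 0 vs +1: %.1f%% vs %.1f%%  (weight ratio %.2f, ~1.25)\n",
	       100.0 * w0 / (w0 + w1), 100.0 * w1 / (w0 + w1), (double)w0 / w1);
	printf("nice 0 vs +5: %.1f%% vs %.1f%%  (weight ratio %.2f, ~1.25^5)\n",
	       100.0 * w0 / (w0 + w5), 100.0 * w5 / (w0 + w5), (double)w0 / w5);
	return 0;
}

One nice level apart gives roughly a 55.6% / 44.4% split (a 1.25 ratio), and five levels apart compounds to about 1.25^5.
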
712 | 712 | ||
713 | /* | 713 | /* |
714 | * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. | 714 | * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. |
715 | * | 715 | * |
716 | * In cases where the weight does not change often, we can use the | 716 | * In cases where the weight does not change often, we can use the |
717 | * precalculated inverse to speed up arithmetics by turning divisions | 717 | * precalculated inverse to speed up arithmetics by turning divisions |
718 | * into multiplications: | 718 | * into multiplications: |
719 | */ | 719 | */ |
720 | static const u32 prio_to_wmult[40] = { | 720 | static const u32 prio_to_wmult[40] = { |
721 | /* -20 */ 48356, 60446, 75558, 94446, 118058, | 721 | /* -20 */ 48356, 60446, 75558, 94446, 118058, |
722 | /* -15 */ 147573, 184467, 230589, 288233, 360285, | 722 | /* -15 */ 147573, 184467, 230589, 288233, 360285, |
723 | /* -10 */ 450347, 562979, 703746, 879575, 1099582, | 723 | /* -10 */ 450347, 562979, 703746, 879575, 1099582, |
724 | /* -5 */ 1374389, 1717986, 2147483, 2684354, 3355443, | 724 | /* -5 */ 1374389, 1717986, 2147483, 2684354, 3355443, |
725 | /* 0 */ 4194304, 5244160, 6557201, 8196502, 10250518, | 725 | /* 0 */ 4194304, 5244160, 6557201, 8196502, 10250518, |
726 | /* 5 */ 12782640, 16025997, 19976592, 24970740, 31350126, | 726 | /* 5 */ 12782640, 16025997, 19976592, 24970740, 31350126, |
727 | /* 10 */ 39045157, 49367440, 61356675, 76695844, 95443717, | 727 | /* 10 */ 39045157, 49367440, 61356675, 76695844, 95443717, |
728 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | 728 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, |
729 | }; | 729 | }; |
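
The relationship stated in the comment can be spot-checked directly; the two rows below are the nice 0 and nice -20 entries of the tables above.

/* prio_to_wmult[i] is (2^32 / prio_to_weight[i]), precomputed. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	printf("2^32 / 1024  = %llu (table entry: 4194304)\n",
	       (unsigned long long)((1ULL << 32) / 1024));
	printf("2^32 / 88818 = %llu (table entry: 48356)\n",
	       (unsigned long long)((1ULL << 32) / 88818));
	return 0;
}
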
730 | 730 | ||
731 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); | 731 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); |
732 | 732 | ||
733 | /* | 733 | /* |
734 | * runqueue iterator, to support SMP load-balancing between different | 734 | * runqueue iterator, to support SMP load-balancing between different |
735 | * scheduling classes, without having to expose their internal data | 735 | * scheduling classes, without having to expose their internal data |
736 | * structures to the load-balancing proper: | 736 | * structures to the load-balancing proper: |
737 | */ | 737 | */ |
738 | struct rq_iterator { | 738 | struct rq_iterator { |
739 | void *arg; | 739 | void *arg; |
740 | struct task_struct *(*start)(void *); | 740 | struct task_struct *(*start)(void *); |
741 | struct task_struct *(*next)(void *); | 741 | struct task_struct *(*next)(void *); |
742 | }; | 742 | }; |
743 | 743 | ||
744 | static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | 744 | static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
745 | unsigned long max_nr_move, unsigned long max_load_move, | 745 | unsigned long max_nr_move, unsigned long max_load_move, |
746 | struct sched_domain *sd, enum cpu_idle_type idle, | 746 | struct sched_domain *sd, enum cpu_idle_type idle, |
747 | int *all_pinned, unsigned long *load_moved, | 747 | int *all_pinned, unsigned long *load_moved, |
748 | int this_best_prio, int best_prio, int best_prio_seen, | 748 | int this_best_prio, int best_prio, int best_prio_seen, |
749 | struct rq_iterator *iterator); | 749 | struct rq_iterator *iterator); |
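
The rq_iterator is a plain start()/next() callback pair, so balance_tasks() can walk a scheduling class's runnable tasks without knowing how they are stored. The sketch below is a minimal user-space consumer of such an interface, using an invented array-backed iterator; the real providers are the per-class files included just below, not this demo.

/* Minimal consumer of a start()/next() iterator pair, user-space demo. */
#include <stdio.h>
#include <stddef.h>

struct demo_task { int pid; };

struct rq_iterator_demo {
	void *arg;
	struct demo_task *(*start)(void *);
	struct demo_task *(*next)(void *);
};

struct array_it {
	struct demo_task *tasks;
	size_t nr, pos;
};

static struct demo_task *array_start(void *arg)
{
	struct array_it *it = arg;

	it->pos = 0;
	return it->nr ? &it->tasks[0] : NULL;
}

static struct demo_task *array_next(void *arg)
{
	struct array_it *it = arg;

	if (++it->pos >= it->nr)
		return NULL;
	return &it->tasks[it->pos];
}

int main(void)
{
	struct demo_task tasks[] = { { 1 }, { 2 }, { 3 } };
	struct array_it backing = { tasks, 3, 0 };
	struct rq_iterator_demo it = { &backing, array_start, array_next };
	struct demo_task *p;

	for (p = it.start(it.arg); p; p = it.next(it.arg))
		printf("visiting pid %d\n", p->pid);
	return 0;
}
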
750 | 750 | ||
751 | #include "sched_stats.h" | 751 | #include "sched_stats.h" |
752 | #include "sched_rt.c" | 752 | #include "sched_rt.c" |
753 | #include "sched_fair.c" | 753 | #include "sched_fair.c" |
754 | #include "sched_idletask.c" | 754 | #include "sched_idletask.c" |
755 | #ifdef CONFIG_SCHED_DEBUG | 755 | #ifdef CONFIG_SCHED_DEBUG |
756 | # include "sched_debug.c" | 756 | # include "sched_debug.c" |
757 | #endif | 757 | #endif |
758 | 758 | ||
759 | #define sched_class_highest (&rt_sched_class) | 759 | #define sched_class_highest (&rt_sched_class) |
760 | 760 | ||
761 | static void __update_curr_load(struct rq *rq, struct load_stat *ls) | 761 | static void __update_curr_load(struct rq *rq, struct load_stat *ls) |
762 | { | 762 | { |
763 | if (rq->curr != rq->idle && ls->load.weight) { | 763 | if (rq->curr != rq->idle && ls->load.weight) { |
764 | ls->delta_exec += ls->delta_stat; | 764 | ls->delta_exec += ls->delta_stat; |
765 | ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); | 765 | ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); |
766 | ls->delta_stat = 0; | 766 | ls->delta_stat = 0; |
767 | } | 767 | } |
768 | } | 768 | } |
769 | 769 | ||
770 | /* | 770 | /* |
771 | * Update delta_exec, delta_fair fields for rq. | 771 | * Update delta_exec, delta_fair fields for rq. |
772 | * | 772 | * |
773 | * delta_fair clock advances at a rate inversely proportional to | 773 | * delta_fair clock advances at a rate inversely proportional to |
774 | * total load (rq->ls.load.weight) on the runqueue, while | 774 | * total load (rq->ls.load.weight) on the runqueue, while |
775 | * delta_exec advances at the same rate as wall-clock (provided | 775 | * delta_exec advances at the same rate as wall-clock (provided |
776 | * cpu is not idle). | 776 | * cpu is not idle). |
777 | * | 777 | * |
778 | * delta_exec / delta_fair is a measure of the (smoothened) load on this | 778 | * delta_exec / delta_fair is a measure of the (smoothened) load on this |
779 | * runqueue over any given interval. This (smoothened) load is used | 779 | * runqueue over any given interval. This (smoothened) load is used |
780 | * during load balance. | 780 | * during load balance. |
781 | * | 781 | * |
782 | * This function is called /before/ updating rq->ls.load | 782 | * This function is called /before/ updating rq->ls.load |
783 | * and when switching tasks. | 783 | * and when switching tasks. |
784 | */ | 784 | */ |
785 | static void update_curr_load(struct rq *rq, u64 now) | 785 | static void update_curr_load(struct rq *rq, u64 now) |
786 | { | 786 | { |
787 | struct load_stat *ls = &rq->ls; | 787 | struct load_stat *ls = &rq->ls; |
788 | u64 start; | 788 | u64 start; |
789 | 789 | ||
790 | start = ls->load_update_start; | 790 | start = ls->load_update_start; |
791 | ls->load_update_start = now; | 791 | ls->load_update_start = now; |
792 | ls->delta_stat += now - start; | 792 | ls->delta_stat += now - start; |
793 | /* | 793 | /* |
794 | * Stagger updates to ls->delta_fair. Very frequent updates | 794 | * Stagger updates to ls->delta_fair. Very frequent updates |
795 | * can be expensive. | 795 | * can be expensive. |
796 | */ | 796 | */ |
797 | if (ls->delta_stat >= sysctl_sched_stat_granularity) | 797 | if (ls->delta_stat >= sysctl_sched_stat_granularity) |
798 | __update_curr_load(rq, ls); | 798 | __update_curr_load(rq, ls); |
799 | } | 799 | } |
800 | 800 | ||
801 | static inline void | 801 | static inline void |
802 | inc_load(struct rq *rq, const struct task_struct *p, u64 now) | 802 | inc_load(struct rq *rq, const struct task_struct *p, u64 now) |
803 | { | 803 | { |
804 | update_curr_load(rq, now); | 804 | update_curr_load(rq, now); |
805 | update_load_add(&rq->ls.load, p->se.load.weight); | 805 | update_load_add(&rq->ls.load, p->se.load.weight); |
806 | } | 806 | } |
807 | 807 | ||
808 | static inline void | 808 | static inline void |
809 | dec_load(struct rq *rq, const struct task_struct *p, u64 now) | 809 | dec_load(struct rq *rq, const struct task_struct *p, u64 now) |
810 | { | 810 | { |
811 | update_curr_load(rq, now); | 811 | update_curr_load(rq, now); |
812 | update_load_sub(&rq->ls.load, p->se.load.weight); | 812 | update_load_sub(&rq->ls.load, p->se.load.weight); |
813 | } | 813 | } |
814 | 814 | ||
815 | static void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now) | 815 | static void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now) |
816 | { | 816 | { |
817 | rq->nr_running++; | 817 | rq->nr_running++; |
818 | inc_load(rq, p, now); | 818 | inc_load(rq, p, now); |
819 | } | 819 | } |
820 | 820 | ||
821 | static void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now) | 821 | static void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now) |
822 | { | 822 | { |
823 | rq->nr_running--; | 823 | rq->nr_running--; |
824 | dec_load(rq, p, now); | 824 | dec_load(rq, p, now); |
825 | } | 825 | } |
826 | 826 | ||
827 | static void set_load_weight(struct task_struct *p) | 827 | static void set_load_weight(struct task_struct *p) |
828 | { | 828 | { |
829 | task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime; | 829 | task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime; |
830 | p->se.wait_runtime = 0; | 830 | p->se.wait_runtime = 0; |
831 | 831 | ||
832 | if (task_has_rt_policy(p)) { | 832 | if (task_has_rt_policy(p)) { |
833 | p->se.load.weight = prio_to_weight[0] * 2; | 833 | p->se.load.weight = prio_to_weight[0] * 2; |
834 | p->se.load.inv_weight = prio_to_wmult[0] >> 1; | 834 | p->se.load.inv_weight = prio_to_wmult[0] >> 1; |
835 | return; | 835 | return; |
836 | } | 836 | } |
837 | 837 | ||
838 | /* | 838 | /* |
839 | * SCHED_IDLE tasks get minimal weight: | 839 | * SCHED_IDLE tasks get minimal weight: |
840 | */ | 840 | */ |
841 | if (p->policy == SCHED_IDLE) { | 841 | if (p->policy == SCHED_IDLE) { |
842 | p->se.load.weight = WEIGHT_IDLEPRIO; | 842 | p->se.load.weight = WEIGHT_IDLEPRIO; |
843 | p->se.load.inv_weight = WMULT_IDLEPRIO; | 843 | p->se.load.inv_weight = WMULT_IDLEPRIO; |
844 | return; | 844 | return; |
845 | } | 845 | } |
846 | 846 | ||
847 | p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; | 847 | p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; |
848 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; | 848 | p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; |
849 | } | 849 | } |
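
For completeness, the index arithmetic in set_load_weight() maps a task's static priority into the 40-entry tables. The sketch below assumes the usual MAX_RT_PRIO of 100 and a nice-0 static priority of 120, both defined elsewhere in the kernel headers and taken here only as assumptions; it also shows the RT weight derived above from the nice -20 table row.

/* Index and RT-weight arithmetic from set_load_weight(), spot-checked. */
#include <stdio.h>

#define ASSUMED_MAX_RT_PRIO 100		/* assumption, not taken from this file */

int main(void)
{
	int nice0_static_prio = 120;	/* assumed default static priority */

	printf("prio_to_weight index for nice 0: %d (expects 1024 there)\n",
	       nice0_static_prio - ASSUMED_MAX_RT_PRIO);
	printf("index range: nice -20 -> %d, nice 19 -> %d\n",
	       100 - ASSUMED_MAX_RT_PRIO, 139 - ASSUMED_MAX_RT_PRIO);
	printf("RT weight = 2 * 88818 = %d, RT inv_weight = 48356 >> 1 = %d\n",
	       2 * 88818, 48356 >> 1);
	return 0;
}
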
850 | 850 | ||
851 | static void | 851 | static void |
852 | enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now) | 852 | enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now) |
853 | { | 853 | { |
854 | sched_info_queued(p); | 854 | sched_info_queued(p); |
855 | p->sched_class->enqueue_task(rq, p, wakeup, now); | 855 | p->sched_class->enqueue_task(rq, p, wakeup, now); |
856 | p->se.on_rq = 1; | 856 | p->se.on_rq = 1; |
857 | } | 857 | } |
858 | 858 | ||
859 | static void | 859 | static void |
860 | dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now) | 860 | dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now) |
861 | { | 861 | { |
862 | p->sched_class->dequeue_task(rq, p, sleep, now); | 862 | p->sched_class->dequeue_task(rq, p, sleep, now); |
863 | p->se.on_rq = 0; | 863 | p->se.on_rq = 0; |
864 | } | 864 | } |
865 | 865 | ||
866 | /* | 866 | /* |
867 | * __normal_prio - return the priority that is based on the static prio | 867 | * __normal_prio - return the priority that is based on the static prio |
868 | */ | 868 | */ |
869 | static inline int __normal_prio(struct task_struct *p) | 869 | static inline int __normal_prio(struct task_struct *p) |
870 | { | 870 | { |
871 | return p->static_prio; | 871 | return p->static_prio; |
872 | } | 872 | } |
873 | 873 | ||
874 | /* | 874 | /* |
875 | * Calculate the expected normal priority: i.e. priority | 875 | * Calculate the expected normal priority: i.e. priority |
876 | * without taking RT-inheritance into account. Might be | 876 | * without taking RT-inheritance into account. Might be |
877 | * boosted by interactivity modifiers. Changes upon fork, | 877 | * boosted by interactivity modifiers. Changes upon fork, |
878 | * setprio syscalls, and whenever the interactivity | 878 | * setprio syscalls, and whenever the interactivity |
879 | * estimator recalculates. | 879 | * estimator recalculates. |
880 | */ | 880 | */ |
881 | static inline int normal_prio(struct task_struct *p) | 881 | static inline int normal_prio(struct task_struct *p) |
882 | { | 882 | { |
883 | int prio; | 883 | int prio; |
884 | 884 | ||
885 | if (task_has_rt_policy(p)) | 885 | if (task_has_rt_policy(p)) |
886 | prio = MAX_RT_PRIO-1 - p->rt_priority; | 886 | prio = MAX_RT_PRIO-1 - p->rt_priority; |
887 | else | 887 | else |
888 | prio = __normal_prio(p); | 888 | prio = __normal_prio(p); |
889 | return prio; | 889 | return prio; |
890 | } | 890 | } |
891 | 891 | ||
892 | /* | 892 | /* |
893 | * Calculate the current priority, i.e. the priority | 893 | * Calculate the current priority, i.e. the priority |
894 | * taken into account by the scheduler. This value might | 894 | * taken into account by the scheduler. This value might |
895 | * be boosted by RT tasks, or might be boosted by | 895 | * be boosted by RT tasks, or might be boosted by |
896 | * interactivity modifiers. Will be RT if the task got | 896 | * interactivity modifiers. Will be RT if the task got |
897 | * RT-boosted. If not then it returns p->normal_prio. | 897 | * RT-boosted. If not then it returns p->normal_prio. |
898 | */ | 898 | */ |
899 | static int effective_prio(struct task_struct *p) | 899 | static int effective_prio(struct task_struct *p) |
900 | { | 900 | { |
901 | p->normal_prio = normal_prio(p); | 901 | p->normal_prio = normal_prio(p); |
902 | /* | 902 | /* |
903 | * If we are RT tasks or we were boosted to RT priority, | 903 | * If we are RT tasks or we were boosted to RT priority, |
904 | * keep the priority unchanged. Otherwise, update priority | 904 | * keep the priority unchanged. Otherwise, update priority |
905 | * to the normal priority: | 905 | * to the normal priority: |
906 | */ | 906 | */ |
907 | if (!rt_prio(p->prio)) | 907 | if (!rt_prio(p->prio)) |
908 | return p->normal_prio; | 908 | return p->normal_prio; |
909 | return p->prio; | 909 | return p->prio; |
910 | } | 910 | } |
911 | 911 | ||
912 | /* | 912 | /* |
913 | * activate_task - move a task to the runqueue. | 913 | * activate_task - move a task to the runqueue. |
914 | */ | 914 | */ |
915 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | 915 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) |
916 | { | 916 | { |
917 | u64 now = rq_clock(rq); | 917 | u64 now = rq_clock(rq); |
918 | 918 | ||
919 | if (p->state == TASK_UNINTERRUPTIBLE) | 919 | if (p->state == TASK_UNINTERRUPTIBLE) |
920 | rq->nr_uninterruptible--; | 920 | rq->nr_uninterruptible--; |
921 | 921 | ||
922 | enqueue_task(rq, p, wakeup, now); | 922 | enqueue_task(rq, p, wakeup, now); |
923 | inc_nr_running(p, rq, now); | 923 | inc_nr_running(p, rq, now); |
924 | } | 924 | } |
925 | 925 | ||
926 | /* | 926 | /* |
927 | * activate_idle_task - move idle task to the _front_ of runqueue. | 927 | * activate_idle_task - move idle task to the _front_ of runqueue. |
928 | */ | 928 | */ |
929 | static inline void activate_idle_task(struct task_struct *p, struct rq *rq) | 929 | static inline void activate_idle_task(struct task_struct *p, struct rq *rq) |
930 | { | 930 | { |
931 | u64 now = rq_clock(rq); | 931 | u64 now = rq_clock(rq); |
932 | 932 | ||
933 | if (p->state == TASK_UNINTERRUPTIBLE) | 933 | if (p->state == TASK_UNINTERRUPTIBLE) |
934 | rq->nr_uninterruptible--; | 934 | rq->nr_uninterruptible--; |
935 | 935 | ||
936 | enqueue_task(rq, p, 0, now); | 936 | enqueue_task(rq, p, 0, now); |
937 | inc_nr_running(p, rq, now); | 937 | inc_nr_running(p, rq, now); |
938 | } | 938 | } |
939 | 939 | ||
940 | /* | 940 | /* |
941 | * deactivate_task - remove a task from the runqueue. | 941 | * deactivate_task - remove a task from the runqueue. |
942 | */ | 942 | */ |
943 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | 943 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) |
944 | { | 944 | { |
945 | u64 now = rq_clock(rq); | 945 | u64 now = rq_clock(rq); |
946 | 946 | ||
947 | if (p->state == TASK_UNINTERRUPTIBLE) | 947 | if (p->state == TASK_UNINTERRUPTIBLE) |
948 | rq->nr_uninterruptible++; | 948 | rq->nr_uninterruptible++; |
949 | 949 | ||
950 | dequeue_task(rq, p, sleep, now); | 950 | dequeue_task(rq, p, sleep, now); |
951 | dec_nr_running(p, rq, now); | 951 | dec_nr_running(p, rq, now); |
952 | } | 952 | } |
953 | 953 | ||
954 | /** | 954 | /** |
955 | * task_curr - is this task currently executing on a CPU? | 955 | * task_curr - is this task currently executing on a CPU? |
956 | * @p: the task in question. | 956 | * @p: the task in question. |
957 | */ | 957 | */ |
958 | inline int task_curr(const struct task_struct *p) | 958 | inline int task_curr(const struct task_struct *p) |
959 | { | 959 | { |
960 | return cpu_curr(task_cpu(p)) == p; | 960 | return cpu_curr(task_cpu(p)) == p; |
961 | } | 961 | } |
962 | 962 | ||
963 | /* Used instead of source_load when we know the type == 0 */ | 963 | /* Used instead of source_load when we know the type == 0 */ |
964 | unsigned long weighted_cpuload(const int cpu) | 964 | unsigned long weighted_cpuload(const int cpu) |
965 | { | 965 | { |
966 | return cpu_rq(cpu)->ls.load.weight; | 966 | return cpu_rq(cpu)->ls.load.weight; |
967 | } | 967 | } |
968 | 968 | ||
969 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | 969 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) |
970 | { | 970 | { |
971 | #ifdef CONFIG_SMP | 971 | #ifdef CONFIG_SMP |
972 | task_thread_info(p)->cpu = cpu; | 972 | task_thread_info(p)->cpu = cpu; |
973 | set_task_cfs_rq(p); | 973 | set_task_cfs_rq(p); |
974 | #endif | 974 | #endif |
975 | } | 975 | } |
976 | 976 | ||
977 | #ifdef CONFIG_SMP | 977 | #ifdef CONFIG_SMP |
978 | 978 | ||
979 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | 979 | void set_task_cpu(struct task_struct *p, unsigned int new_cpu) |
980 | { | 980 | { |
981 | int old_cpu = task_cpu(p); | 981 | int old_cpu = task_cpu(p); |
982 | struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); | 982 | struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); |
983 | u64 clock_offset, fair_clock_offset; | 983 | u64 clock_offset, fair_clock_offset; |
984 | 984 | ||
985 | clock_offset = old_rq->clock - new_rq->clock; | 985 | clock_offset = old_rq->clock - new_rq->clock; |
986 | fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock; | 986 | fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock; |
987 | 987 | ||
988 | if (p->se.wait_start_fair) | 988 | if (p->se.wait_start_fair) |
989 | p->se.wait_start_fair -= fair_clock_offset; | 989 | p->se.wait_start_fair -= fair_clock_offset; |
990 | if (p->se.sleep_start_fair) | 990 | if (p->se.sleep_start_fair) |
991 | p->se.sleep_start_fair -= fair_clock_offset; | 991 | p->se.sleep_start_fair -= fair_clock_offset; |
992 | 992 | ||
993 | #ifdef CONFIG_SCHEDSTATS | 993 | #ifdef CONFIG_SCHEDSTATS |
994 | if (p->se.wait_start) | 994 | if (p->se.wait_start) |
995 | p->se.wait_start -= clock_offset; | 995 | p->se.wait_start -= clock_offset; |
996 | if (p->se.sleep_start) | 996 | if (p->se.sleep_start) |
997 | p->se.sleep_start -= clock_offset; | 997 | p->se.sleep_start -= clock_offset; |
998 | if (p->se.block_start) | 998 | if (p->se.block_start) |
999 | p->se.block_start -= clock_offset; | 999 | p->se.block_start -= clock_offset; |
1000 | #endif | 1000 | #endif |
1001 | 1001 | ||
1002 | __set_task_cpu(p, new_cpu); | 1002 | __set_task_cpu(p, new_cpu); |
1003 | } | 1003 | } |
1004 | 1004 | ||
1005 | struct migration_req { | 1005 | struct migration_req { |
1006 | struct list_head list; | 1006 | struct list_head list; |
1007 | 1007 | ||
1008 | struct task_struct *task; | 1008 | struct task_struct *task; |
1009 | int dest_cpu; | 1009 | int dest_cpu; |
1010 | 1010 | ||
1011 | struct completion done; | 1011 | struct completion done; |
1012 | }; | 1012 | }; |
1013 | 1013 | ||
1014 | /* | 1014 | /* |
1015 | * The task's runqueue lock must be held. | 1015 | * The task's runqueue lock must be held. |
1016 | * Returns true if you have to wait for migration thread. | 1016 | * Returns true if you have to wait for migration thread. |
1017 | */ | 1017 | */ |
1018 | static int | 1018 | static int |
1019 | migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) | 1019 | migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) |
1020 | { | 1020 | { |
1021 | struct rq *rq = task_rq(p); | 1021 | struct rq *rq = task_rq(p); |
1022 | 1022 | ||
1023 | /* | 1023 | /* |
1024 | * If the task is not on a runqueue (and not running), then | 1024 | * If the task is not on a runqueue (and not running), then |
1025 | * it is sufficient to simply update the task's cpu field. | 1025 | * it is sufficient to simply update the task's cpu field. |
1026 | */ | 1026 | */ |
1027 | if (!p->se.on_rq && !task_running(rq, p)) { | 1027 | if (!p->se.on_rq && !task_running(rq, p)) { |
1028 | set_task_cpu(p, dest_cpu); | 1028 | set_task_cpu(p, dest_cpu); |
1029 | return 0; | 1029 | return 0; |
1030 | } | 1030 | } |
1031 | 1031 | ||
1032 | init_completion(&req->done); | 1032 | init_completion(&req->done); |
1033 | req->task = p; | 1033 | req->task = p; |
1034 | req->dest_cpu = dest_cpu; | 1034 | req->dest_cpu = dest_cpu; |
1035 | list_add(&req->list, &rq->migration_queue); | 1035 | list_add(&req->list, &rq->migration_queue); |
1036 | 1036 | ||
1037 | return 1; | 1037 | return 1; |
1038 | } | 1038 | } |
1039 | 1039 | ||
1040 | /* | 1040 | /* |
1041 | * wait_task_inactive - wait for a thread to unschedule. | 1041 | * wait_task_inactive - wait for a thread to unschedule. |
1042 | * | 1042 | * |
1043 | * The caller must ensure that the task *will* unschedule sometime soon, | 1043 | * The caller must ensure that the task *will* unschedule sometime soon, |
1044 | * else this function might spin for a *long* time. This function can't | 1044 | * else this function might spin for a *long* time. This function can't |
1045 | * be called with interrupts off, or it may introduce deadlock with | 1045 | * be called with interrupts off, or it may introduce deadlock with |
1046 | * smp_call_function() if an IPI is sent by the same process we are | 1046 | * smp_call_function() if an IPI is sent by the same process we are |
1047 | * waiting to become inactive. | 1047 | * waiting to become inactive. |
1048 | */ | 1048 | */ |
1049 | void wait_task_inactive(struct task_struct *p) | 1049 | void wait_task_inactive(struct task_struct *p) |
1050 | { | 1050 | { |
1051 | unsigned long flags; | 1051 | unsigned long flags; |
1052 | int running, on_rq; | 1052 | int running, on_rq; |
1053 | struct rq *rq; | 1053 | struct rq *rq; |
1054 | 1054 | ||
1055 | repeat: | 1055 | repeat: |
1056 | /* | 1056 | /* |
1057 | * We do the initial early heuristics without holding | 1057 | * We do the initial early heuristics without holding |
1058 | * any task-queue locks at all. We'll only try to get | 1058 | * any task-queue locks at all. We'll only try to get |
1059 | * the runqueue lock when things look like they will | 1059 | * the runqueue lock when things look like they will |
1060 | * work out! | 1060 | * work out! |
1061 | */ | 1061 | */ |
1062 | rq = task_rq(p); | 1062 | rq = task_rq(p); |
1063 | 1063 | ||
1064 | /* | 1064 | /* |
1065 | * If the task is actively running on another CPU | 1065 | * If the task is actively running on another CPU |
1066 | * still, just relax and busy-wait without holding | 1066 | * still, just relax and busy-wait without holding |
1067 | * any locks. | 1067 | * any locks. |
1068 | * | 1068 | * |
1069 | * NOTE! Since we don't hold any locks, it's not | 1069 | * NOTE! Since we don't hold any locks, it's not |
1070 | * even sure that "rq" stays as the right runqueue! | 1070 | * even sure that "rq" stays as the right runqueue! |
1071 | * But we don't care, since "task_running()" will | 1071 | * But we don't care, since "task_running()" will |
1072 | * return false if the runqueue has changed and p | 1072 | * return false if the runqueue has changed and p |
1073 | * is actually now running somewhere else! | 1073 | * is actually now running somewhere else! |
1074 | */ | 1074 | */ |
1075 | while (task_running(rq, p)) | 1075 | while (task_running(rq, p)) |
1076 | cpu_relax(); | 1076 | cpu_relax(); |
1077 | 1077 | ||
1078 | /* | 1078 | /* |
1079 | * Ok, time to look more closely! We need the rq | 1079 | * Ok, time to look more closely! We need the rq |
1080 | * lock now, to be *sure*. If we're wrong, we'll | 1080 | * lock now, to be *sure*. If we're wrong, we'll |
1081 | * just go back and repeat. | 1081 | * just go back and repeat. |
1082 | */ | 1082 | */ |
1083 | rq = task_rq_lock(p, &flags); | 1083 | rq = task_rq_lock(p, &flags); |
1084 | running = task_running(rq, p); | 1084 | running = task_running(rq, p); |
1085 | on_rq = p->se.on_rq; | 1085 | on_rq = p->se.on_rq; |
1086 | task_rq_unlock(rq, &flags); | 1086 | task_rq_unlock(rq, &flags); |
1087 | 1087 | ||
1088 | /* | 1088 | /* |
1089 | * Was it really running after all now that we | 1089 | * Was it really running after all now that we |
1090 | * checked with the proper locks actually held? | 1090 | * checked with the proper locks actually held? |
1091 | * | 1091 | * |
1092 | * Oops. Go back and try again.. | 1092 | * Oops. Go back and try again.. |
1093 | */ | 1093 | */ |
1094 | if (unlikely(running)) { | 1094 | if (unlikely(running)) { |
1095 | cpu_relax(); | 1095 | cpu_relax(); |
1096 | goto repeat; | 1096 | goto repeat; |
1097 | } | 1097 | } |
1098 | 1098 | ||
1099 | /* | 1099 | /* |
1100 | * It's not enough that it's not actively running, | 1100 | * It's not enough that it's not actively running, |
1101 | * it must be off the runqueue _entirely_, and not | 1101 | * it must be off the runqueue _entirely_, and not |
1102 | * preempted! | 1102 | * preempted! |
1103 | * | 1103 | * |
1104 | * So if it was still runnable (but just not actively | 1104 | * So if it was still runnable (but just not actively |
1105 | * running right now), it's preempted, and we should | 1105 | * running right now), it's preempted, and we should |
1106 | * yield - it could be a while. | 1106 | * yield - it could be a while. |
1107 | */ | 1107 | */ |
1108 | if (unlikely(on_rq)) { | 1108 | if (unlikely(on_rq)) { |
1109 | yield(); | 1109 | yield(); |
1110 | goto repeat; | 1110 | goto repeat; |
1111 | } | 1111 | } |
1112 | 1112 | ||
1113 | /* | 1113 | /* |
1114 | * Ahh, all good. It wasn't running, and it wasn't | 1114 | * Ahh, all good. It wasn't running, and it wasn't |
1115 | * runnable, which means that it will never become | 1115 | * runnable, which means that it will never become |
1116 | * running in the future either. We're all done! | 1116 | * running in the future either. We're all done! |
1117 | */ | 1117 | */ |
1118 | } | 1118 | } |
1119 | 1119 | ||
1120 | /*** | 1120 | /*** |
1121 | * kick_process - kick a running thread to enter/exit the kernel | 1121 | * kick_process - kick a running thread to enter/exit the kernel |
1122 | * @p: the to-be-kicked thread | 1122 | * @p: the to-be-kicked thread |
1123 | * | 1123 | * |
1124 | * Cause a process which is running on another CPU to enter | 1124 | * Cause a process which is running on another CPU to enter |
1125 | * kernel-mode, without any delay. (to get signals handled.) | 1125 | * kernel-mode, without any delay. (to get signals handled.) |
1126 | * | 1126 | * |
1127 | * NOTE: this function doesn't have to take the runqueue lock, | 1127 | * NOTE: this function doesn't have to take the runqueue lock, |
1128 | * because all it wants to ensure is that the remote task enters | 1128 | * because all it wants to ensure is that the remote task enters |
1129 | * the kernel. If the IPI races and the task has been migrated | 1129 | * the kernel. If the IPI races and the task has been migrated |
1130 | * to another CPU then no harm is done and the purpose has been | 1130 | * to another CPU then no harm is done and the purpose has been |
1131 | * achieved as well. | 1131 | * achieved as well. |
1132 | */ | 1132 | */ |
1133 | void kick_process(struct task_struct *p) | 1133 | void kick_process(struct task_struct *p) |
1134 | { | 1134 | { |
1135 | int cpu; | 1135 | int cpu; |
1136 | 1136 | ||
1137 | preempt_disable(); | 1137 | preempt_disable(); |
1138 | cpu = task_cpu(p); | 1138 | cpu = task_cpu(p); |
1139 | if ((cpu != smp_processor_id()) && task_curr(p)) | 1139 | if ((cpu != smp_processor_id()) && task_curr(p)) |
1140 | smp_send_reschedule(cpu); | 1140 | smp_send_reschedule(cpu); |
1141 | preempt_enable(); | 1141 | preempt_enable(); |
1142 | } | 1142 | } |
1143 | 1143 | ||
1144 | /* | 1144 | /* |
1145 | * Return a low guess at the load of a migration-source cpu weighted | 1145 | * Return a low guess at the load of a migration-source cpu weighted |
1146 | * according to the scheduling class and "nice" value. | 1146 | * according to the scheduling class and "nice" value. |
1147 | * | 1147 | * |
1148 | * We want to under-estimate the load of migration sources, to | 1148 | * We want to under-estimate the load of migration sources, to |
1149 | * balance conservatively. | 1149 | * balance conservatively. |
1150 | */ | 1150 | */ |
1151 | static inline unsigned long source_load(int cpu, int type) | 1151 | static inline unsigned long source_load(int cpu, int type) |
1152 | { | 1152 | { |
1153 | struct rq *rq = cpu_rq(cpu); | 1153 | struct rq *rq = cpu_rq(cpu); |
1154 | unsigned long total = weighted_cpuload(cpu); | 1154 | unsigned long total = weighted_cpuload(cpu); |
1155 | 1155 | ||
1156 | if (type == 0) | 1156 | if (type == 0) |
1157 | return total; | 1157 | return total; |
1158 | 1158 | ||
1159 | return min(rq->cpu_load[type-1], total); | 1159 | return min(rq->cpu_load[type-1], total); |
1160 | } | 1160 | } |
1161 | 1161 | ||
1162 | /* | 1162 | /* |
1163 | * Return a high guess at the load of a migration-target cpu weighted | 1163 | * Return a high guess at the load of a migration-target cpu weighted |
1164 | * according to the scheduling class and "nice" value. | 1164 | * according to the scheduling class and "nice" value. |
1165 | */ | 1165 | */ |
1166 | static inline unsigned long target_load(int cpu, int type) | 1166 | static inline unsigned long target_load(int cpu, int type) |
1167 | { | 1167 | { |
1168 | struct rq *rq = cpu_rq(cpu); | 1168 | struct rq *rq = cpu_rq(cpu); |
1169 | unsigned long total = weighted_cpuload(cpu); | 1169 | unsigned long total = weighted_cpuload(cpu); |
1170 | 1170 | ||
1171 | if (type == 0) | 1171 | if (type == 0) |
1172 | return total; | 1172 | return total; |
1173 | 1173 | ||
1174 | return max(rq->cpu_load[type-1], total); | 1174 | return max(rq->cpu_load[type-1], total); |
1175 | } | 1175 | } |
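
The min()/max() asymmetry between source_load() and target_load() is a deliberate conservative bias: under-estimate the queue tasks might be pulled from and over-estimate the one they might be pushed to, so that borderline differences do not trigger migrations. A tiny numeric illustration follows; the history and instantaneous load values are made up.

/* Conservative bias of source_load()/target_load(), illustrated. */
#include <stdio.h>

int main(void)
{
	unsigned long hist  = 1800;	/* decayed cpu_load[type-1] history */
	unsigned long total = 2048;	/* instantaneous weighted_cpuload() */

	printf("seen as migration source (min): %lu\n", hist < total ? hist : total);
	printf("seen as migration target (max): %lu\n", hist > total ? hist : total);
	return 0;
}
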
1176 | 1176 | ||
1177 | /* | 1177 | /* |
1178 | * Return the average load per task on the cpu's run queue | 1178 | * Return the average load per task on the cpu's run queue |
1179 | */ | 1179 | */ |
1180 | static inline unsigned long cpu_avg_load_per_task(int cpu) | 1180 | static inline unsigned long cpu_avg_load_per_task(int cpu) |
1181 | { | 1181 | { |
1182 | struct rq *rq = cpu_rq(cpu); | 1182 | struct rq *rq = cpu_rq(cpu); |
1183 | unsigned long total = weighted_cpuload(cpu); | 1183 | unsigned long total = weighted_cpuload(cpu); |
1184 | unsigned long n = rq->nr_running; | 1184 | unsigned long n = rq->nr_running; |
1185 | 1185 | ||
1186 | return n ? total / n : SCHED_LOAD_SCALE; | 1186 | return n ? total / n : SCHED_LOAD_SCALE; |
1187 | } | 1187 | } |
1188 | 1188 | ||
1189 | /* | 1189 | /* |
1190 | * find_idlest_group finds and returns the least busy CPU group within the | 1190 | * find_idlest_group finds and returns the least busy CPU group within the |
1191 | * domain. | 1191 | * domain. |
1192 | */ | 1192 | */ |
1193 | static struct sched_group * | 1193 | static struct sched_group * |
1194 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | 1194 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) |
1195 | { | 1195 | { |
1196 | struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; | 1196 | struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; |
1197 | unsigned long min_load = ULONG_MAX, this_load = 0; | 1197 | unsigned long min_load = ULONG_MAX, this_load = 0; |
1198 | int load_idx = sd->forkexec_idx; | 1198 | int load_idx = sd->forkexec_idx; |
1199 | int imbalance = 100 + (sd->imbalance_pct-100)/2; | 1199 | int imbalance = 100 + (sd->imbalance_pct-100)/2; |
1200 | 1200 | ||
1201 | do { | 1201 | do { |
1202 | unsigned long load, avg_load; | 1202 | unsigned long load, avg_load; |
1203 | int local_group; | 1203 | int local_group; |
1204 | int i; | 1204 | int i; |
1205 | 1205 | ||
1206 | /* Skip over this group if it has no CPUs allowed */ | 1206 | /* Skip over this group if it has no CPUs allowed */ |
1207 | if (!cpus_intersects(group->cpumask, p->cpus_allowed)) | 1207 | if (!cpus_intersects(group->cpumask, p->cpus_allowed)) |
1208 | goto nextgroup; | 1208 | goto nextgroup; |
1209 | 1209 | ||
1210 | local_group = cpu_isset(this_cpu, group->cpumask); | 1210 | local_group = cpu_isset(this_cpu, group->cpumask); |
1211 | 1211 | ||
1212 | /* Tally up the load of all CPUs in the group */ | 1212 | /* Tally up the load of all CPUs in the group */ |
1213 | avg_load = 0; | 1213 | avg_load = 0; |
1214 | 1214 | ||
1215 | for_each_cpu_mask(i, group->cpumask) { | 1215 | for_each_cpu_mask(i, group->cpumask) { |
1216 | /* Bias balancing toward cpus of our domain */ | 1216 | /* Bias balancing toward cpus of our domain */ |
1217 | if (local_group) | 1217 | if (local_group) |
1218 | load = source_load(i, load_idx); | 1218 | load = source_load(i, load_idx); |
1219 | else | 1219 | else |
1220 | load = target_load(i, load_idx); | 1220 | load = target_load(i, load_idx); |
1221 | 1221 | ||
1222 | avg_load += load; | 1222 | avg_load += load; |
1223 | } | 1223 | } |
1224 | 1224 | ||
1225 | /* Adjust by relative CPU power of the group */ | 1225 | /* Adjust by relative CPU power of the group */ |
1226 | avg_load = sg_div_cpu_power(group, | 1226 | avg_load = sg_div_cpu_power(group, |
1227 | avg_load * SCHED_LOAD_SCALE); | 1227 | avg_load * SCHED_LOAD_SCALE); |
1228 | 1228 | ||
1229 | if (local_group) { | 1229 | if (local_group) { |
1230 | this_load = avg_load; | 1230 | this_load = avg_load; |
1231 | this = group; | 1231 | this = group; |
1232 | } else if (avg_load < min_load) { | 1232 | } else if (avg_load < min_load) { |
1233 | min_load = avg_load; | 1233 | min_load = avg_load; |
1234 | idlest = group; | 1234 | idlest = group; |
1235 | } | 1235 | } |
1236 | nextgroup: | 1236 | nextgroup: |
1237 | group = group->next; | 1237 | group = group->next; |
1238 | } while (group != sd->groups); | 1238 | } while (group != sd->groups); |
1239 | 1239 | ||
1240 | if (!idlest || 100*this_load < imbalance*min_load) | 1240 | if (!idlest || 100*this_load < imbalance*min_load) |
1241 | return NULL; | 1241 | return NULL; |
1242 | return idlest; | 1242 | return idlest; |
1243 | } | 1243 | } |
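
The final test in find_idlest_group() only abandons the local group when its load exceeds the idlest group's by a margin derived from imbalance_pct, halved. A worked example, assuming a typical imbalance_pct of 125 (the load values are arbitrary):

/* Worked example of the imbalance threshold at the end of find_idlest_group(). */
#include <stdio.h>

int main(void)
{
	unsigned long imbalance_pct = 125;				/* assumed typical value */
	unsigned long imbalance = 100 + (imbalance_pct - 100) / 2;	/* = 112 */
	unsigned long this_load = 1000, min_load = 900;

	if (100 * this_load < imbalance * min_load)
		printf("stay local: %lu < %lu (difference under ~12%%)\n",
		       100 * this_load, imbalance * min_load);
	else
		printf("move toward the idlest group\n");
	return 0;
}

Here the local group is about 11% busier than the idlest one, which is still inside the halved margin, so the function returns NULL and the task stays put.
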
1244 | 1244 | ||
1245 | /* | 1245 | /* |
1246 | * find_idlest_cpu - find the idlest cpu among the cpus in group. | 1246 | * find_idlest_cpu - find the idlest cpu among the cpus in group. |
1247 | */ | 1247 | */ |
1248 | static int | 1248 | static int |
1249 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | 1249 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) |
1250 | { | 1250 | { |
1251 | cpumask_t tmp; | 1251 | cpumask_t tmp; |
1252 | unsigned long load, min_load = ULONG_MAX; | 1252 | unsigned long load, min_load = ULONG_MAX; |
1253 | int idlest = -1; | 1253 | int idlest = -1; |
1254 | int i; | 1254 | int i; |
1255 | 1255 | ||
1256 | /* Traverse only the allowed CPUs */ | 1256 | /* Traverse only the allowed CPUs */ |
1257 | cpus_and(tmp, group->cpumask, p->cpus_allowed); | 1257 | cpus_and(tmp, group->cpumask, p->cpus_allowed); |
1258 | 1258 | ||
1259 | for_each_cpu_mask(i, tmp) { | 1259 | for_each_cpu_mask(i, tmp) { |
1260 | load = weighted_cpuload(i); | 1260 | load = weighted_cpuload(i); |
1261 | 1261 | ||
1262 | if (load < min_load || (load == min_load && i == this_cpu)) { | 1262 | if (load < min_load || (load == min_load && i == this_cpu)) { |
1263 | min_load = load; | 1263 | min_load = load; |
1264 | idlest = i; | 1264 | idlest = i; |
1265 | } | 1265 | } |
1266 | } | 1266 | } |
1267 | 1267 | ||
1268 | return idlest; | 1268 | return idlest; |
1269 | } | 1269 | } |
1270 | 1270 | ||
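A quick worked example of the selection above (loads invented for illustration): if the allowed CPUs in the group report weighted loads of 3072, 1024 and 1024, and this_cpu is the last of the three, the loop first records the second CPU at 1024 and then replaces it with this_cpu, because an equal load on this_cpu wins the tie and avoids a needless cross-CPU placement.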
1271 | /* | 1271 | /* |
1272 | * sched_balance_self: balance the current task (running on cpu) in domains | 1272 | * sched_balance_self: balance the current task (running on cpu) in domains |
1273 | * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and | 1273 | * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and |
1274 | * SD_BALANCE_EXEC. | 1274 | * SD_BALANCE_EXEC. |
1275 | * | 1275 | * |
1276 | * Balance, ie. select the least loaded group. | 1276 | * Balance, ie. select the least loaded group. |
1277 | * | 1277 | * |
1278 | * Returns the target CPU number, or the same CPU if no balancing is needed. | 1278 | * Returns the target CPU number, or the same CPU if no balancing is needed. |
1279 | * | 1279 | * |
1280 | * preempt must be disabled. | 1280 | * preempt must be disabled. |
1281 | */ | 1281 | */ |
1282 | static int sched_balance_self(int cpu, int flag) | 1282 | static int sched_balance_self(int cpu, int flag) |
1283 | { | 1283 | { |
1284 | struct task_struct *t = current; | 1284 | struct task_struct *t = current; |
1285 | struct sched_domain *tmp, *sd = NULL; | 1285 | struct sched_domain *tmp, *sd = NULL; |
1286 | 1286 | ||
1287 | for_each_domain(cpu, tmp) { | 1287 | for_each_domain(cpu, tmp) { |
1288 | /* | 1288 | /* |
1289 | * If power savings logic is enabled for a domain, stop there. | 1289 | * If power savings logic is enabled for a domain, stop there. |
1290 | */ | 1290 | */ |
1291 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | 1291 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) |
1292 | break; | 1292 | break; |
1293 | if (tmp->flags & flag) | 1293 | if (tmp->flags & flag) |
1294 | sd = tmp; | 1294 | sd = tmp; |
1295 | } | 1295 | } |
1296 | 1296 | ||
1297 | while (sd) { | 1297 | while (sd) { |
1298 | cpumask_t span; | 1298 | cpumask_t span; |
1299 | struct sched_group *group; | 1299 | struct sched_group *group; |
1300 | int new_cpu, weight; | 1300 | int new_cpu, weight; |
1301 | 1301 | ||
1302 | if (!(sd->flags & flag)) { | 1302 | if (!(sd->flags & flag)) { |
1303 | sd = sd->child; | 1303 | sd = sd->child; |
1304 | continue; | 1304 | continue; |
1305 | } | 1305 | } |
1306 | 1306 | ||
1307 | span = sd->span; | 1307 | span = sd->span; |
1308 | group = find_idlest_group(sd, t, cpu); | 1308 | group = find_idlest_group(sd, t, cpu); |
1309 | if (!group) { | 1309 | if (!group) { |
1310 | sd = sd->child; | 1310 | sd = sd->child; |
1311 | continue; | 1311 | continue; |
1312 | } | 1312 | } |
1313 | 1313 | ||
1314 | new_cpu = find_idlest_cpu(group, t, cpu); | 1314 | new_cpu = find_idlest_cpu(group, t, cpu); |
1315 | if (new_cpu == -1 || new_cpu == cpu) { | 1315 | if (new_cpu == -1 || new_cpu == cpu) { |
1316 | /* Now try balancing at a lower domain level of cpu */ | 1316 | /* Now try balancing at a lower domain level of cpu */ |
1317 | sd = sd->child; | 1317 | sd = sd->child; |
1318 | continue; | 1318 | continue; |
1319 | } | 1319 | } |
1320 | 1320 | ||
1321 | /* Now try balancing at a lower domain level of new_cpu */ | 1321 | /* Now try balancing at a lower domain level of new_cpu */ |
1322 | cpu = new_cpu; | 1322 | cpu = new_cpu; |
1323 | sd = NULL; | 1323 | sd = NULL; |
1324 | weight = cpus_weight(span); | 1324 | weight = cpus_weight(span); |
1325 | for_each_domain(cpu, tmp) { | 1325 | for_each_domain(cpu, tmp) { |
1326 | if (weight <= cpus_weight(tmp->span)) | 1326 | if (weight <= cpus_weight(tmp->span)) |
1327 | break; | 1327 | break; |
1328 | if (tmp->flags & flag) | 1328 | if (tmp->flags & flag) |
1329 | sd = tmp; | 1329 | sd = tmp; |
1330 | } | 1330 | } |
1331 | /* while loop will break here if sd == NULL */ | 1331 | /* while loop will break here if sd == NULL */ |
1332 | } | 1332 | } |
1333 | 1333 | ||
1334 | return cpu; | 1334 | return cpu; |
1335 | } | 1335 | } |
1336 | 1336 | ||
1337 | #endif /* CONFIG_SMP */ | 1337 | #endif /* CONFIG_SMP */ |
1338 | 1338 | ||
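The control flow of sched_balance_self() is easier to see on a toy model: climb the domain hierarchy remembering the widest level that has the requested flag, then descend level by level, re-balancing around whichever CPU was chosen. The sketch below is a user-space analogy with an invented three-level topology; it only mimics the walk, not the group/CPU load calculations.

#include <stdio.h>

struct dom {
	struct dom *parent, *child;
	int weight;		/* number of CPUs spanned by this level */
	int has_flag;		/* stands in for SD_BALANCE_FORK / SD_BALANCE_EXEC */
};

int main(void)
{
	struct dom smt  = { .weight = 2,  .has_flag = 0 };
	struct dom core = { .weight = 8,  .has_flag = 1 };
	struct dom node = { .weight = 32, .has_flag = 1 };
	struct dom *sd = NULL, *tmp;

	smt.parent = &core; core.parent = &node; node.parent = NULL;
	node.child = &core; core.child = &smt;  smt.child  = NULL;

	/* climb: remember the widest domain that has the flag set */
	for (tmp = &smt; tmp; tmp = tmp->parent)
		if (tmp->has_flag)
			sd = tmp;

	/* descend: the kernel picks an idlest group/cpu per level and may
	 * restart around the new cpu; here we only show the levels visited */
	while (sd) {
		if (!sd->has_flag) {
			sd = sd->child;
			continue;
		}
		printf("balance in domain spanning %d cpus\n", sd->weight);
		sd = sd->child;
	}
	return 0;
}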
1339 | /* | 1339 | /* |
1340 | * wake_idle() will wake a task on an idle cpu if task->cpu is | 1340 | * wake_idle() will wake a task on an idle cpu if task->cpu is |
1341 | * not idle and an idle cpu is available. The span of cpus to | 1341 | * not idle and an idle cpu is available. The span of cpus to |
1342 | * search starts with cpus closest then further out as needed, | 1342 | * search starts with cpus closest then further out as needed, |
1343 | * so we always favor a closer, idle cpu. | 1343 | * so we always favor a closer, idle cpu. |
1344 | * | 1344 | * |
1345 | * Returns the CPU we should wake onto. | 1345 | * Returns the CPU we should wake onto. |
1346 | */ | 1346 | */ |
1347 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) | 1347 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) |
1348 | static int wake_idle(int cpu, struct task_struct *p) | 1348 | static int wake_idle(int cpu, struct task_struct *p) |
1349 | { | 1349 | { |
1350 | cpumask_t tmp; | 1350 | cpumask_t tmp; |
1351 | struct sched_domain *sd; | 1351 | struct sched_domain *sd; |
1352 | int i; | 1352 | int i; |
1353 | 1353 | ||
1354 | /* | 1354 | /* |
1355 | * If it is idle, then it is the best cpu to run this task. | 1355 | * If it is idle, then it is the best cpu to run this task. |
1356 | * | 1356 | * |
1357 | * This cpu is also the best, if it has more than one task already. | 1357 | * This cpu is also the best, if it has more than one task already. |
1358 | * Siblings must also be busy (in most cases) as they didn't already | 1358 | * Siblings must also be busy (in most cases) as they didn't already |
1359 | * pick up the extra load from this cpu and hence we need not check | 1359 | * pick up the extra load from this cpu and hence we need not check |
1360 | * sibling runqueue info. This will avoid the checks and cache miss | 1360 | * sibling runqueue info. This will avoid the checks and cache miss |
1361 | * penalties associated with that. | 1361 | * penalties associated with that. |
1362 | */ | 1362 | */ |
1363 | if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) | 1363 | if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) |
1364 | return cpu; | 1364 | return cpu; |
1365 | 1365 | ||
1366 | for_each_domain(cpu, sd) { | 1366 | for_each_domain(cpu, sd) { |
1367 | if (sd->flags & SD_WAKE_IDLE) { | 1367 | if (sd->flags & SD_WAKE_IDLE) { |
1368 | cpus_and(tmp, sd->span, p->cpus_allowed); | 1368 | cpus_and(tmp, sd->span, p->cpus_allowed); |
1369 | for_each_cpu_mask(i, tmp) { | 1369 | for_each_cpu_mask(i, tmp) { |
1370 | if (idle_cpu(i)) | 1370 | if (idle_cpu(i)) |
1371 | return i; | 1371 | return i; |
1372 | } | 1372 | } |
1373 | } else { | 1373 | } else { |
1374 | break; | 1374 | break; |
1375 | } | 1375 | } |
1376 | } | 1376 | } |
1377 | return cpu; | 1377 | return cpu; |
1378 | } | 1378 | } |
1379 | #else | 1379 | #else |
1380 | static inline int wake_idle(int cpu, struct task_struct *p) | 1380 | static inline int wake_idle(int cpu, struct task_struct *p) |
1381 | { | 1381 | { |
1382 | return cpu; | 1382 | return cpu; |
1383 | } | 1383 | } |
1384 | #endif | 1384 | #endif |
1385 | 1385 | ||
1386 | /*** | 1386 | /*** |
1387 | * try_to_wake_up - wake up a thread | 1387 | * try_to_wake_up - wake up a thread |
1388 | * @p: the to-be-woken-up thread | 1388 | * @p: the to-be-woken-up thread |
1389 | * @state: the mask of task states that can be woken | 1389 | * @state: the mask of task states that can be woken |
1390 | * @sync: do a synchronous wakeup? | 1390 | * @sync: do a synchronous wakeup? |
1391 | * | 1391 | * |
1392 | * Put it on the run-queue if it's not already there. The "current" | 1392 | * Put it on the run-queue if it's not already there. The "current" |
1393 | * thread is always on the run-queue (except when the actual | 1393 | * thread is always on the run-queue (except when the actual |
1394 | * re-schedule is in progress), and as such you're allowed to do | 1394 | * re-schedule is in progress), and as such you're allowed to do |
1395 | * the simpler "current->state = TASK_RUNNING" to mark yourself | 1395 | * the simpler "current->state = TASK_RUNNING" to mark yourself |
1396 | * runnable without the overhead of this. | 1396 | * runnable without the overhead of this. |
1397 | * | 1397 | * |
1398 | * returns failure only if the task is already active. | 1398 | * returns failure only if the task is already active. |
1399 | */ | 1399 | */ |
1400 | static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | 1400 | static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) |
1401 | { | 1401 | { |
1402 | int cpu, this_cpu, success = 0; | 1402 | int cpu, this_cpu, success = 0; |
1403 | unsigned long flags; | 1403 | unsigned long flags; |
1404 | long old_state; | 1404 | long old_state; |
1405 | struct rq *rq; | 1405 | struct rq *rq; |
1406 | #ifdef CONFIG_SMP | 1406 | #ifdef CONFIG_SMP |
1407 | struct sched_domain *sd, *this_sd = NULL; | 1407 | struct sched_domain *sd, *this_sd = NULL; |
1408 | unsigned long load, this_load; | 1408 | unsigned long load, this_load; |
1409 | int new_cpu; | 1409 | int new_cpu; |
1410 | #endif | 1410 | #endif |
1411 | 1411 | ||
1412 | rq = task_rq_lock(p, &flags); | 1412 | rq = task_rq_lock(p, &flags); |
1413 | old_state = p->state; | 1413 | old_state = p->state; |
1414 | if (!(old_state & state)) | 1414 | if (!(old_state & state)) |
1415 | goto out; | 1415 | goto out; |
1416 | 1416 | ||
1417 | if (p->se.on_rq) | 1417 | if (p->se.on_rq) |
1418 | goto out_running; | 1418 | goto out_running; |
1419 | 1419 | ||
1420 | cpu = task_cpu(p); | 1420 | cpu = task_cpu(p); |
1421 | this_cpu = smp_processor_id(); | 1421 | this_cpu = smp_processor_id(); |
1422 | 1422 | ||
1423 | #ifdef CONFIG_SMP | 1423 | #ifdef CONFIG_SMP |
1424 | if (unlikely(task_running(rq, p))) | 1424 | if (unlikely(task_running(rq, p))) |
1425 | goto out_activate; | 1425 | goto out_activate; |
1426 | 1426 | ||
1427 | new_cpu = cpu; | 1427 | new_cpu = cpu; |
1428 | 1428 | ||
1429 | schedstat_inc(rq, ttwu_cnt); | 1429 | schedstat_inc(rq, ttwu_cnt); |
1430 | if (cpu == this_cpu) { | 1430 | if (cpu == this_cpu) { |
1431 | schedstat_inc(rq, ttwu_local); | 1431 | schedstat_inc(rq, ttwu_local); |
1432 | goto out_set_cpu; | 1432 | goto out_set_cpu; |
1433 | } | 1433 | } |
1434 | 1434 | ||
1435 | for_each_domain(this_cpu, sd) { | 1435 | for_each_domain(this_cpu, sd) { |
1436 | if (cpu_isset(cpu, sd->span)) { | 1436 | if (cpu_isset(cpu, sd->span)) { |
1437 | schedstat_inc(sd, ttwu_wake_remote); | 1437 | schedstat_inc(sd, ttwu_wake_remote); |
1438 | this_sd = sd; | 1438 | this_sd = sd; |
1439 | break; | 1439 | break; |
1440 | } | 1440 | } |
1441 | } | 1441 | } |
1442 | 1442 | ||
1443 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | 1443 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) |
1444 | goto out_set_cpu; | 1444 | goto out_set_cpu; |
1445 | 1445 | ||
1446 | /* | 1446 | /* |
1447 | * Check for affine wakeup and passive balancing possibilities. | 1447 | * Check for affine wakeup and passive balancing possibilities. |
1448 | */ | 1448 | */ |
1449 | if (this_sd) { | 1449 | if (this_sd) { |
1450 | int idx = this_sd->wake_idx; | 1450 | int idx = this_sd->wake_idx; |
1451 | unsigned int imbalance; | 1451 | unsigned int imbalance; |
1452 | 1452 | ||
1453 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; | 1453 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; |
1454 | 1454 | ||
1455 | load = source_load(cpu, idx); | 1455 | load = source_load(cpu, idx); |
1456 | this_load = target_load(this_cpu, idx); | 1456 | this_load = target_load(this_cpu, idx); |
1457 | 1457 | ||
1458 | new_cpu = this_cpu; /* Wake to this CPU if we can */ | 1458 | new_cpu = this_cpu; /* Wake to this CPU if we can */ |
1459 | 1459 | ||
1460 | if (this_sd->flags & SD_WAKE_AFFINE) { | 1460 | if (this_sd->flags & SD_WAKE_AFFINE) { |
1461 | unsigned long tl = this_load; | 1461 | unsigned long tl = this_load; |
1462 | unsigned long tl_per_task; | 1462 | unsigned long tl_per_task; |
1463 | 1463 | ||
1464 | tl_per_task = cpu_avg_load_per_task(this_cpu); | 1464 | tl_per_task = cpu_avg_load_per_task(this_cpu); |
1465 | 1465 | ||
1466 | /* | 1466 | /* |
1467 | * If sync wakeup then subtract the (maximum possible) | 1467 | * If sync wakeup then subtract the (maximum possible) |
1468 | * effect of the currently running task from the load | 1468 | * effect of the currently running task from the load |
1469 | * of the current CPU: | 1469 | * of the current CPU: |
1470 | */ | 1470 | */ |
1471 | if (sync) | 1471 | if (sync) |
1472 | tl -= current->se.load.weight; | 1472 | tl -= current->se.load.weight; |
1473 | 1473 | ||
1474 | if ((tl <= load && | 1474 | if ((tl <= load && |
1475 | tl + target_load(cpu, idx) <= tl_per_task) || | 1475 | tl + target_load(cpu, idx) <= tl_per_task) || |
1476 | 100*(tl + p->se.load.weight) <= imbalance*load) { | 1476 | 100*(tl + p->se.load.weight) <= imbalance*load) { |
1477 | /* | 1477 | /* |
1478 | * This domain has SD_WAKE_AFFINE and | 1478 | * This domain has SD_WAKE_AFFINE and |
1479 | * p is cache cold in this domain, and | 1479 | * p is cache cold in this domain, and |
1480 | * there is no bad imbalance. | 1480 | * there is no bad imbalance. |
1481 | */ | 1481 | */ |
1482 | schedstat_inc(this_sd, ttwu_move_affine); | 1482 | schedstat_inc(this_sd, ttwu_move_affine); |
1483 | goto out_set_cpu; | 1483 | goto out_set_cpu; |
1484 | } | 1484 | } |
1485 | } | 1485 | } |
1486 | 1486 | ||
1487 | /* | 1487 | /* |
1488 | * Start passive balancing when half the imbalance_pct | 1488 | * Start passive balancing when half the imbalance_pct |
1489 | * limit is reached. | 1489 | * limit is reached. |
1490 | */ | 1490 | */ |
1491 | if (this_sd->flags & SD_WAKE_BALANCE) { | 1491 | if (this_sd->flags & SD_WAKE_BALANCE) { |
1492 | if (imbalance*this_load <= 100*load) { | 1492 | if (imbalance*this_load <= 100*load) { |
1493 | schedstat_inc(this_sd, ttwu_move_balance); | 1493 | schedstat_inc(this_sd, ttwu_move_balance); |
1494 | goto out_set_cpu; | 1494 | goto out_set_cpu; |
1495 | } | 1495 | } |
1496 | } | 1496 | } |
1497 | } | 1497 | } |
1498 | 1498 | ||
1499 | new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ | 1499 | new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ |
1500 | out_set_cpu: | 1500 | out_set_cpu: |
1501 | new_cpu = wake_idle(new_cpu, p); | 1501 | new_cpu = wake_idle(new_cpu, p); |
1502 | if (new_cpu != cpu) { | 1502 | if (new_cpu != cpu) { |
1503 | set_task_cpu(p, new_cpu); | 1503 | set_task_cpu(p, new_cpu); |
1504 | task_rq_unlock(rq, &flags); | 1504 | task_rq_unlock(rq, &flags); |
1505 | /* might preempt at this point */ | 1505 | /* might preempt at this point */ |
1506 | rq = task_rq_lock(p, &flags); | 1506 | rq = task_rq_lock(p, &flags); |
1507 | old_state = p->state; | 1507 | old_state = p->state; |
1508 | if (!(old_state & state)) | 1508 | if (!(old_state & state)) |
1509 | goto out; | 1509 | goto out; |
1510 | if (p->se.on_rq) | 1510 | if (p->se.on_rq) |
1511 | goto out_running; | 1511 | goto out_running; |
1512 | 1512 | ||
1513 | this_cpu = smp_processor_id(); | 1513 | this_cpu = smp_processor_id(); |
1514 | cpu = task_cpu(p); | 1514 | cpu = task_cpu(p); |
1515 | } | 1515 | } |
1516 | 1516 | ||
1517 | out_activate: | 1517 | out_activate: |
1518 | #endif /* CONFIG_SMP */ | 1518 | #endif /* CONFIG_SMP */ |
1519 | activate_task(rq, p, 1); | 1519 | activate_task(rq, p, 1); |
1520 | /* | 1520 | /* |
1521 | * Sync wakeups (i.e. those types of wakeups where the waker | 1521 | * Sync wakeups (i.e. those types of wakeups where the waker |
1522 | * has indicated that it will leave the CPU in short order) | 1522 | * has indicated that it will leave the CPU in short order) |
1523 | * don't trigger a preemption, if the woken up task will run on | 1523 | * don't trigger a preemption, if the woken up task will run on |
1524 | * this cpu. (in this case the 'I will reschedule' promise of | 1524 | * this cpu. (in this case the 'I will reschedule' promise of |
1525 | * the waker guarantees that the freshly woken up task is going | 1525 | * the waker guarantees that the freshly woken up task is going |
1526 | * to be considered on this CPU.) | 1526 | * to be considered on this CPU.) |
1527 | */ | 1527 | */ |
1528 | if (!sync || cpu != this_cpu) | 1528 | if (!sync || cpu != this_cpu) |
1529 | check_preempt_curr(rq, p); | 1529 | check_preempt_curr(rq, p); |
1530 | success = 1; | 1530 | success = 1; |
1531 | 1531 | ||
1532 | out_running: | 1532 | out_running: |
1533 | p->state = TASK_RUNNING; | 1533 | p->state = TASK_RUNNING; |
1534 | out: | 1534 | out: |
1535 | task_rq_unlock(rq, &flags); | 1535 | task_rq_unlock(rq, &flags); |
1536 | 1536 | ||
1537 | return success; | 1537 | return success; |
1538 | } | 1538 | } |
1539 | 1539 | ||
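To put numbers on the two wakeup heuristics above (all values invented for illustration): with imbalance_pct = 125 the threshold works out to imbalance = 112. If the previous CPU's source load is load = 2048, the waking CPU's load after the sync adjustment is tl = 1024 and the task weighs p->se.load.weight = 1024, then 100*(1024 + 1024) = 204800 <= 112*2048 = 229376, so the SD_WAKE_AFFINE clause fires and the task is woken on this_cpu. The SD_WAKE_BALANCE test pulls in the same direction from the other side: with this_load = 3072 and load = 4096, 112*3072 = 344064 <= 100*4096 = 409600, so passive balancing also brings the task here.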
1540 | int fastcall wake_up_process(struct task_struct *p) | 1540 | int fastcall wake_up_process(struct task_struct *p) |
1541 | { | 1541 | { |
1542 | return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | | 1542 | return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | |
1543 | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); | 1543 | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); |
1544 | } | 1544 | } |
1545 | EXPORT_SYMBOL(wake_up_process); | 1545 | EXPORT_SYMBOL(wake_up_process); |
1546 | 1546 | ||
1547 | int fastcall wake_up_state(struct task_struct *p, unsigned int state) | 1547 | int fastcall wake_up_state(struct task_struct *p, unsigned int state) |
1548 | { | 1548 | { |
1549 | return try_to_wake_up(p, state, 0); | 1549 | return try_to_wake_up(p, state, 0); |
1550 | } | 1550 | } |
1551 | 1551 | ||
1552 | /* | 1552 | /* |
1553 | * Perform scheduler related setup for a newly forked process p. | 1553 | * Perform scheduler related setup for a newly forked process p. |
1554 | * p is forked by current. | 1554 | * p is forked by current. |
1555 | * | 1555 | * |
1556 | * __sched_fork() is basic setup used by init_idle() too: | 1556 | * __sched_fork() is basic setup used by init_idle() too: |
1557 | */ | 1557 | */ |
1558 | static void __sched_fork(struct task_struct *p) | 1558 | static void __sched_fork(struct task_struct *p) |
1559 | { | 1559 | { |
1560 | p->se.wait_start_fair = 0; | 1560 | p->se.wait_start_fair = 0; |
1561 | p->se.exec_start = 0; | 1561 | p->se.exec_start = 0; |
1562 | p->se.sum_exec_runtime = 0; | 1562 | p->se.sum_exec_runtime = 0; |
1563 | p->se.delta_exec = 0; | 1563 | p->se.delta_exec = 0; |
1564 | p->se.delta_fair_run = 0; | 1564 | p->se.delta_fair_run = 0; |
1565 | p->se.delta_fair_sleep = 0; | 1565 | p->se.delta_fair_sleep = 0; |
1566 | p->se.wait_runtime = 0; | 1566 | p->se.wait_runtime = 0; |
1567 | p->se.sleep_start_fair = 0; | 1567 | p->se.sleep_start_fair = 0; |
1568 | 1568 | ||
1569 | #ifdef CONFIG_SCHEDSTATS | 1569 | #ifdef CONFIG_SCHEDSTATS |
1570 | p->se.wait_start = 0; | 1570 | p->se.wait_start = 0; |
1571 | p->se.sum_wait_runtime = 0; | 1571 | p->se.sum_wait_runtime = 0; |
1572 | p->se.sum_sleep_runtime = 0; | 1572 | p->se.sum_sleep_runtime = 0; |
1573 | p->se.sleep_start = 0; | 1573 | p->se.sleep_start = 0; |
1574 | p->se.block_start = 0; | 1574 | p->se.block_start = 0; |
1575 | p->se.sleep_max = 0; | 1575 | p->se.sleep_max = 0; |
1576 | p->se.block_max = 0; | 1576 | p->se.block_max = 0; |
1577 | p->se.exec_max = 0; | 1577 | p->se.exec_max = 0; |
1578 | p->se.wait_max = 0; | 1578 | p->se.wait_max = 0; |
1579 | p->se.wait_runtime_overruns = 0; | 1579 | p->se.wait_runtime_overruns = 0; |
1580 | p->se.wait_runtime_underruns = 0; | 1580 | p->se.wait_runtime_underruns = 0; |
1581 | #endif | 1581 | #endif |
1582 | 1582 | ||
1583 | INIT_LIST_HEAD(&p->run_list); | 1583 | INIT_LIST_HEAD(&p->run_list); |
1584 | p->se.on_rq = 0; | 1584 | p->se.on_rq = 0; |
1585 | 1585 | ||
1586 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 1586 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
1587 | INIT_HLIST_HEAD(&p->preempt_notifiers); | 1587 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
1588 | #endif | 1588 | #endif |
1589 | 1589 | ||
1590 | /* | 1590 | /* |
1591 | * We mark the process as running here, but have not actually | 1591 | * We mark the process as running here, but have not actually |
1592 | * inserted it onto the runqueue yet. This guarantees that | 1592 | * inserted it onto the runqueue yet. This guarantees that |
1593 | * nobody will actually run it, and a signal or other external | 1593 | * nobody will actually run it, and a signal or other external |
1594 | * event cannot wake it up and insert it on the runqueue either. | 1594 | * event cannot wake it up and insert it on the runqueue either. |
1595 | */ | 1595 | */ |
1596 | p->state = TASK_RUNNING; | 1596 | p->state = TASK_RUNNING; |
1597 | } | 1597 | } |
1598 | 1598 | ||
1599 | /* | 1599 | /* |
1600 | * fork()/clone()-time setup: | 1600 | * fork()/clone()-time setup: |
1601 | */ | 1601 | */ |
1602 | void sched_fork(struct task_struct *p, int clone_flags) | 1602 | void sched_fork(struct task_struct *p, int clone_flags) |
1603 | { | 1603 | { |
1604 | int cpu = get_cpu(); | 1604 | int cpu = get_cpu(); |
1605 | 1605 | ||
1606 | __sched_fork(p); | 1606 | __sched_fork(p); |
1607 | 1607 | ||
1608 | #ifdef CONFIG_SMP | 1608 | #ifdef CONFIG_SMP |
1609 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); | 1609 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); |
1610 | #endif | 1610 | #endif |
1611 | __set_task_cpu(p, cpu); | 1611 | __set_task_cpu(p, cpu); |
1612 | 1612 | ||
1613 | /* | 1613 | /* |
1614 | * Make sure we do not leak PI boosting priority to the child: | 1614 | * Make sure we do not leak PI boosting priority to the child: |
1615 | */ | 1615 | */ |
1616 | p->prio = current->normal_prio; | 1616 | p->prio = current->normal_prio; |
1617 | 1617 | ||
1618 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 1618 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
1619 | if (likely(sched_info_on())) | 1619 | if (likely(sched_info_on())) |
1620 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 1620 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
1621 | #endif | 1621 | #endif |
1622 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 1622 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
1623 | p->oncpu = 0; | 1623 | p->oncpu = 0; |
1624 | #endif | 1624 | #endif |
1625 | #ifdef CONFIG_PREEMPT | 1625 | #ifdef CONFIG_PREEMPT |
1626 | /* Want to start with kernel preemption disabled. */ | 1626 | /* Want to start with kernel preemption disabled. */ |
1627 | task_thread_info(p)->preempt_count = 1; | 1627 | task_thread_info(p)->preempt_count = 1; |
1628 | #endif | 1628 | #endif |
1629 | put_cpu(); | 1629 | put_cpu(); |
1630 | } | 1630 | } |
1631 | 1631 | ||
1632 | /* | 1632 | /* |
1633 | * After fork, the child runs first (the default). If set to 0 then | 1633 | * After fork, the child runs first (the default). If set to 0 then |
1634 | * the parent will (try to) run first. | 1634 | * the parent will (try to) run first. |
1635 | */ | 1635 | */ |
1636 | unsigned int __read_mostly sysctl_sched_child_runs_first = 1; | 1636 | unsigned int __read_mostly sysctl_sched_child_runs_first = 1; |
1637 | 1637 | ||
1638 | /* | 1638 | /* |
1639 | * wake_up_new_task - wake up a newly created task for the first time. | 1639 | * wake_up_new_task - wake up a newly created task for the first time. |
1640 | * | 1640 | * |
1641 | * This function will do some initial scheduler statistics housekeeping | 1641 | * This function will do some initial scheduler statistics housekeeping |
1642 | * that must be done for every newly created context, then puts the task | 1642 | * that must be done for every newly created context, then puts the task |
1643 | * on the runqueue and wakes it. | 1643 | * on the runqueue and wakes it. |
1644 | */ | 1644 | */ |
1645 | void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | 1645 | void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) |
1646 | { | 1646 | { |
1647 | unsigned long flags; | 1647 | unsigned long flags; |
1648 | struct rq *rq; | 1648 | struct rq *rq; |
1649 | int this_cpu; | 1649 | int this_cpu; |
1650 | u64 now; | 1650 | u64 now; |
1651 | 1651 | ||
1652 | rq = task_rq_lock(p, &flags); | 1652 | rq = task_rq_lock(p, &flags); |
1653 | BUG_ON(p->state != TASK_RUNNING); | 1653 | BUG_ON(p->state != TASK_RUNNING); |
1654 | this_cpu = smp_processor_id(); /* parent's CPU */ | 1654 | this_cpu = smp_processor_id(); /* parent's CPU */ |
1655 | now = rq_clock(rq); | 1655 | now = rq_clock(rq); |
1656 | 1656 | ||
1657 | p->prio = effective_prio(p); | 1657 | p->prio = effective_prio(p); |
1658 | 1658 | ||
1659 | if (!p->sched_class->task_new || !sysctl_sched_child_runs_first || | 1659 | if (!p->sched_class->task_new || !sysctl_sched_child_runs_first || |
1660 | (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu || | 1660 | (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu || |
1661 | !current->se.on_rq) { | 1661 | !current->se.on_rq) { |
1662 | 1662 | ||
1663 | activate_task(rq, p, 0); | 1663 | activate_task(rq, p, 0); |
1664 | } else { | 1664 | } else { |
1665 | /* | 1665 | /* |
1666 | * Let the scheduling class do new task startup | 1666 | * Let the scheduling class do new task startup |
1667 | * management (if any): | 1667 | * management (if any): |
1668 | */ | 1668 | */ |
1669 | p->sched_class->task_new(rq, p, now); | 1669 | p->sched_class->task_new(rq, p, now); |
1670 | inc_nr_running(p, rq, now); | 1670 | inc_nr_running(p, rq, now); |
1671 | } | 1671 | } |
1672 | check_preempt_curr(rq, p); | 1672 | check_preempt_curr(rq, p); |
1673 | task_rq_unlock(rq, &flags); | 1673 | task_rq_unlock(rq, &flags); |
1674 | } | 1674 | } |
1675 | 1675 | ||
1676 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 1676 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
1677 | 1677 | ||
1678 | /** | 1678 | /** |
1679 | * preempt_notifier_register - tell me when current is being preempted & rescheduled | 1679 | * preempt_notifier_register - tell me when current is being preempted & rescheduled |
1680 | * @notifier: notifier struct to register | 1680 | * @notifier: notifier struct to register |
1681 | */ | 1681 | */ |
1682 | void preempt_notifier_register(struct preempt_notifier *notifier) | 1682 | void preempt_notifier_register(struct preempt_notifier *notifier) |
1683 | { | 1683 | { |
1684 | hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); | 1684 | hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); |
1685 | } | 1685 | } |
1686 | EXPORT_SYMBOL_GPL(preempt_notifier_register); | 1686 | EXPORT_SYMBOL_GPL(preempt_notifier_register); |
1687 | 1687 | ||
1688 | /** | 1688 | /** |
1689 | * preempt_notifier_unregister - no longer interested in preemption notifications | 1689 | * preempt_notifier_unregister - no longer interested in preemption notifications |
1690 | * @notifier: notifier struct to unregister | 1690 | * @notifier: notifier struct to unregister |
1691 | * | 1691 | * |
1692 | * This is safe to call from within a preemption notifier. | 1692 | * This is safe to call from within a preemption notifier. |
1693 | */ | 1693 | */ |
1694 | void preempt_notifier_unregister(struct preempt_notifier *notifier) | 1694 | void preempt_notifier_unregister(struct preempt_notifier *notifier) |
1695 | { | 1695 | { |
1696 | hlist_del(¬ifier->link); | 1696 | hlist_del(¬ifier->link); |
1697 | } | 1697 | } |
1698 | EXPORT_SYMBOL_GPL(preempt_notifier_unregister); | 1698 | EXPORT_SYMBOL_GPL(preempt_notifier_unregister); |
1699 | 1699 | ||
1700 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) | 1700 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) |
1701 | { | 1701 | { |
1702 | struct preempt_notifier *notifier; | 1702 | struct preempt_notifier *notifier; |
1703 | struct hlist_node *node; | 1703 | struct hlist_node *node; |
1704 | 1704 | ||
1705 | hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) | 1705 | hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) |
1706 | notifier->ops->sched_in(notifier, raw_smp_processor_id()); | 1706 | notifier->ops->sched_in(notifier, raw_smp_processor_id()); |
1707 | } | 1707 | } |
1708 | 1708 | ||
1709 | static void | 1709 | static void |
1710 | fire_sched_out_preempt_notifiers(struct task_struct *curr, | 1710 | fire_sched_out_preempt_notifiers(struct task_struct *curr, |
1711 | struct task_struct *next) | 1711 | struct task_struct *next) |
1712 | { | 1712 | { |
1713 | struct preempt_notifier *notifier; | 1713 | struct preempt_notifier *notifier; |
1714 | struct hlist_node *node; | 1714 | struct hlist_node *node; |
1715 | 1715 | ||
1716 | hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) | 1716 | hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) |
1717 | notifier->ops->sched_out(notifier, next); | 1717 | notifier->ops->sched_out(notifier, next); |
1718 | } | 1718 | } |
1719 | 1719 | ||
1720 | #else | 1720 | #else |
1721 | 1721 | ||
1722 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) | 1722 | static void fire_sched_in_preempt_notifiers(struct task_struct *curr) |
1723 | { | 1723 | { |
1724 | } | 1724 | } |
1725 | 1725 | ||
1726 | static void | 1726 | static void |
1727 | fire_sched_out_preempt_notifiers(struct task_struct *curr, | 1727 | fire_sched_out_preempt_notifiers(struct task_struct *curr, |
1728 | struct task_struct *next) | 1728 | struct task_struct *next) |
1729 | { | 1729 | { |
1730 | } | 1730 | } |
1731 | 1731 | ||
1732 | #endif | 1732 | #endif |
1733 | 1733 | ||
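The hooks above are consumed through a small API: a client fills in a struct preempt_ops with sched_in/sched_out callbacks (their signatures match the calls made by the fire_* helpers) and hangs a notifier off the task that wants the callbacks, i.e. current. A minimal sketch of such a client, assuming the struct preempt_ops and preempt_notifier_init() declarations that accompany these hooks in include/linux/preempt.h; all my_* identifiers are invented:

#include <linux/preempt.h>
#include <linux/sched.h>
#include <linux/kernel.h>

static void my_sched_in(struct preempt_notifier *pn, int cpu)
{
	pr_debug("current was rescheduled in on cpu %d\n", cpu);
}

static void my_sched_out(struct preempt_notifier *pn, struct task_struct *next)
{
	pr_debug("current is being preempted\n");
}

static struct preempt_ops my_preempt_ops = {
	.sched_in  = my_sched_in,
	.sched_out = my_sched_out,
};

static struct preempt_notifier my_notifier;

/* call from the task that wants the notifications (it registers on current) */
static void my_attach(void)
{
	preempt_notifier_init(&my_notifier, &my_preempt_ops);
	preempt_notifier_register(&my_notifier);
}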
1734 | /** | 1734 | /** |
1735 | * prepare_task_switch - prepare to switch tasks | 1735 | * prepare_task_switch - prepare to switch tasks |
1736 | * @rq: the runqueue preparing to switch | 1736 | * @rq: the runqueue preparing to switch |
1737 | * @prev: the current task that is being switched out | 1737 | * @prev: the current task that is being switched out |
1738 | * @next: the task we are going to switch to. | 1738 | * @next: the task we are going to switch to. |
1739 | * | 1739 | * |
1740 | * This is called with the rq lock held and interrupts off. It must | 1740 | * This is called with the rq lock held and interrupts off. It must |
1741 | * be paired with a subsequent finish_task_switch after the context | 1741 | * be paired with a subsequent finish_task_switch after the context |
1742 | * switch. | 1742 | * switch. |
1743 | * | 1743 | * |
1744 | * prepare_task_switch sets up locking and calls architecture specific | 1744 | * prepare_task_switch sets up locking and calls architecture specific |
1745 | * hooks. | 1745 | * hooks. |
1746 | */ | 1746 | */ |
1747 | static inline void | 1747 | static inline void |
1748 | prepare_task_switch(struct rq *rq, struct task_struct *prev, | 1748 | prepare_task_switch(struct rq *rq, struct task_struct *prev, |
1749 | struct task_struct *next) | 1749 | struct task_struct *next) |
1750 | { | 1750 | { |
1751 | fire_sched_out_preempt_notifiers(prev, next); | 1751 | fire_sched_out_preempt_notifiers(prev, next); |
1752 | prepare_lock_switch(rq, next); | 1752 | prepare_lock_switch(rq, next); |
1753 | prepare_arch_switch(next); | 1753 | prepare_arch_switch(next); |
1754 | } | 1754 | } |
1755 | 1755 | ||
1756 | /** | 1756 | /** |
1757 | * finish_task_switch - clean up after a task-switch | 1757 | * finish_task_switch - clean up after a task-switch |
1758 | * @rq: runqueue associated with task-switch | 1758 | * @rq: runqueue associated with task-switch |
1759 | * @prev: the thread we just switched away from. | 1759 | * @prev: the thread we just switched away from. |
1760 | * | 1760 | * |
1761 | * finish_task_switch must be called after the context switch, paired | 1761 | * finish_task_switch must be called after the context switch, paired |
1762 | * with a prepare_task_switch call before the context switch. | 1762 | * with a prepare_task_switch call before the context switch. |
1763 | * finish_task_switch will reconcile locking set up by prepare_task_switch, | 1763 | * finish_task_switch will reconcile locking set up by prepare_task_switch, |
1764 | * and do any other architecture-specific cleanup actions. | 1764 | * and do any other architecture-specific cleanup actions. |
1765 | * | 1765 | * |
1766 | * Note that we may have delayed dropping an mm in context_switch(). If | 1766 | * Note that we may have delayed dropping an mm in context_switch(). If |
1767 | * so, we finish that here outside of the runqueue lock. (Doing it | 1767 | * so, we finish that here outside of the runqueue lock. (Doing it |
1768 | * with the lock held can cause deadlocks; see schedule() for | 1768 | * with the lock held can cause deadlocks; see schedule() for |
1769 | * details.) | 1769 | * details.) |
1770 | */ | 1770 | */ |
1771 | static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) | 1771 | static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) |
1772 | __releases(rq->lock) | 1772 | __releases(rq->lock) |
1773 | { | 1773 | { |
1774 | struct mm_struct *mm = rq->prev_mm; | 1774 | struct mm_struct *mm = rq->prev_mm; |
1775 | long prev_state; | 1775 | long prev_state; |
1776 | 1776 | ||
1777 | rq->prev_mm = NULL; | 1777 | rq->prev_mm = NULL; |
1778 | 1778 | ||
1779 | /* | 1779 | /* |
1780 | * A task struct has one reference for the use as "current". | 1780 | * A task struct has one reference for the use as "current". |
1781 | * If a task dies, then it sets TASK_DEAD in tsk->state and calls | 1781 | * If a task dies, then it sets TASK_DEAD in tsk->state and calls |
1782 | * schedule one last time. The schedule call will never return, and | 1782 | * schedule one last time. The schedule call will never return, and |
1783 | * the scheduled task must drop that reference. | 1783 | * the scheduled task must drop that reference. |
1784 | * The test for TASK_DEAD must occur while the runqueue locks are | 1784 | * The test for TASK_DEAD must occur while the runqueue locks are |
1785 | * still held, otherwise prev could be scheduled on another cpu, die | 1785 | * still held, otherwise prev could be scheduled on another cpu, die |
1786 | * there before we look at prev->state, and then the reference would | 1786 | * there before we look at prev->state, and then the reference would |
1787 | * be dropped twice. | 1787 | * be dropped twice. |
1788 | * Manfred Spraul <manfred@colorfullife.com> | 1788 | * Manfred Spraul <manfred@colorfullife.com> |
1789 | */ | 1789 | */ |
1790 | prev_state = prev->state; | 1790 | prev_state = prev->state; |
1791 | finish_arch_switch(prev); | 1791 | finish_arch_switch(prev); |
1792 | finish_lock_switch(rq, prev); | 1792 | finish_lock_switch(rq, prev); |
1793 | fire_sched_in_preempt_notifiers(current); | 1793 | fire_sched_in_preempt_notifiers(current); |
1794 | if (mm) | 1794 | if (mm) |
1795 | mmdrop(mm); | 1795 | mmdrop(mm); |
1796 | if (unlikely(prev_state == TASK_DEAD)) { | 1796 | if (unlikely(prev_state == TASK_DEAD)) { |
1797 | /* | 1797 | /* |
1798 | * Remove function-return probe instances associated with this | 1798 | * Remove function-return probe instances associated with this |
1799 | * task and put them back on the free list. | 1799 | * task and put them back on the free list. |
1800 | */ | 1800 | */ |
1801 | kprobe_flush_task(prev); | 1801 | kprobe_flush_task(prev); |
1802 | put_task_struct(prev); | 1802 | put_task_struct(prev); |
1803 | } | 1803 | } |
1804 | } | 1804 | } |
1805 | 1805 | ||
1806 | /** | 1806 | /** |
1807 | * schedule_tail - first thing a freshly forked thread must call. | 1807 | * schedule_tail - first thing a freshly forked thread must call. |
1808 | * @prev: the thread we just switched away from. | 1808 | * @prev: the thread we just switched away from. |
1809 | */ | 1809 | */ |
1810 | asmlinkage void schedule_tail(struct task_struct *prev) | 1810 | asmlinkage void schedule_tail(struct task_struct *prev) |
1811 | __releases(rq->lock) | 1811 | __releases(rq->lock) |
1812 | { | 1812 | { |
1813 | struct rq *rq = this_rq(); | 1813 | struct rq *rq = this_rq(); |
1814 | 1814 | ||
1815 | finish_task_switch(rq, prev); | 1815 | finish_task_switch(rq, prev); |
1816 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | 1816 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW |
1817 | /* In this case, finish_task_switch does not reenable preemption */ | 1817 | /* In this case, finish_task_switch does not reenable preemption */ |
1818 | preempt_enable(); | 1818 | preempt_enable(); |
1819 | #endif | 1819 | #endif |
1820 | if (current->set_child_tid) | 1820 | if (current->set_child_tid) |
1821 | put_user(current->pid, current->set_child_tid); | 1821 | put_user(current->pid, current->set_child_tid); |
1822 | } | 1822 | } |
1823 | 1823 | ||
1824 | /* | 1824 | /* |
1825 | * context_switch - switch to the new MM and the new | 1825 | * context_switch - switch to the new MM and the new |
1826 | * thread's register state. | 1826 | * thread's register state. |
1827 | */ | 1827 | */ |
1828 | static inline void | 1828 | static inline void |
1829 | context_switch(struct rq *rq, struct task_struct *prev, | 1829 | context_switch(struct rq *rq, struct task_struct *prev, |
1830 | struct task_struct *next) | 1830 | struct task_struct *next) |
1831 | { | 1831 | { |
1832 | struct mm_struct *mm, *oldmm; | 1832 | struct mm_struct *mm, *oldmm; |
1833 | 1833 | ||
1834 | prepare_task_switch(rq, prev, next); | 1834 | prepare_task_switch(rq, prev, next); |
1835 | mm = next->mm; | 1835 | mm = next->mm; |
1836 | oldmm = prev->active_mm; | 1836 | oldmm = prev->active_mm; |
1837 | /* | 1837 | /* |
1838 | * For paravirt, this is coupled with an exit in switch_to to | 1838 | * For paravirt, this is coupled with an exit in switch_to to |
1839 | * combine the page table reload and the switch backend into | 1839 | * combine the page table reload and the switch backend into |
1840 | * one hypercall. | 1840 | * one hypercall. |
1841 | */ | 1841 | */ |
1842 | arch_enter_lazy_cpu_mode(); | 1842 | arch_enter_lazy_cpu_mode(); |
1843 | 1843 | ||
1844 | if (unlikely(!mm)) { | 1844 | if (unlikely(!mm)) { |
1845 | next->active_mm = oldmm; | 1845 | next->active_mm = oldmm; |
1846 | atomic_inc(&oldmm->mm_count); | 1846 | atomic_inc(&oldmm->mm_count); |
1847 | enter_lazy_tlb(oldmm, next); | 1847 | enter_lazy_tlb(oldmm, next); |
1848 | } else | 1848 | } else |
1849 | switch_mm(oldmm, mm, next); | 1849 | switch_mm(oldmm, mm, next); |
1850 | 1850 | ||
1851 | if (unlikely(!prev->mm)) { | 1851 | if (unlikely(!prev->mm)) { |
1852 | prev->active_mm = NULL; | 1852 | prev->active_mm = NULL; |
1853 | rq->prev_mm = oldmm; | 1853 | rq->prev_mm = oldmm; |
1854 | } | 1854 | } |
1855 | /* | 1855 | /* |
1856 | * The runqueue lock will be released by the next | 1856 | * The runqueue lock will be released by the next |
1857 | * task (which is an invalid locking op but in the case | 1857 | * task (which is an invalid locking op but in the case |
1858 | * of the scheduler it's an obvious special-case), so we | 1858 | * of the scheduler it's an obvious special-case), so we |
1859 | * do an early lockdep release here: | 1859 | * do an early lockdep release here: |
1860 | */ | 1860 | */ |
1861 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | 1861 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW |
1862 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | 1862 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); |
1863 | #endif | 1863 | #endif |
1864 | 1864 | ||
1865 | /* Here we just switch the register state and the stack. */ | 1865 | /* Here we just switch the register state and the stack. */ |
1866 | switch_to(prev, next, prev); | 1866 | switch_to(prev, next, prev); |
1867 | 1867 | ||
1868 | barrier(); | 1868 | barrier(); |
1869 | /* | 1869 | /* |
1870 | * this_rq must be evaluated again because prev may have moved | 1870 | * this_rq must be evaluated again because prev may have moved |
1871 | * CPUs since it called schedule(), thus the 'rq' on its stack | 1871 | * CPUs since it called schedule(), thus the 'rq' on its stack |
1872 | * frame will be invalid. | 1872 | * frame will be invalid. |
1873 | */ | 1873 | */ |
1874 | finish_task_switch(this_rq(), prev); | 1874 | finish_task_switch(this_rq(), prev); |
1875 | } | 1875 | } |
1876 | 1876 | ||
1877 | /* | 1877 | /* |
1878 | * nr_running, nr_uninterruptible and nr_context_switches: | 1878 | * nr_running, nr_uninterruptible and nr_context_switches: |
1879 | * | 1879 | * |
1880 | * externally visible scheduler statistics: current number of runnable | 1880 | * externally visible scheduler statistics: current number of runnable |
1881 | * threads, current number of uninterruptible-sleeping threads, total | 1881 | * threads, current number of uninterruptible-sleeping threads, total |
1882 | * number of context switches performed since bootup. | 1882 | * number of context switches performed since bootup. |
1883 | */ | 1883 | */ |
1884 | unsigned long nr_running(void) | 1884 | unsigned long nr_running(void) |
1885 | { | 1885 | { |
1886 | unsigned long i, sum = 0; | 1886 | unsigned long i, sum = 0; |
1887 | 1887 | ||
1888 | for_each_online_cpu(i) | 1888 | for_each_online_cpu(i) |
1889 | sum += cpu_rq(i)->nr_running; | 1889 | sum += cpu_rq(i)->nr_running; |
1890 | 1890 | ||
1891 | return sum; | 1891 | return sum; |
1892 | } | 1892 | } |
1893 | 1893 | ||
1894 | unsigned long nr_uninterruptible(void) | 1894 | unsigned long nr_uninterruptible(void) |
1895 | { | 1895 | { |
1896 | unsigned long i, sum = 0; | 1896 | unsigned long i, sum = 0; |
1897 | 1897 | ||
1898 | for_each_possible_cpu(i) | 1898 | for_each_possible_cpu(i) |
1899 | sum += cpu_rq(i)->nr_uninterruptible; | 1899 | sum += cpu_rq(i)->nr_uninterruptible; |
1900 | 1900 | ||
1901 | /* | 1901 | /* |
1902 | * Since we read the counters lockless, it might be slightly | 1902 | * Since we read the counters lockless, it might be slightly |
1903 | * inaccurate. Do not allow it to go below zero though: | 1903 | * inaccurate. Do not allow it to go below zero though: |
1904 | */ | 1904 | */ |
1905 | if (unlikely((long)sum < 0)) | 1905 | if (unlikely((long)sum < 0)) |
1906 | sum = 0; | 1906 | sum = 0; |
1907 | 1907 | ||
1908 | return sum; | 1908 | return sum; |
1909 | } | 1909 | } |
1910 | 1910 | ||
1911 | unsigned long long nr_context_switches(void) | 1911 | unsigned long long nr_context_switches(void) |
1912 | { | 1912 | { |
1913 | int i; | 1913 | int i; |
1914 | unsigned long long sum = 0; | 1914 | unsigned long long sum = 0; |
1915 | 1915 | ||
1916 | for_each_possible_cpu(i) | 1916 | for_each_possible_cpu(i) |
1917 | sum += cpu_rq(i)->nr_switches; | 1917 | sum += cpu_rq(i)->nr_switches; |
1918 | 1918 | ||
1919 | return sum; | 1919 | return sum; |
1920 | } | 1920 | } |
1921 | 1921 | ||
1922 | unsigned long nr_iowait(void) | 1922 | unsigned long nr_iowait(void) |
1923 | { | 1923 | { |
1924 | unsigned long i, sum = 0; | 1924 | unsigned long i, sum = 0; |
1925 | 1925 | ||
1926 | for_each_possible_cpu(i) | 1926 | for_each_possible_cpu(i) |
1927 | sum += atomic_read(&cpu_rq(i)->nr_iowait); | 1927 | sum += atomic_read(&cpu_rq(i)->nr_iowait); |
1928 | 1928 | ||
1929 | return sum; | 1929 | return sum; |
1930 | } | 1930 | } |
1931 | 1931 | ||
1932 | unsigned long nr_active(void) | 1932 | unsigned long nr_active(void) |
1933 | { | 1933 | { |
1934 | unsigned long i, running = 0, uninterruptible = 0; | 1934 | unsigned long i, running = 0, uninterruptible = 0; |
1935 | 1935 | ||
1936 | for_each_online_cpu(i) { | 1936 | for_each_online_cpu(i) { |
1937 | running += cpu_rq(i)->nr_running; | 1937 | running += cpu_rq(i)->nr_running; |
1938 | uninterruptible += cpu_rq(i)->nr_uninterruptible; | 1938 | uninterruptible += cpu_rq(i)->nr_uninterruptible; |
1939 | } | 1939 | } |
1940 | 1940 | ||
1941 | if (unlikely((long)uninterruptible < 0)) | 1941 | if (unlikely((long)uninterruptible < 0)) |
1942 | uninterruptible = 0; | 1942 | uninterruptible = 0; |
1943 | 1943 | ||
1944 | return running + uninterruptible; | 1944 | return running + uninterruptible; |
1945 | } | 1945 | } |
1946 | 1946 | ||
1947 | /* | 1947 | /* |
1948 | * Update rq->cpu_load[] statistics. This function is usually called every | 1948 | * Update rq->cpu_load[] statistics. This function is usually called every |
1949 | * scheduler tick (TICK_NSEC). | 1949 | * scheduler tick (TICK_NSEC). |
1950 | */ | 1950 | */ |
1951 | static void update_cpu_load(struct rq *this_rq) | 1951 | static void update_cpu_load(struct rq *this_rq) |
1952 | { | 1952 | { |
1953 | u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64; | 1953 | u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64; |
1954 | unsigned long total_load = this_rq->ls.load.weight; | 1954 | unsigned long total_load = this_rq->ls.load.weight; |
1955 | unsigned long this_load = total_load; | 1955 | unsigned long this_load = total_load; |
1956 | struct load_stat *ls = &this_rq->ls; | 1956 | struct load_stat *ls = &this_rq->ls; |
1957 | u64 now = __rq_clock(this_rq); | 1957 | u64 now = __rq_clock(this_rq); |
1958 | int i, scale; | 1958 | int i, scale; |
1959 | 1959 | ||
1960 | this_rq->nr_load_updates++; | 1960 | this_rq->nr_load_updates++; |
1961 | if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD))) | 1961 | if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD))) |
1962 | goto do_avg; | 1962 | goto do_avg; |
1963 | 1963 | ||
1964 | /* Update delta_fair/delta_exec fields first */ | 1964 | /* Update delta_fair/delta_exec fields first */ |
1965 | update_curr_load(this_rq, now); | 1965 | update_curr_load(this_rq, now); |
1966 | 1966 | ||
1967 | fair_delta64 = ls->delta_fair + 1; | 1967 | fair_delta64 = ls->delta_fair + 1; |
1968 | ls->delta_fair = 0; | 1968 | ls->delta_fair = 0; |
1969 | 1969 | ||
1970 | exec_delta64 = ls->delta_exec + 1; | 1970 | exec_delta64 = ls->delta_exec + 1; |
1971 | ls->delta_exec = 0; | 1971 | ls->delta_exec = 0; |
1972 | 1972 | ||
1973 | sample_interval64 = now - ls->load_update_last; | 1973 | sample_interval64 = now - ls->load_update_last; |
1974 | ls->load_update_last = now; | 1974 | ls->load_update_last = now; |
1975 | 1975 | ||
1976 | if ((s64)sample_interval64 < (s64)TICK_NSEC) | 1976 | if ((s64)sample_interval64 < (s64)TICK_NSEC) |
1977 | sample_interval64 = TICK_NSEC; | 1977 | sample_interval64 = TICK_NSEC; |
1978 | 1978 | ||
1979 | if (exec_delta64 > sample_interval64) | 1979 | if (exec_delta64 > sample_interval64) |
1980 | exec_delta64 = sample_interval64; | 1980 | exec_delta64 = sample_interval64; |
1981 | 1981 | ||
1982 | idle_delta64 = sample_interval64 - exec_delta64; | 1982 | idle_delta64 = sample_interval64 - exec_delta64; |
1983 | 1983 | ||
1984 | tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64); | 1984 | tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64); |
1985 | tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64); | 1985 | tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64); |
1986 | 1986 | ||
1987 | this_load = (unsigned long)tmp64; | 1987 | this_load = (unsigned long)tmp64; |
1988 | 1988 | ||
1989 | do_avg: | 1989 | do_avg: |
1990 | 1990 | ||
1991 | /* Update our load: */ | 1991 | /* Update our load: */ |
1992 | for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | 1992 | for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { |
1993 | unsigned long old_load, new_load; | 1993 | unsigned long old_load, new_load; |
1994 | 1994 | ||
1995 | /* scale is effectively 1 << i now, and >> i divides by scale */ | 1995 | /* scale is effectively 1 << i now, and >> i divides by scale */ |
1996 | 1996 | ||
1997 | old_load = this_rq->cpu_load[i]; | 1997 | old_load = this_rq->cpu_load[i]; |
1998 | new_load = this_load; | 1998 | new_load = this_load; |
1999 | 1999 | ||
2000 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; | 2000 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; |
2001 | } | 2001 | } |
2002 | } | 2002 | } |
2003 | 2003 | ||
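The per-index update at the end of update_cpu_load() is a family of exponentially decaying averages: with scale == 1 << i, the new value is (old*(scale-1) + sample) / scale, so cpu_load[0] tracks the instantaneous load and each higher index reacts more slowly. A standalone illustration of that decay (the 2048 load sample is invented):

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

int main(void)
{
	unsigned long cpu_load[CPU_LOAD_IDX_MAX] = { 0 };
	unsigned long this_load = 2048;		/* pretend per-tick load sample */
	int tick, i, scale;

	for (tick = 0; tick < 3; tick++) {
		for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
			unsigned long old_load = cpu_load[i];

			/* scale == 1 << i, so >> i divides by scale */
			cpu_load[i] = (old_load * (scale - 1) + this_load) >> i;
		}
		printf("tick %d: %lu %lu %lu %lu %lu\n", tick,
		       cpu_load[0], cpu_load[1], cpu_load[2],
		       cpu_load[3], cpu_load[4]);
	}
	return 0;
}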
2004 | #ifdef CONFIG_SMP | 2004 | #ifdef CONFIG_SMP |
2005 | 2005 | ||
2006 | /* | 2006 | /* |
2007 | * double_rq_lock - safely lock two runqueues | 2007 | * double_rq_lock - safely lock two runqueues |
2008 | * | 2008 | * |
2009 | * Note this does not disable interrupts like task_rq_lock, | 2009 | * Note this does not disable interrupts like task_rq_lock, |
2010 | * you need to do so manually before calling. | 2010 | * you need to do so manually before calling. |
2011 | */ | 2011 | */ |
2012 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | 2012 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) |
2013 | __acquires(rq1->lock) | 2013 | __acquires(rq1->lock) |
2014 | __acquires(rq2->lock) | 2014 | __acquires(rq2->lock) |
2015 | { | 2015 | { |
2016 | BUG_ON(!irqs_disabled()); | 2016 | BUG_ON(!irqs_disabled()); |
2017 | if (rq1 == rq2) { | 2017 | if (rq1 == rq2) { |
2018 | spin_lock(&rq1->lock); | 2018 | spin_lock(&rq1->lock); |
2019 | __acquire(rq2->lock); /* Fake it out ;) */ | 2019 | __acquire(rq2->lock); /* Fake it out ;) */ |
2020 | } else { | 2020 | } else { |
2021 | if (rq1 < rq2) { | 2021 | if (rq1 < rq2) { |
2022 | spin_lock(&rq1->lock); | 2022 | spin_lock(&rq1->lock); |
2023 | spin_lock(&rq2->lock); | 2023 | spin_lock(&rq2->lock); |
2024 | } else { | 2024 | } else { |
2025 | spin_lock(&rq2->lock); | 2025 | spin_lock(&rq2->lock); |
2026 | spin_lock(&rq1->lock); | 2026 | spin_lock(&rq1->lock); |
2027 | } | 2027 | } |
2028 | } | 2028 | } |
2029 | } | 2029 | } |
2030 | 2030 | ||
2031 | /* | 2031 | /* |
2032 | * double_rq_unlock - safely unlock two runqueues | 2032 | * double_rq_unlock - safely unlock two runqueues |
2033 | * | 2033 | * |
2034 | * Note this does not restore interrupts like task_rq_unlock, | 2034 | * Note this does not restore interrupts like task_rq_unlock, |
2035 | * you need to do so manually after calling. | 2035 | * you need to do so manually after calling. |
2036 | */ | 2036 | */ |
2037 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | 2037 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) |
2038 | __releases(rq1->lock) | 2038 | __releases(rq1->lock) |
2039 | __releases(rq2->lock) | 2039 | __releases(rq2->lock) |
2040 | { | 2040 | { |
2041 | spin_unlock(&rq1->lock); | 2041 | spin_unlock(&rq1->lock); |
2042 | if (rq1 != rq2) | 2042 | if (rq1 != rq2) |
2043 | spin_unlock(&rq2->lock); | 2043 | spin_unlock(&rq2->lock); |
2044 | else | 2044 | else |
2045 | __release(rq2->lock); | 2045 | __release(rq2->lock); |
2046 | } | 2046 | } |
2047 | 2047 | ||
2048 | /* | 2048 | /* |
2049 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | 2049 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. |
2050 | */ | 2050 | */ |
2051 | static void double_lock_balance(struct rq *this_rq, struct rq *busiest) | 2051 | static void double_lock_balance(struct rq *this_rq, struct rq *busiest) |
2052 | __releases(this_rq->lock) | 2052 | __releases(this_rq->lock) |
2053 | __acquires(busiest->lock) | 2053 | __acquires(busiest->lock) |
2054 | __acquires(this_rq->lock) | 2054 | __acquires(this_rq->lock) |
2055 | { | 2055 | { |
2056 | if (unlikely(!irqs_disabled())) { | 2056 | if (unlikely(!irqs_disabled())) { |
2057 | /* printk() doesn't work well under rq->lock */ | 2057 | /* printk() doesn't work well under rq->lock */ |
2058 | spin_unlock(&this_rq->lock); | 2058 | spin_unlock(&this_rq->lock); |
2059 | BUG_ON(1); | 2059 | BUG_ON(1); |
2060 | } | 2060 | } |
2061 | if (unlikely(!spin_trylock(&busiest->lock))) { | 2061 | if (unlikely(!spin_trylock(&busiest->lock))) { |
2062 | if (busiest < this_rq) { | 2062 | if (busiest < this_rq) { |
2063 | spin_unlock(&this_rq->lock); | 2063 | spin_unlock(&this_rq->lock); |
2064 | spin_lock(&busiest->lock); | 2064 | spin_lock(&busiest->lock); |
2065 | spin_lock(&this_rq->lock); | 2065 | spin_lock(&this_rq->lock); |
2066 | } else | 2066 | } else |
2067 | spin_lock(&busiest->lock); | 2067 | spin_lock(&busiest->lock); |
2068 | } | 2068 | } |
2069 | } | 2069 | } |
2070 | 2070 | ||
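Both double_rq_lock() and the trylock fallback in double_lock_balance() rely on the same rule: when two runqueue locks must be held, take the lower-addressed one first, so two CPUs locking the same pair in opposite argument order can never end up each holding one lock and waiting for the other. A user-space analogy of that ordering rule, using plain pthread mutexes rather than kernel spinlocks:

#include <pthread.h>

/* lock two mutexes in a globally consistent (address) order */
static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);		/* same lock: take it once */
	} else if (a < b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

Whichever caller reaches the lower-addressed lock first wins; the other blocks there instead of holding its own lock while waiting, which is exactly what the address comparison in the kernel code above achieves.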
2071 | /* | 2071 | /* |
2072 | * If dest_cpu is allowed for this process, migrate the task to it. | 2072 | * If dest_cpu is allowed for this process, migrate the task to it. |
2073 | * This is accomplished by forcing the cpu_allowed mask to only | 2073 | * This is accomplished by forcing the cpu_allowed mask to only |
2074 | * allow dest_cpu, which will force the cpu onto dest_cpu. Then | 2074 | * allow dest_cpu, which will force the cpu onto dest_cpu. Then |
2075 | * the cpu_allowed mask is restored. | 2075 | * the cpu_allowed mask is restored. |
2076 | */ | 2076 | */ |
2077 | static void sched_migrate_task(struct task_struct *p, int dest_cpu) | 2077 | static void sched_migrate_task(struct task_struct *p, int dest_cpu) |
2078 | { | 2078 | { |
2079 | struct migration_req req; | 2079 | struct migration_req req; |
2080 | unsigned long flags; | 2080 | unsigned long flags; |
2081 | struct rq *rq; | 2081 | struct rq *rq; |
2082 | 2082 | ||
2083 | rq = task_rq_lock(p, &flags); | 2083 | rq = task_rq_lock(p, &flags); |
2084 | if (!cpu_isset(dest_cpu, p->cpus_allowed) | 2084 | if (!cpu_isset(dest_cpu, p->cpus_allowed) |
2085 | || unlikely(cpu_is_offline(dest_cpu))) | 2085 | || unlikely(cpu_is_offline(dest_cpu))) |
2086 | goto out; | 2086 | goto out; |
2087 | 2087 | ||
2088 | /* force the process onto the specified CPU */ | 2088 | /* force the process onto the specified CPU */ |
2089 | if (migrate_task(p, dest_cpu, &req)) { | 2089 | if (migrate_task(p, dest_cpu, &req)) { |
2090 | /* Need to wait for migration thread (might exit: take ref). */ | 2090 | /* Need to wait for migration thread (might exit: take ref). */ |
2091 | struct task_struct *mt = rq->migration_thread; | 2091 | struct task_struct *mt = rq->migration_thread; |
2092 | 2092 | ||
2093 | get_task_struct(mt); | 2093 | get_task_struct(mt); |
2094 | task_rq_unlock(rq, &flags); | 2094 | task_rq_unlock(rq, &flags); |
2095 | wake_up_process(mt); | 2095 | wake_up_process(mt); |
2096 | put_task_struct(mt); | 2096 | put_task_struct(mt); |
2097 | wait_for_completion(&req.done); | 2097 | wait_for_completion(&req.done); |
2098 | 2098 | ||
2099 | return; | 2099 | return; |
2100 | } | 2100 | } |
2101 | out: | 2101 | out: |
2102 | task_rq_unlock(rq, &flags); | 2102 | task_rq_unlock(rq, &flags); |
2103 | } | 2103 | } |
2104 | 2104 | ||
2105 | /* | 2105 | /* |
2106 | * sched_exec - execve() is a valuable balancing opportunity, because at | 2106 | * sched_exec - execve() is a valuable balancing opportunity, because at |
2107 | * this point the task has the smallest effective memory and cache footprint. | 2107 | * this point the task has the smallest effective memory and cache footprint. |
2108 | */ | 2108 | */ |
2109 | void sched_exec(void) | 2109 | void sched_exec(void) |
2110 | { | 2110 | { |
2111 | int new_cpu, this_cpu = get_cpu(); | 2111 | int new_cpu, this_cpu = get_cpu(); |
2112 | new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); | 2112 | new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); |
2113 | put_cpu(); | 2113 | put_cpu(); |
2114 | if (new_cpu != this_cpu) | 2114 | if (new_cpu != this_cpu) |
2115 | sched_migrate_task(current, new_cpu); | 2115 | sched_migrate_task(current, new_cpu); |
2116 | } | 2116 | } |
2117 | 2117 | ||
2118 | /* | 2118 | /* |
2119 | * pull_task - move a task from a remote runqueue to the local runqueue. | 2119 | * pull_task - move a task from a remote runqueue to the local runqueue. |
2120 | * Both runqueues must be locked. | 2120 | * Both runqueues must be locked. |
2121 | */ | 2121 | */ |
2122 | static void pull_task(struct rq *src_rq, struct task_struct *p, | 2122 | static void pull_task(struct rq *src_rq, struct task_struct *p, |
2123 | struct rq *this_rq, int this_cpu) | 2123 | struct rq *this_rq, int this_cpu) |
2124 | { | 2124 | { |
2125 | deactivate_task(src_rq, p, 0); | 2125 | deactivate_task(src_rq, p, 0); |
2126 | set_task_cpu(p, this_cpu); | 2126 | set_task_cpu(p, this_cpu); |
2127 | activate_task(this_rq, p, 0); | 2127 | activate_task(this_rq, p, 0); |
2128 | /* | 2128 | /* |
2129 | * Note that idle threads have a prio of MAX_PRIO, for this test | 2129 | * Note that idle threads have a prio of MAX_PRIO, for this test |
2130 | * to be always true for them. | 2130 | * to be always true for them. |
2131 | */ | 2131 | */ |
2132 | check_preempt_curr(this_rq, p); | 2132 | check_preempt_curr(this_rq, p); |
2133 | } | 2133 | } |
2134 | 2134 | ||
2135 | /* | 2135 | /* |
2136 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | 2136 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? |
2137 | */ | 2137 | */ |
2138 | static | 2138 | static |
2139 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | 2139 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, |
2140 | struct sched_domain *sd, enum cpu_idle_type idle, | 2140 | struct sched_domain *sd, enum cpu_idle_type idle, |
2141 | int *all_pinned) | 2141 | int *all_pinned) |
2142 | { | 2142 | { |
2143 | /* | 2143 | /* |
2144 | * We do not migrate tasks that are: | 2144 | * We do not migrate tasks that are: |
2145 | * 1) running (obviously), or | 2145 | * 1) running (obviously), or |
2146 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | 2146 | * 2) cannot be migrated to this CPU due to cpus_allowed, or |
2147 | * 3) are cache-hot on their current CPU. | 2147 | * 3) are cache-hot on their current CPU. |
2148 | */ | 2148 | */ |
2149 | if (!cpu_isset(this_cpu, p->cpus_allowed)) | 2149 | if (!cpu_isset(this_cpu, p->cpus_allowed)) |
2150 | return 0; | 2150 | return 0; |
2151 | *all_pinned = 0; | 2151 | *all_pinned = 0; |
2152 | 2152 | ||
2153 | if (task_running(rq, p)) | 2153 | if (task_running(rq, p)) |
2154 | return 0; | 2154 | return 0; |
2155 | 2155 | ||
2156 | /* | 2156 | /* |
2157 | * Aggressive migration if too many balance attempts have failed: | 2157 | * Aggressive migration if too many balance attempts have failed: |
2158 | */ | 2158 | */ |
2159 | if (sd->nr_balance_failed > sd->cache_nice_tries) | 2159 | if (sd->nr_balance_failed > sd->cache_nice_tries) |
2160 | return 1; | 2160 | return 1; |
2161 | 2161 | ||
2162 | return 1; | 2162 | return 1; |
2163 | } | 2163 | } |
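
The filter above boils down to three questions: is the destination allowed, is the task currently running, and is it worth disturbing a cache-hot task. The fragment below is a deliberately simplified user-space model of that decision, not the kernel code; the struct, the bitmask and the cache_hot flag are invented for illustration (the real function answers from cpus_allowed, task_running() and the domain's failure counters).

#include <stdbool.h>

/* Simplified stand-ins, for illustration only. */
struct toy_task {
	unsigned long allowed_mask;	/* bit i set => may run on CPU i */
	bool running;			/* currently executing on a CPU */
	bool cache_hot;			/* cache still warm on its CPU */
};

static bool toy_can_migrate(const struct toy_task *p, int dest_cpu,
			    unsigned int nr_balance_failed,
			    unsigned int cache_nice_tries)
{
	if (!(p->allowed_mask & (1UL << dest_cpu)))
		return false;		/* pinned away from dest_cpu */
	if (p->running)
		return false;		/* never pull the task that is running */
	/* be gentle with cache-hot tasks unless balancing keeps failing */
	if (p->cache_hot && nr_balance_failed <= cache_nice_tries)
		return false;
	return true;
}
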
2164 | 2164 | ||
2165 | static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2165 | static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2166 | unsigned long max_nr_move, unsigned long max_load_move, | 2166 | unsigned long max_nr_move, unsigned long max_load_move, |
2167 | struct sched_domain *sd, enum cpu_idle_type idle, | 2167 | struct sched_domain *sd, enum cpu_idle_type idle, |
2168 | int *all_pinned, unsigned long *load_moved, | 2168 | int *all_pinned, unsigned long *load_moved, |
2169 | int this_best_prio, int best_prio, int best_prio_seen, | 2169 | int this_best_prio, int best_prio, int best_prio_seen, |
2170 | struct rq_iterator *iterator) | 2170 | struct rq_iterator *iterator) |
2171 | { | 2171 | { |
2172 | int pulled = 0, pinned = 0, skip_for_load; | 2172 | int pulled = 0, pinned = 0, skip_for_load; |
2173 | struct task_struct *p; | 2173 | struct task_struct *p; |
2174 | long rem_load_move = max_load_move; | 2174 | long rem_load_move = max_load_move; |
2175 | 2175 | ||
2176 | if (max_nr_move == 0 || max_load_move == 0) | 2176 | if (max_nr_move == 0 || max_load_move == 0) |
2177 | goto out; | 2177 | goto out; |
2178 | 2178 | ||
2179 | pinned = 1; | 2179 | pinned = 1; |
2180 | 2180 | ||
2181 | /* | 2181 | /* |
2182 | * Start the load-balancing iterator: | 2182 | * Start the load-balancing iterator: |
2183 | */ | 2183 | */ |
2184 | p = iterator->start(iterator->arg); | 2184 | p = iterator->start(iterator->arg); |
2185 | next: | 2185 | next: |
2186 | if (!p) | 2186 | if (!p) |
2187 | goto out; | 2187 | goto out; |
2188 | /* | 2188 | /* |
2189 | * To help distribute high priority tasks across CPUs we don't | 2189 | * To help distribute high priority tasks across CPUs we don't |
2190 | * skip a task if it will be the highest priority task (i.e. smallest | 2190 | * skip a task if it will be the highest priority task (i.e. smallest |
2191 | * prio value) on its new queue regardless of its load weight | 2191 | * prio value) on its new queue regardless of its load weight |
2192 | */ | 2192 | */ |
2193 | skip_for_load = (p->se.load.weight >> 1) > rem_load_move + | 2193 | skip_for_load = (p->se.load.weight >> 1) > rem_load_move + |
2194 | SCHED_LOAD_SCALE_FUZZ; | 2194 | SCHED_LOAD_SCALE_FUZZ; |
2195 | if (skip_for_load && p->prio < this_best_prio) | 2195 | if (skip_for_load && p->prio < this_best_prio) |
2196 | skip_for_load = !best_prio_seen && p->prio == best_prio; | 2196 | skip_for_load = !best_prio_seen && p->prio == best_prio; |
2197 | if (skip_for_load || | 2197 | if (skip_for_load || |
2198 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { | 2198 | !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { |
2199 | 2199 | ||
2200 | best_prio_seen |= p->prio == best_prio; | 2200 | best_prio_seen |= p->prio == best_prio; |
2201 | p = iterator->next(iterator->arg); | 2201 | p = iterator->next(iterator->arg); |
2202 | goto next; | 2202 | goto next; |
2203 | } | 2203 | } |
2204 | 2204 | ||
2205 | pull_task(busiest, p, this_rq, this_cpu); | 2205 | pull_task(busiest, p, this_rq, this_cpu); |
2206 | pulled++; | 2206 | pulled++; |
2207 | rem_load_move -= p->se.load.weight; | 2207 | rem_load_move -= p->se.load.weight; |
2208 | 2208 | ||
2209 | /* | 2209 | /* |
2210 | * We only want to steal up to the prescribed number of tasks | 2210 | * We only want to steal up to the prescribed number of tasks |
2211 | * and the prescribed amount of weighted load. | 2211 | * and the prescribed amount of weighted load. |
2212 | */ | 2212 | */ |
2213 | if (pulled < max_nr_move && rem_load_move > 0) { | 2213 | if (pulled < max_nr_move && rem_load_move > 0) { |
2214 | if (p->prio < this_best_prio) | 2214 | if (p->prio < this_best_prio) |
2215 | this_best_prio = p->prio; | 2215 | this_best_prio = p->prio; |
2216 | p = iterator->next(iterator->arg); | 2216 | p = iterator->next(iterator->arg); |
2217 | goto next; | 2217 | goto next; |
2218 | } | 2218 | } |
2219 | out: | 2219 | out: |
2220 | /* | 2220 | /* |
2221 | * Right now, this is the only place pull_task() is called, | 2221 | * Right now, this is the only place pull_task() is called, |
2222 | * so we can safely collect pull_task() stats here rather than | 2222 | * so we can safely collect pull_task() stats here rather than |
2223 | * inside pull_task(). | 2223 | * inside pull_task(). |
2224 | */ | 2224 | */ |
2225 | schedstat_add(sd, lb_gained[idle], pulled); | 2225 | schedstat_add(sd, lb_gained[idle], pulled); |
2226 | 2226 | ||
2227 | if (all_pinned) | 2227 | if (all_pinned) |
2228 | *all_pinned = pinned; | 2228 | *all_pinned = pinned; |
2229 | *load_moved = max_load_move - rem_load_move; | 2229 | *load_moved = max_load_move - rem_load_move; |
2230 | return pulled; | 2230 | return pulled; |
2231 | } | 2231 | } |
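
Outside the kernel the pulling loop above can be modelled with a plain array in place of the rq_iterator. The sketch below keeps the two budgets, a task count and a weighted-load budget, and the rule that an over-heavy task is skipped unless it would become the highest-priority task on the destination queue; the types are invented and the priority bookkeeping is heavily simplified.

#include <stdio.h>

struct toy_task {
	unsigned long weight;	/* load weight, e.g. 1024 for nice 0 */
	int prio;		/* lower value == higher priority */
};

/* Returns the weighted load actually pulled. Illustration only. */
static long toy_balance_tasks(const struct toy_task *t, int n,
			      unsigned long max_nr_move, long max_load_move,
			      int this_best_prio)
{
	long rem_load = max_load_move;
	unsigned long pulled = 0;

	for (int i = 0; i < n && pulled < max_nr_move && rem_load > 0; i++) {
		int too_heavy = (long)(t[i].weight >> 1) > rem_load;

		/* never skip a task that would be top priority on the new queue */
		if (too_heavy && t[i].prio >= this_best_prio)
			continue;

		rem_load -= t[i].weight;
		pulled++;
		if (t[i].prio < this_best_prio)
			this_best_prio = t[i].prio;
	}
	return max_load_move - rem_load;
}

int main(void)
{
	/* the 3072-weight task is skipped: it overshoots the remaining
	 * budget and prio 110 would not beat the current best of 105 */
	struct toy_task busiest[] = { {1024, 120}, {3072, 110}, {1024, 130} };

	printf("moved %ld of 2048\n", toy_balance_tasks(busiest, 3, 8, 2048, 105));
	return 0;
}
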
2232 | 2232 | ||
2233 | /* | 2233 | /* |
2234 | * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted | 2234 | * move_tasks tries to move up to max_load_move weighted load from busiest to |
2235 | * load from busiest to this_rq, as part of a balancing operation within | 2235 | * this_rq, as part of a balancing operation within domain "sd". |
2236 | * "domain". Returns the number of tasks moved. | 2236 | * Returns 1 if successful and 0 otherwise. |
2237 | * | 2237 | * |
2238 | * Called with both runqueues locked. | 2238 | * Called with both runqueues locked. |
2239 | */ | 2239 | */ |
2240 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2240 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2241 | unsigned long max_nr_move, unsigned long max_load_move, | 2241 | unsigned long max_load_move, |
2242 | struct sched_domain *sd, enum cpu_idle_type idle, | 2242 | struct sched_domain *sd, enum cpu_idle_type idle, |
2243 | int *all_pinned) | 2243 | int *all_pinned) |
2244 | { | 2244 | { |
2245 | struct sched_class *class = sched_class_highest; | 2245 | struct sched_class *class = sched_class_highest; |
2246 | unsigned long load_moved, total_nr_moved = 0, nr_moved; | 2246 | unsigned long total_load_moved = 0; |
2247 | long rem_load_move = max_load_move; | ||
2248 | 2247 | ||
2249 | do { | 2248 | do { |
2250 | nr_moved = class->load_balance(this_rq, this_cpu, busiest, | 2249 | total_load_moved += |
2251 | max_nr_move, (unsigned long)rem_load_move, | 2250 | class->load_balance(this_rq, this_cpu, busiest, |
2252 | sd, idle, all_pinned, &load_moved); | 2251 | ULONG_MAX, max_load_move - total_load_moved, |
2253 | total_nr_moved += nr_moved; | 2252 | sd, idle, all_pinned); |
2254 | max_nr_move -= nr_moved; | ||
2255 | rem_load_move -= load_moved; | ||
2256 | class = class->next; | 2253 | class = class->next; |
2257 | } while (class && max_nr_move && rem_load_move > 0); | 2254 | } while (class && max_load_move > total_load_moved); |
2258 | 2255 | ||
2259 | return total_nr_moved; | 2256 | return total_load_moved > 0; |
2260 | } | 2257 | } |
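
With this patch move_tasks() becomes a single-purpose loop: keep asking each scheduling class for load until the requested amount has been met, then report success or failure. A compilable caricature of that control flow, with an invented toy_class standing in for struct sched_class:

/* Caricature of the new move_tasks() loop; toy_class is invented. */
struct toy_class {
	unsigned long (*load_balance)(unsigned long max_load);	/* returns load moved */
	struct toy_class *next;
};

static int toy_move_tasks(struct toy_class *highest, unsigned long max_load_move)
{
	unsigned long total_load_moved = 0;
	struct toy_class *class = highest;

	do {
		total_load_moved +=
			class->load_balance(max_load_move - total_load_moved);
		class = class->next;
	} while (class && max_load_move > total_load_moved);

	return total_load_moved > 0;	/* 1: some load moved, 0: nothing moved */
}

The callers below only ever test this result, which is why the nr_moved counters in load_balance() and load_balance_newidle() can be renamed to a plain ld_moved flag.
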
2261 | 2258 | ||
2262 | /* | 2259 | /* |
2260 | * move_one_task tries to move exactly one task from busiest to this_rq, as | ||
2261 | * part of active balancing operations within "domain". | ||
2262 | * Returns 1 if successful and 0 otherwise. | ||
2263 | * | ||
2264 | * Called with both runqueues locked. | ||
2265 | */ | ||
2266 | static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | ||
2267 | struct sched_domain *sd, enum cpu_idle_type idle) | ||
2268 | { | ||
2269 | struct sched_class *class; | ||
2270 | |||
2271 | for (class = sched_class_highest; class; class = class->next) | ||
2272 | if (class->load_balance(this_rq, this_cpu, busiest, | ||
2273 | 1, ULONG_MAX, sd, idle, NULL)) | ||
2274 | return 1; | ||
2275 | |||
2276 | return 0; | ||
2277 | } | ||
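
move_one_task() is the other half of the split: the same walk over the class list, but with an early exit as soon as any class manages to move a single task. Schematically, again with an invented stand-in for the class list (the real call is class->load_balance(..., 1, ULONG_MAX, ...)):

/* Sketch of the move_one_task() walk; toy_sched_class is invented. */
struct toy_sched_class {
	int (*move_one)(void);		/* stand-in for a "move at most one task" request */
	struct toy_sched_class *next;
};

static int toy_move_one_task(struct toy_sched_class *highest)
{
	for (struct toy_sched_class *class = highest; class; class = class->next)
		if (class->move_one())
			return 1;

	return 0;
}
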
2278 | |||
2279 | /* | ||
2263 | * find_busiest_group finds and returns the busiest CPU group within the | 2280 | * find_busiest_group finds and returns the busiest CPU group within the |
2264 | * domain. It calculates and returns the amount of weighted load which | 2281 | * domain. It calculates and returns the amount of weighted load which |
2265 | * should be moved to restore balance via the imbalance parameter. | 2282 | * should be moved to restore balance via the imbalance parameter. |
2266 | */ | 2283 | */ |
2267 | static struct sched_group * | 2284 | static struct sched_group * |
2268 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 2285 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
2269 | unsigned long *imbalance, enum cpu_idle_type idle, | 2286 | unsigned long *imbalance, enum cpu_idle_type idle, |
2270 | int *sd_idle, cpumask_t *cpus, int *balance) | 2287 | int *sd_idle, cpumask_t *cpus, int *balance) |
2271 | { | 2288 | { |
2272 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 2289 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
2273 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 2290 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
2274 | unsigned long max_pull; | 2291 | unsigned long max_pull; |
2275 | unsigned long busiest_load_per_task, busiest_nr_running; | 2292 | unsigned long busiest_load_per_task, busiest_nr_running; |
2276 | unsigned long this_load_per_task, this_nr_running; | 2293 | unsigned long this_load_per_task, this_nr_running; |
2277 | int load_idx; | 2294 | int load_idx; |
2278 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 2295 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
2279 | int power_savings_balance = 1; | 2296 | int power_savings_balance = 1; |
2280 | unsigned long leader_nr_running = 0, min_load_per_task = 0; | 2297 | unsigned long leader_nr_running = 0, min_load_per_task = 0; |
2281 | unsigned long min_nr_running = ULONG_MAX; | 2298 | unsigned long min_nr_running = ULONG_MAX; |
2282 | struct sched_group *group_min = NULL, *group_leader = NULL; | 2299 | struct sched_group *group_min = NULL, *group_leader = NULL; |
2283 | #endif | 2300 | #endif |
2284 | 2301 | ||
2285 | max_load = this_load = total_load = total_pwr = 0; | 2302 | max_load = this_load = total_load = total_pwr = 0; |
2286 | busiest_load_per_task = busiest_nr_running = 0; | 2303 | busiest_load_per_task = busiest_nr_running = 0; |
2287 | this_load_per_task = this_nr_running = 0; | 2304 | this_load_per_task = this_nr_running = 0; |
2288 | if (idle == CPU_NOT_IDLE) | 2305 | if (idle == CPU_NOT_IDLE) |
2289 | load_idx = sd->busy_idx; | 2306 | load_idx = sd->busy_idx; |
2290 | else if (idle == CPU_NEWLY_IDLE) | 2307 | else if (idle == CPU_NEWLY_IDLE) |
2291 | load_idx = sd->newidle_idx; | 2308 | load_idx = sd->newidle_idx; |
2292 | else | 2309 | else |
2293 | load_idx = sd->idle_idx; | 2310 | load_idx = sd->idle_idx; |
2294 | 2311 | ||
2295 | do { | 2312 | do { |
2296 | unsigned long load, group_capacity; | 2313 | unsigned long load, group_capacity; |
2297 | int local_group; | 2314 | int local_group; |
2298 | int i; | 2315 | int i; |
2299 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | 2316 | unsigned int balance_cpu = -1, first_idle_cpu = 0; |
2300 | unsigned long sum_nr_running, sum_weighted_load; | 2317 | unsigned long sum_nr_running, sum_weighted_load; |
2301 | 2318 | ||
2302 | local_group = cpu_isset(this_cpu, group->cpumask); | 2319 | local_group = cpu_isset(this_cpu, group->cpumask); |
2303 | 2320 | ||
2304 | if (local_group) | 2321 | if (local_group) |
2305 | balance_cpu = first_cpu(group->cpumask); | 2322 | balance_cpu = first_cpu(group->cpumask); |
2306 | 2323 | ||
2307 | /* Tally up the load of all CPUs in the group */ | 2324 | /* Tally up the load of all CPUs in the group */ |
2308 | sum_weighted_load = sum_nr_running = avg_load = 0; | 2325 | sum_weighted_load = sum_nr_running = avg_load = 0; |
2309 | 2326 | ||
2310 | for_each_cpu_mask(i, group->cpumask) { | 2327 | for_each_cpu_mask(i, group->cpumask) { |
2311 | struct rq *rq; | 2328 | struct rq *rq; |
2312 | 2329 | ||
2313 | if (!cpu_isset(i, *cpus)) | 2330 | if (!cpu_isset(i, *cpus)) |
2314 | continue; | 2331 | continue; |
2315 | 2332 | ||
2316 | rq = cpu_rq(i); | 2333 | rq = cpu_rq(i); |
2317 | 2334 | ||
2318 | if (*sd_idle && rq->nr_running) | 2335 | if (*sd_idle && rq->nr_running) |
2319 | *sd_idle = 0; | 2336 | *sd_idle = 0; |
2320 | 2337 | ||
2321 | /* Bias balancing toward cpus of our domain */ | 2338 | /* Bias balancing toward cpus of our domain */ |
2322 | if (local_group) { | 2339 | if (local_group) { |
2323 | if (idle_cpu(i) && !first_idle_cpu) { | 2340 | if (idle_cpu(i) && !first_idle_cpu) { |
2324 | first_idle_cpu = 1; | 2341 | first_idle_cpu = 1; |
2325 | balance_cpu = i; | 2342 | balance_cpu = i; |
2326 | } | 2343 | } |
2327 | 2344 | ||
2328 | load = target_load(i, load_idx); | 2345 | load = target_load(i, load_idx); |
2329 | } else | 2346 | } else |
2330 | load = source_load(i, load_idx); | 2347 | load = source_load(i, load_idx); |
2331 | 2348 | ||
2332 | avg_load += load; | 2349 | avg_load += load; |
2333 | sum_nr_running += rq->nr_running; | 2350 | sum_nr_running += rq->nr_running; |
2334 | sum_weighted_load += weighted_cpuload(i); | 2351 | sum_weighted_load += weighted_cpuload(i); |
2335 | } | 2352 | } |
2336 | 2353 | ||
2337 | /* | 2354 | /* |
2338 | * First idle cpu or the first cpu(busiest) in this sched group | 2355 | * First idle cpu or the first cpu(busiest) in this sched group |
2339 | * is eligible for doing load balancing at this and above | 2356 | * is eligible for doing load balancing at this and above |
2340 | * domains. In the newly idle case, we will allow all the cpus | 2357 | * domains. In the newly idle case, we will allow all the cpus |
2341 | * to do the newly idle load balance. | 2358 | * to do the newly idle load balance. |
2342 | */ | 2359 | */ |
2343 | if (idle != CPU_NEWLY_IDLE && local_group && | 2360 | if (idle != CPU_NEWLY_IDLE && local_group && |
2344 | balance_cpu != this_cpu && balance) { | 2361 | balance_cpu != this_cpu && balance) { |
2345 | *balance = 0; | 2362 | *balance = 0; |
2346 | goto ret; | 2363 | goto ret; |
2347 | } | 2364 | } |
2348 | 2365 | ||
2349 | total_load += avg_load; | 2366 | total_load += avg_load; |
2350 | total_pwr += group->__cpu_power; | 2367 | total_pwr += group->__cpu_power; |
2351 | 2368 | ||
2352 | /* Adjust by relative CPU power of the group */ | 2369 | /* Adjust by relative CPU power of the group */ |
2353 | avg_load = sg_div_cpu_power(group, | 2370 | avg_load = sg_div_cpu_power(group, |
2354 | avg_load * SCHED_LOAD_SCALE); | 2371 | avg_load * SCHED_LOAD_SCALE); |
2355 | 2372 | ||
2356 | group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; | 2373 | group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; |
2357 | 2374 | ||
2358 | if (local_group) { | 2375 | if (local_group) { |
2359 | this_load = avg_load; | 2376 | this_load = avg_load; |
2360 | this = group; | 2377 | this = group; |
2361 | this_nr_running = sum_nr_running; | 2378 | this_nr_running = sum_nr_running; |
2362 | this_load_per_task = sum_weighted_load; | 2379 | this_load_per_task = sum_weighted_load; |
2363 | } else if (avg_load > max_load && | 2380 | } else if (avg_load > max_load && |
2364 | sum_nr_running > group_capacity) { | 2381 | sum_nr_running > group_capacity) { |
2365 | max_load = avg_load; | 2382 | max_load = avg_load; |
2366 | busiest = group; | 2383 | busiest = group; |
2367 | busiest_nr_running = sum_nr_running; | 2384 | busiest_nr_running = sum_nr_running; |
2368 | busiest_load_per_task = sum_weighted_load; | 2385 | busiest_load_per_task = sum_weighted_load; |
2369 | } | 2386 | } |
2370 | 2387 | ||
2371 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 2388 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
2372 | /* | 2389 | /* |
2373 | * Busy processors will not participate in power savings | 2390 | * Busy processors will not participate in power savings |
2374 | * balance. | 2391 | * balance. |
2375 | */ | 2392 | */ |
2376 | if (idle == CPU_NOT_IDLE || | 2393 | if (idle == CPU_NOT_IDLE || |
2377 | !(sd->flags & SD_POWERSAVINGS_BALANCE)) | 2394 | !(sd->flags & SD_POWERSAVINGS_BALANCE)) |
2378 | goto group_next; | 2395 | goto group_next; |
2379 | 2396 | ||
2380 | /* | 2397 | /* |
2381 | * If the local group is idle or completely loaded | 2398 | * If the local group is idle or completely loaded |
2382 | * no need to do power savings balance at this domain | 2399 | * no need to do power savings balance at this domain |
2383 | */ | 2400 | */ |
2384 | if (local_group && (this_nr_running >= group_capacity || | 2401 | if (local_group && (this_nr_running >= group_capacity || |
2385 | !this_nr_running)) | 2402 | !this_nr_running)) |
2386 | power_savings_balance = 0; | 2403 | power_savings_balance = 0; |
2387 | 2404 | ||
2388 | /* | 2405 | /* |
2389 | * If a group is already running at full capacity or idle, | 2406 | * If a group is already running at full capacity or idle, |
2390 | * don't include that group in power savings calculations | 2407 | * don't include that group in power savings calculations |
2391 | */ | 2408 | */ |
2392 | if (!power_savings_balance || sum_nr_running >= group_capacity | 2409 | if (!power_savings_balance || sum_nr_running >= group_capacity |
2393 | || !sum_nr_running) | 2410 | || !sum_nr_running) |
2394 | goto group_next; | 2411 | goto group_next; |
2395 | 2412 | ||
2396 | /* | 2413 | /* |
2397 | * Calculate the group which has the least non-idle load. | 2414 | * Calculate the group which has the least non-idle load. |
2398 | * This is the group from where we need to pick up the load | 2415 | * This is the group from where we need to pick up the load |
2399 | * for saving power | 2416 | * for saving power |
2400 | */ | 2417 | */ |
2401 | if ((sum_nr_running < min_nr_running) || | 2418 | if ((sum_nr_running < min_nr_running) || |
2402 | (sum_nr_running == min_nr_running && | 2419 | (sum_nr_running == min_nr_running && |
2403 | first_cpu(group->cpumask) < | 2420 | first_cpu(group->cpumask) < |
2404 | first_cpu(group_min->cpumask))) { | 2421 | first_cpu(group_min->cpumask))) { |
2405 | group_min = group; | 2422 | group_min = group; |
2406 | min_nr_running = sum_nr_running; | 2423 | min_nr_running = sum_nr_running; |
2407 | min_load_per_task = sum_weighted_load / | 2424 | min_load_per_task = sum_weighted_load / |
2408 | sum_nr_running; | 2425 | sum_nr_running; |
2409 | } | 2426 | } |
2410 | 2427 | ||
2411 | /* | 2428 | /* |
2412 | * Calculate the group which is nearly at its | 2429 | * Calculate the group which is nearly at its |
2413 | * capacity but still has some room to pick up load | 2430 | * capacity but still has some room to pick up load |
2414 | * from other groups and save more power | 2431 | * from other groups and save more power |
2415 | */ | 2432 | */ |
2416 | if (sum_nr_running <= group_capacity - 1) { | 2433 | if (sum_nr_running <= group_capacity - 1) { |
2417 | if (sum_nr_running > leader_nr_running || | 2434 | if (sum_nr_running > leader_nr_running || |
2418 | (sum_nr_running == leader_nr_running && | 2435 | (sum_nr_running == leader_nr_running && |
2419 | first_cpu(group->cpumask) > | 2436 | first_cpu(group->cpumask) > |
2420 | first_cpu(group_leader->cpumask))) { | 2437 | first_cpu(group_leader->cpumask))) { |
2421 | group_leader = group; | 2438 | group_leader = group; |
2422 | leader_nr_running = sum_nr_running; | 2439 | leader_nr_running = sum_nr_running; |
2423 | } | 2440 | } |
2424 | } | 2441 | } |
2425 | group_next: | 2442 | group_next: |
2426 | #endif | 2443 | #endif |
2427 | group = group->next; | 2444 | group = group->next; |
2428 | } while (group != sd->groups); | 2445 | } while (group != sd->groups); |
2429 | 2446 | ||
2430 | if (!busiest || this_load >= max_load || busiest_nr_running == 0) | 2447 | if (!busiest || this_load >= max_load || busiest_nr_running == 0) |
2431 | goto out_balanced; | 2448 | goto out_balanced; |
2432 | 2449 | ||
2433 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; | 2450 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; |
2434 | 2451 | ||
2435 | if (this_load >= avg_load || | 2452 | if (this_load >= avg_load || |
2436 | 100*max_load <= sd->imbalance_pct*this_load) | 2453 | 100*max_load <= sd->imbalance_pct*this_load) |
2437 | goto out_balanced; | 2454 | goto out_balanced; |
2438 | 2455 | ||
2439 | busiest_load_per_task /= busiest_nr_running; | 2456 | busiest_load_per_task /= busiest_nr_running; |
2440 | /* | 2457 | /* |
2441 | * We're trying to get all the cpus to the average_load, so we don't | 2458 | * We're trying to get all the cpus to the average_load, so we don't |
2442 | * want to push ourselves above the average load, nor do we wish to | 2459 | * want to push ourselves above the average load, nor do we wish to |
2443 | * reduce the max loaded cpu below the average load, as either of these | 2460 | * reduce the max loaded cpu below the average load, as either of these |
2444 | * actions would just result in more rebalancing later, and ping-pong | 2461 | * actions would just result in more rebalancing later, and ping-pong |
2445 | * tasks around. Thus we look for the minimum possible imbalance. | 2462 | * tasks around. Thus we look for the minimum possible imbalance. |
2446 | * Negative imbalances (*we* are more loaded than anyone else) will | 2463 | * Negative imbalances (*we* are more loaded than anyone else) will |
2447 | * be counted as no imbalance for these purposes -- we can't fix that | 2464 | * be counted as no imbalance for these purposes -- we can't fix that |
2448 | * by pulling tasks to us. Be careful of negative numbers as they'll | 2465 | * by pulling tasks to us. Be careful of negative numbers as they'll |
2449 | * appear as very large values with unsigned longs. | 2466 | * appear as very large values with unsigned longs. |
2450 | */ | 2467 | */ |
2451 | if (max_load <= busiest_load_per_task) | 2468 | if (max_load <= busiest_load_per_task) |
2452 | goto out_balanced; | 2469 | goto out_balanced; |
2453 | 2470 | ||
2454 | /* | 2471 | /* |
2455 | * In the presence of smp nice balancing, certain scenarios can have | 2472 | * In the presence of smp nice balancing, certain scenarios can have |
2456 | * max load less than avg load (as we skip the groups at or below | 2473 | * max load less than avg load (as we skip the groups at or below |
2457 | * their cpu_power while calculating max_load) | 2474 | * their cpu_power while calculating max_load) |
2458 | */ | 2475 | */ |
2459 | if (max_load < avg_load) { | 2476 | if (max_load < avg_load) { |
2460 | *imbalance = 0; | 2477 | *imbalance = 0; |
2461 | goto small_imbalance; | 2478 | goto small_imbalance; |
2462 | } | 2479 | } |
2463 | 2480 | ||
2464 | /* Don't want to pull so many tasks that a group would go idle */ | 2481 | /* Don't want to pull so many tasks that a group would go idle */ |
2465 | max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); | 2482 | max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); |
2466 | 2483 | ||
2467 | /* How much load to actually move to equalise the imbalance */ | 2484 | /* How much load to actually move to equalise the imbalance */ |
2468 | *imbalance = min(max_pull * busiest->__cpu_power, | 2485 | *imbalance = min(max_pull * busiest->__cpu_power, |
2469 | (avg_load - this_load) * this->__cpu_power) | 2486 | (avg_load - this_load) * this->__cpu_power) |
2470 | / SCHED_LOAD_SCALE; | 2487 | / SCHED_LOAD_SCALE; |
2471 | 2488 | ||
2472 | /* | 2489 | /* |
2473 | * if *imbalance is less than the average load per runnable task | 2490 | * if *imbalance is less than the average load per runnable task |
2474 | * there is no guarantee that any tasks will be moved so we'll have | 2491 | * there is no guarantee that any tasks will be moved so we'll have |
2475 | * a think about bumping its value to force at least one task to be | 2492 | * a think about bumping its value to force at least one task to be |
2476 | * moved | 2493 | * moved |
2477 | */ | 2494 | */ |
2478 | if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) { | 2495 | if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) { |
2479 | unsigned long tmp, pwr_now, pwr_move; | 2496 | unsigned long tmp, pwr_now, pwr_move; |
2480 | unsigned int imbn; | 2497 | unsigned int imbn; |
2481 | 2498 | ||
2482 | small_imbalance: | 2499 | small_imbalance: |
2483 | pwr_move = pwr_now = 0; | 2500 | pwr_move = pwr_now = 0; |
2484 | imbn = 2; | 2501 | imbn = 2; |
2485 | if (this_nr_running) { | 2502 | if (this_nr_running) { |
2486 | this_load_per_task /= this_nr_running; | 2503 | this_load_per_task /= this_nr_running; |
2487 | if (busiest_load_per_task > this_load_per_task) | 2504 | if (busiest_load_per_task > this_load_per_task) |
2488 | imbn = 1; | 2505 | imbn = 1; |
2489 | } else | 2506 | } else |
2490 | this_load_per_task = SCHED_LOAD_SCALE; | 2507 | this_load_per_task = SCHED_LOAD_SCALE; |
2491 | 2508 | ||
2492 | if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= | 2509 | if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= |
2493 | busiest_load_per_task * imbn) { | 2510 | busiest_load_per_task * imbn) { |
2494 | *imbalance = busiest_load_per_task; | 2511 | *imbalance = busiest_load_per_task; |
2495 | return busiest; | 2512 | return busiest; |
2496 | } | 2513 | } |
2497 | 2514 | ||
2498 | /* | 2515 | /* |
2499 | * OK, we don't have enough imbalance to justify moving tasks, | 2516 | * OK, we don't have enough imbalance to justify moving tasks, |
2500 | * however we may be able to increase total CPU power used by | 2517 | * however we may be able to increase total CPU power used by |
2501 | * moving them. | 2518 | * moving them. |
2502 | */ | 2519 | */ |
2503 | 2520 | ||
2504 | pwr_now += busiest->__cpu_power * | 2521 | pwr_now += busiest->__cpu_power * |
2505 | min(busiest_load_per_task, max_load); | 2522 | min(busiest_load_per_task, max_load); |
2506 | pwr_now += this->__cpu_power * | 2523 | pwr_now += this->__cpu_power * |
2507 | min(this_load_per_task, this_load); | 2524 | min(this_load_per_task, this_load); |
2508 | pwr_now /= SCHED_LOAD_SCALE; | 2525 | pwr_now /= SCHED_LOAD_SCALE; |
2509 | 2526 | ||
2510 | /* Amount of load we'd subtract */ | 2527 | /* Amount of load we'd subtract */ |
2511 | tmp = sg_div_cpu_power(busiest, | 2528 | tmp = sg_div_cpu_power(busiest, |
2512 | busiest_load_per_task * SCHED_LOAD_SCALE); | 2529 | busiest_load_per_task * SCHED_LOAD_SCALE); |
2513 | if (max_load > tmp) | 2530 | if (max_load > tmp) |
2514 | pwr_move += busiest->__cpu_power * | 2531 | pwr_move += busiest->__cpu_power * |
2515 | min(busiest_load_per_task, max_load - tmp); | 2532 | min(busiest_load_per_task, max_load - tmp); |
2516 | 2533 | ||
2517 | /* Amount of load we'd add */ | 2534 | /* Amount of load we'd add */ |
2518 | if (max_load * busiest->__cpu_power < | 2535 | if (max_load * busiest->__cpu_power < |
2519 | busiest_load_per_task * SCHED_LOAD_SCALE) | 2536 | busiest_load_per_task * SCHED_LOAD_SCALE) |
2520 | tmp = sg_div_cpu_power(this, | 2537 | tmp = sg_div_cpu_power(this, |
2521 | max_load * busiest->__cpu_power); | 2538 | max_load * busiest->__cpu_power); |
2522 | else | 2539 | else |
2523 | tmp = sg_div_cpu_power(this, | 2540 | tmp = sg_div_cpu_power(this, |
2524 | busiest_load_per_task * SCHED_LOAD_SCALE); | 2541 | busiest_load_per_task * SCHED_LOAD_SCALE); |
2525 | pwr_move += this->__cpu_power * | 2542 | pwr_move += this->__cpu_power * |
2526 | min(this_load_per_task, this_load + tmp); | 2543 | min(this_load_per_task, this_load + tmp); |
2527 | pwr_move /= SCHED_LOAD_SCALE; | 2544 | pwr_move /= SCHED_LOAD_SCALE; |
2528 | 2545 | ||
2529 | /* Move if we gain throughput */ | 2546 | /* Move if we gain throughput */ |
2530 | if (pwr_move <= pwr_now) | 2547 | if (pwr_move <= pwr_now) |
2531 | goto out_balanced; | 2548 | goto out_balanced; |
2532 | 2549 | ||
2533 | *imbalance = busiest_load_per_task; | 2550 | *imbalance = busiest_load_per_task; |
2534 | } | 2551 | } |
2535 | 2552 | ||
2536 | return busiest; | 2553 | return busiest; |
2537 | 2554 | ||
2538 | out_balanced: | 2555 | out_balanced: |
2539 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 2556 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
2540 | if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | 2557 | if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) |
2541 | goto ret; | 2558 | goto ret; |
2542 | 2559 | ||
2543 | if (this == group_leader && group_leader != group_min) { | 2560 | if (this == group_leader && group_leader != group_min) { |
2544 | *imbalance = min_load_per_task; | 2561 | *imbalance = min_load_per_task; |
2545 | return group_min; | 2562 | return group_min; |
2546 | } | 2563 | } |
2547 | #endif | 2564 | #endif |
2548 | ret: | 2565 | ret: |
2549 | *imbalance = 0; | 2566 | *imbalance = 0; |
2550 | return NULL; | 2567 | return NULL; |
2551 | } | 2568 | } |
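
The imbalance computed above is easier to read with concrete numbers. The program below plugs made-up example loads into the same two-sided min(): never pull more than would bring the busiest group down to the average (or leave it with less than one task's worth of load), and never pull more than the local group needs to reach the average. SCHED_LOAD_SCALE is taken as 1024 and the group power values are invented for the example.

#include <stdio.h>

#define TOY_LOAD_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* example numbers only */
	unsigned long max_load = 3072;			/* busiest group */
	unsigned long avg_load = 2048;			/* domain average */
	unsigned long this_load = 1024;			/* local group */
	unsigned long busiest_load_per_task = 1024;
	unsigned long busiest_power = 1024, this_power = 1024;

	/* don't pull so much that the busiest group would go idle */
	unsigned long max_pull = min_ul(max_load - avg_load,
					max_load - busiest_load_per_task);

	unsigned long imbalance = min_ul(max_pull * busiest_power,
					 (avg_load - this_load) * this_power)
					/ TOY_LOAD_SCALE;

	printf("max_pull=%lu imbalance=%lu\n", max_pull, imbalance);	/* 1024, 1024 */
	return 0;
}
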
2552 | 2569 | ||
2553 | /* | 2570 | /* |
2554 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 2571 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
2555 | */ | 2572 | */ |
2556 | static struct rq * | 2573 | static struct rq * |
2557 | find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | 2574 | find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, |
2558 | unsigned long imbalance, cpumask_t *cpus) | 2575 | unsigned long imbalance, cpumask_t *cpus) |
2559 | { | 2576 | { |
2560 | struct rq *busiest = NULL, *rq; | 2577 | struct rq *busiest = NULL, *rq; |
2561 | unsigned long max_load = 0; | 2578 | unsigned long max_load = 0; |
2562 | int i; | 2579 | int i; |
2563 | 2580 | ||
2564 | for_each_cpu_mask(i, group->cpumask) { | 2581 | for_each_cpu_mask(i, group->cpumask) { |
2565 | unsigned long wl; | 2582 | unsigned long wl; |
2566 | 2583 | ||
2567 | if (!cpu_isset(i, *cpus)) | 2584 | if (!cpu_isset(i, *cpus)) |
2568 | continue; | 2585 | continue; |
2569 | 2586 | ||
2570 | rq = cpu_rq(i); | 2587 | rq = cpu_rq(i); |
2571 | wl = weighted_cpuload(i); | 2588 | wl = weighted_cpuload(i); |
2572 | 2589 | ||
2573 | if (rq->nr_running == 1 && wl > imbalance) | 2590 | if (rq->nr_running == 1 && wl > imbalance) |
2574 | continue; | 2591 | continue; |
2575 | 2592 | ||
2576 | if (wl > max_load) { | 2593 | if (wl > max_load) { |
2577 | max_load = wl; | 2594 | max_load = wl; |
2578 | busiest = rq; | 2595 | busiest = rq; |
2579 | } | 2596 | } |
2580 | } | 2597 | } |
2581 | 2598 | ||
2582 | return busiest; | 2599 | return busiest; |
2583 | } | 2600 | } |
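
find_busiest_queue() is a plain maximum scan over per-CPU weighted load, with one wrinkle: a runqueue whose single task is heavier than the requested imbalance is skipped, because that task could not be moved without emptying the queue. A toy version with arrays standing in for runqueues:

/* Toy busiest-queue scan; arrays stand in for runqueues. */
static int toy_find_busiest_queue(const unsigned long *weighted_load,
				  const unsigned int *nr_running,
				  int ncpus, unsigned long imbalance)
{
	unsigned long max_load = 0;
	int busiest = -1;

	for (int i = 0; i < ncpus; i++) {
		if (nr_running[i] == 1 && weighted_load[i] > imbalance)
			continue;	/* one over-heavy task: nothing movable */
		if (weighted_load[i] > max_load) {
			max_load = weighted_load[i];
			busiest = i;
		}
	}
	return busiest;			/* -1 if no candidate */
}
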
2584 | 2601 | ||
2585 | /* | 2602 | /* |
2586 | * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but | 2603 | * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but |
2587 | * so long as it is large enough. | 2604 | * so long as it is large enough. |
2588 | */ | 2605 | */ |
2589 | #define MAX_PINNED_INTERVAL 512 | 2606 | #define MAX_PINNED_INTERVAL 512 |
2590 | 2607 | ||
2591 | static inline unsigned long minus_1_or_zero(unsigned long n) | ||
2592 | { | ||
2593 | return n > 0 ? n - 1 : 0; | ||
2594 | } | ||
2595 | |||
2596 | /* | 2608 | /* |
2597 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2609 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
2598 | * tasks if there is an imbalance. | 2610 | * tasks if there is an imbalance. |
2599 | */ | 2611 | */ |
2600 | static int load_balance(int this_cpu, struct rq *this_rq, | 2612 | static int load_balance(int this_cpu, struct rq *this_rq, |
2601 | struct sched_domain *sd, enum cpu_idle_type idle, | 2613 | struct sched_domain *sd, enum cpu_idle_type idle, |
2602 | int *balance) | 2614 | int *balance) |
2603 | { | 2615 | { |
2604 | int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | 2616 | int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; |
2605 | struct sched_group *group; | 2617 | struct sched_group *group; |
2606 | unsigned long imbalance; | 2618 | unsigned long imbalance; |
2607 | struct rq *busiest; | 2619 | struct rq *busiest; |
2608 | cpumask_t cpus = CPU_MASK_ALL; | 2620 | cpumask_t cpus = CPU_MASK_ALL; |
2609 | unsigned long flags; | 2621 | unsigned long flags; |
2610 | 2622 | ||
2611 | /* | 2623 | /* |
2612 | * When power savings policy is enabled for the parent domain, idle | 2624 | * When power savings policy is enabled for the parent domain, idle |
2613 | * sibling can pick up load irrespective of busy siblings. In this case, | 2625 | * sibling can pick up load irrespective of busy siblings. In this case, |
2614 | * let the state of idle sibling percolate up as CPU_IDLE, instead of | 2626 | * let the state of idle sibling percolate up as CPU_IDLE, instead of |
2615 | * portraying it as CPU_NOT_IDLE. | 2627 | * portraying it as CPU_NOT_IDLE. |
2616 | */ | 2628 | */ |
2617 | if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && | 2629 | if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && |
2618 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2630 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
2619 | sd_idle = 1; | 2631 | sd_idle = 1; |
2620 | 2632 | ||
2621 | schedstat_inc(sd, lb_cnt[idle]); | 2633 | schedstat_inc(sd, lb_cnt[idle]); |
2622 | 2634 | ||
2623 | redo: | 2635 | redo: |
2624 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 2636 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, |
2625 | &cpus, balance); | 2637 | &cpus, balance); |
2626 | 2638 | ||
2627 | if (*balance == 0) | 2639 | if (*balance == 0) |
2628 | goto out_balanced; | 2640 | goto out_balanced; |
2629 | 2641 | ||
2630 | if (!group) { | 2642 | if (!group) { |
2631 | schedstat_inc(sd, lb_nobusyg[idle]); | 2643 | schedstat_inc(sd, lb_nobusyg[idle]); |
2632 | goto out_balanced; | 2644 | goto out_balanced; |
2633 | } | 2645 | } |
2634 | 2646 | ||
2635 | busiest = find_busiest_queue(group, idle, imbalance, &cpus); | 2647 | busiest = find_busiest_queue(group, idle, imbalance, &cpus); |
2636 | if (!busiest) { | 2648 | if (!busiest) { |
2637 | schedstat_inc(sd, lb_nobusyq[idle]); | 2649 | schedstat_inc(sd, lb_nobusyq[idle]); |
2638 | goto out_balanced; | 2650 | goto out_balanced; |
2639 | } | 2651 | } |
2640 | 2652 | ||
2641 | BUG_ON(busiest == this_rq); | 2653 | BUG_ON(busiest == this_rq); |
2642 | 2654 | ||
2643 | schedstat_add(sd, lb_imbalance[idle], imbalance); | 2655 | schedstat_add(sd, lb_imbalance[idle], imbalance); |
2644 | 2656 | ||
2645 | nr_moved = 0; | 2657 | ld_moved = 0; |
2646 | if (busiest->nr_running > 1) { | 2658 | if (busiest->nr_running > 1) { |
2647 | /* | 2659 | /* |
2648 | * Attempt to move tasks. If find_busiest_group has found | 2660 | * Attempt to move tasks. If find_busiest_group has found |
2649 | * an imbalance but busiest->nr_running <= 1, the group is | 2661 | * an imbalance but busiest->nr_running <= 1, the group is |
2650 | * still unbalanced. nr_moved simply stays zero, so it is | 2662 | * still unbalanced. ld_moved simply stays zero, so it is |
2651 | * correctly treated as an imbalance. | 2663 | * correctly treated as an imbalance. |
2652 | */ | 2664 | */ |
2653 | local_irq_save(flags); | 2665 | local_irq_save(flags); |
2654 | double_rq_lock(this_rq, busiest); | 2666 | double_rq_lock(this_rq, busiest); |
2655 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2667 | ld_moved = move_tasks(this_rq, this_cpu, busiest, |
2656 | minus_1_or_zero(busiest->nr_running), | ||
2657 | imbalance, sd, idle, &all_pinned); | 2668 | imbalance, sd, idle, &all_pinned); |
2658 | double_rq_unlock(this_rq, busiest); | 2669 | double_rq_unlock(this_rq, busiest); |
2659 | local_irq_restore(flags); | 2670 | local_irq_restore(flags); |
2660 | 2671 | ||
2661 | /* | 2672 | /* |
2662 | * some other cpu did the load balance for us. | 2673 | * some other cpu did the load balance for us. |
2663 | */ | 2674 | */ |
2664 | if (nr_moved && this_cpu != smp_processor_id()) | 2675 | if (ld_moved && this_cpu != smp_processor_id()) |
2665 | resched_cpu(this_cpu); | 2676 | resched_cpu(this_cpu); |
2666 | 2677 | ||
2667 | /* All tasks on this runqueue were pinned by CPU affinity */ | 2678 | /* All tasks on this runqueue were pinned by CPU affinity */ |
2668 | if (unlikely(all_pinned)) { | 2679 | if (unlikely(all_pinned)) { |
2669 | cpu_clear(cpu_of(busiest), cpus); | 2680 | cpu_clear(cpu_of(busiest), cpus); |
2670 | if (!cpus_empty(cpus)) | 2681 | if (!cpus_empty(cpus)) |
2671 | goto redo; | 2682 | goto redo; |
2672 | goto out_balanced; | 2683 | goto out_balanced; |
2673 | } | 2684 | } |
2674 | } | 2685 | } |
2675 | 2686 | ||
2676 | if (!nr_moved) { | 2687 | if (!ld_moved) { |
2677 | schedstat_inc(sd, lb_failed[idle]); | 2688 | schedstat_inc(sd, lb_failed[idle]); |
2678 | sd->nr_balance_failed++; | 2689 | sd->nr_balance_failed++; |
2679 | 2690 | ||
2680 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { | 2691 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { |
2681 | 2692 | ||
2682 | spin_lock_irqsave(&busiest->lock, flags); | 2693 | spin_lock_irqsave(&busiest->lock, flags); |
2683 | 2694 | ||
2684 | /* don't kick the migration_thread, if the curr | 2695 | /* don't kick the migration_thread, if the curr |
2685 | * task on busiest cpu can't be moved to this_cpu | 2696 | * task on busiest cpu can't be moved to this_cpu |
2686 | */ | 2697 | */ |
2687 | if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { | 2698 | if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { |
2688 | spin_unlock_irqrestore(&busiest->lock, flags); | 2699 | spin_unlock_irqrestore(&busiest->lock, flags); |
2689 | all_pinned = 1; | 2700 | all_pinned = 1; |
2690 | goto out_one_pinned; | 2701 | goto out_one_pinned; |
2691 | } | 2702 | } |
2692 | 2703 | ||
2693 | if (!busiest->active_balance) { | 2704 | if (!busiest->active_balance) { |
2694 | busiest->active_balance = 1; | 2705 | busiest->active_balance = 1; |
2695 | busiest->push_cpu = this_cpu; | 2706 | busiest->push_cpu = this_cpu; |
2696 | active_balance = 1; | 2707 | active_balance = 1; |
2697 | } | 2708 | } |
2698 | spin_unlock_irqrestore(&busiest->lock, flags); | 2709 | spin_unlock_irqrestore(&busiest->lock, flags); |
2699 | if (active_balance) | 2710 | if (active_balance) |
2700 | wake_up_process(busiest->migration_thread); | 2711 | wake_up_process(busiest->migration_thread); |
2701 | 2712 | ||
2702 | /* | 2713 | /* |
2703 | * We've kicked active balancing, reset the failure | 2714 | * We've kicked active balancing, reset the failure |
2704 | * counter. | 2715 | * counter. |
2705 | */ | 2716 | */ |
2706 | sd->nr_balance_failed = sd->cache_nice_tries+1; | 2717 | sd->nr_balance_failed = sd->cache_nice_tries+1; |
2707 | } | 2718 | } |
2708 | } else | 2719 | } else |
2709 | sd->nr_balance_failed = 0; | 2720 | sd->nr_balance_failed = 0; |
2710 | 2721 | ||
2711 | if (likely(!active_balance)) { | 2722 | if (likely(!active_balance)) { |
2712 | /* We were unbalanced, so reset the balancing interval */ | 2723 | /* We were unbalanced, so reset the balancing interval */ |
2713 | sd->balance_interval = sd->min_interval; | 2724 | sd->balance_interval = sd->min_interval; |
2714 | } else { | 2725 | } else { |
2715 | /* | 2726 | /* |
2716 | * If we've begun active balancing, start to back off. This | 2727 | * If we've begun active balancing, start to back off. This |
2717 | * case may not be covered by the all_pinned logic if there | 2728 | * case may not be covered by the all_pinned logic if there |
2718 | * is only 1 task on the busy runqueue (because we don't call | 2729 | * is only 1 task on the busy runqueue (because we don't call |
2719 | * move_tasks). | 2730 | * move_tasks). |
2720 | */ | 2731 | */ |
2721 | if (sd->balance_interval < sd->max_interval) | 2732 | if (sd->balance_interval < sd->max_interval) |
2722 | sd->balance_interval *= 2; | 2733 | sd->balance_interval *= 2; |
2723 | } | 2734 | } |
2724 | 2735 | ||
2725 | if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 2736 | if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
2726 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2737 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
2727 | return -1; | 2738 | return -1; |
2728 | return nr_moved; | 2739 | return ld_moved; |
2729 | 2740 | ||
2730 | out_balanced: | 2741 | out_balanced: |
2731 | schedstat_inc(sd, lb_balanced[idle]); | 2742 | schedstat_inc(sd, lb_balanced[idle]); |
2732 | 2743 | ||
2733 | sd->nr_balance_failed = 0; | 2744 | sd->nr_balance_failed = 0; |
2734 | 2745 | ||
2735 | out_one_pinned: | 2746 | out_one_pinned: |
2736 | /* tune up the balancing interval */ | 2747 | /* tune up the balancing interval */ |
2737 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || | 2748 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || |
2738 | (sd->balance_interval < sd->max_interval)) | 2749 | (sd->balance_interval < sd->max_interval)) |
2739 | sd->balance_interval *= 2; | 2750 | sd->balance_interval *= 2; |
2740 | 2751 | ||
2741 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 2752 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
2742 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2753 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
2743 | return -1; | 2754 | return -1; |
2744 | return 0; | 2755 | return 0; |
2745 | } | 2756 | } |
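
Stripped of the statistics, the pinned-CPU retry, the power-savings cases and the interval tuning, load_balance() has the shape below. Every toy_* helper is an undefined stub named after the kernel function it stands in for, so this compiles but shows control flow only; it is not a drop-in implementation.

/* Shape of load_balance(); all toy_* helpers are stubs, types are opaque. */
struct toy_group;
struct toy_rq;

extern struct toy_group *toy_find_busiest_group(unsigned long *imbalance);
extern struct toy_rq *toy_find_busiest_queue(struct toy_group *group,
					     unsigned long imbalance);
extern int toy_move_tasks(struct toy_rq *busiest, unsigned long imbalance);
extern void toy_kick_active_balance(struct toy_rq *busiest);

static int toy_load_balance(void)
{
	unsigned long imbalance;
	struct toy_group *group;
	struct toy_rq *busiest;
	int ld_moved;

	group = toy_find_busiest_group(&imbalance);
	if (!group)
		return 0;				/* already balanced */

	busiest = toy_find_busiest_queue(group, imbalance);
	if (!busiest)
		return 0;

	/* the real code holds both runqueue locks around this call */
	ld_moved = toy_move_tasks(busiest, imbalance);
	if (!ld_moved)
		toy_kick_active_balance(busiest);	/* wake the migration thread to push one task */

	return ld_moved;
}
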
2746 | 2757 | ||
2747 | /* | 2758 | /* |
2748 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2759 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
2749 | * tasks if there is an imbalance. | 2760 | * tasks if there is an imbalance. |
2750 | * | 2761 | * |
2751 | * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). | 2762 | * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). |
2752 | * this_rq is locked. | 2763 | * this_rq is locked. |
2753 | */ | 2764 | */ |
2754 | static int | 2765 | static int |
2755 | load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | 2766 | load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) |
2756 | { | 2767 | { |
2757 | struct sched_group *group; | 2768 | struct sched_group *group; |
2758 | struct rq *busiest = NULL; | 2769 | struct rq *busiest = NULL; |
2759 | unsigned long imbalance; | 2770 | unsigned long imbalance; |
2760 | int nr_moved = 0; | 2771 | int ld_moved = 0; |
2761 | int sd_idle = 0; | 2772 | int sd_idle = 0; |
2762 | int all_pinned = 0; | 2773 | int all_pinned = 0; |
2763 | cpumask_t cpus = CPU_MASK_ALL; | 2774 | cpumask_t cpus = CPU_MASK_ALL; |
2764 | 2775 | ||
2765 | /* | 2776 | /* |
2766 | * When power savings policy is enabled for the parent domain, idle | 2777 | * When power savings policy is enabled for the parent domain, idle |
2767 | * sibling can pick up load irrespective of busy siblings. In this case, | 2778 | * sibling can pick up load irrespective of busy siblings. In this case, |
2768 | * let the state of idle sibling percolate up as IDLE, instead of | 2779 | * let the state of idle sibling percolate up as IDLE, instead of |
2769 | * portraying it as CPU_NOT_IDLE. | 2780 | * portraying it as CPU_NOT_IDLE. |
2770 | */ | 2781 | */ |
2771 | if (sd->flags & SD_SHARE_CPUPOWER && | 2782 | if (sd->flags & SD_SHARE_CPUPOWER && |
2772 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2783 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
2773 | sd_idle = 1; | 2784 | sd_idle = 1; |
2774 | 2785 | ||
2775 | schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]); | 2786 | schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]); |
2776 | redo: | 2787 | redo: |
2777 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, | 2788 | group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, |
2778 | &sd_idle, &cpus, NULL); | 2789 | &sd_idle, &cpus, NULL); |
2779 | if (!group) { | 2790 | if (!group) { |
2780 | schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); | 2791 | schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); |
2781 | goto out_balanced; | 2792 | goto out_balanced; |
2782 | } | 2793 | } |
2783 | 2794 | ||
2784 | busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, | 2795 | busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, |
2785 | &cpus); | 2796 | &cpus); |
2786 | if (!busiest) { | 2797 | if (!busiest) { |
2787 | schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); | 2798 | schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); |
2788 | goto out_balanced; | 2799 | goto out_balanced; |
2789 | } | 2800 | } |
2790 | 2801 | ||
2791 | BUG_ON(busiest == this_rq); | 2802 | BUG_ON(busiest == this_rq); |
2792 | 2803 | ||
2793 | schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); | 2804 | schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); |
2794 | 2805 | ||
2795 | nr_moved = 0; | 2806 | ld_moved = 0; |
2796 | if (busiest->nr_running > 1) { | 2807 | if (busiest->nr_running > 1) { |
2797 | /* Attempt to move tasks */ | 2808 | /* Attempt to move tasks */ |
2798 | double_lock_balance(this_rq, busiest); | 2809 | double_lock_balance(this_rq, busiest); |
2799 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2810 | ld_moved = move_tasks(this_rq, this_cpu, busiest, |
2800 | minus_1_or_zero(busiest->nr_running), | ||
2801 | imbalance, sd, CPU_NEWLY_IDLE, | 2811 | imbalance, sd, CPU_NEWLY_IDLE, |
2802 | &all_pinned); | 2812 | &all_pinned); |
2803 | spin_unlock(&busiest->lock); | 2813 | spin_unlock(&busiest->lock); |
2804 | 2814 | ||
2805 | if (unlikely(all_pinned)) { | 2815 | if (unlikely(all_pinned)) { |
2806 | cpu_clear(cpu_of(busiest), cpus); | 2816 | cpu_clear(cpu_of(busiest), cpus); |
2807 | if (!cpus_empty(cpus)) | 2817 | if (!cpus_empty(cpus)) |
2808 | goto redo; | 2818 | goto redo; |
2809 | } | 2819 | } |
2810 | } | 2820 | } |
2811 | 2821 | ||
2812 | if (!nr_moved) { | 2822 | if (!ld_moved) { |
2813 | schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); | 2823 | schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); |
2814 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 2824 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
2815 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2825 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
2816 | return -1; | 2826 | return -1; |
2817 | } else | 2827 | } else |
2818 | sd->nr_balance_failed = 0; | 2828 | sd->nr_balance_failed = 0; |
2819 | 2829 | ||
2820 | return nr_moved; | 2830 | return ld_moved; |
2821 | 2831 | ||
2822 | out_balanced: | 2832 | out_balanced: |
2823 | schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); | 2833 | schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); |
2824 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 2834 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
2825 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2835 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
2826 | return -1; | 2836 | return -1; |
2827 | sd->nr_balance_failed = 0; | 2837 | sd->nr_balance_failed = 0; |
2828 | 2838 | ||
2829 | return 0; | 2839 | return 0; |
2830 | } | 2840 | } |
2831 | 2841 | ||
2832 | /* | 2842 | /* |
2833 | * idle_balance is called by schedule() if this_cpu is about to become | 2843 | * idle_balance is called by schedule() if this_cpu is about to become |
2834 | * idle. Attempts to pull tasks from other CPUs. | 2844 | * idle. Attempts to pull tasks from other CPUs. |
2835 | */ | 2845 | */ |
2836 | static void idle_balance(int this_cpu, struct rq *this_rq) | 2846 | static void idle_balance(int this_cpu, struct rq *this_rq) |
2837 | { | 2847 | { |
2838 | struct sched_domain *sd; | 2848 | struct sched_domain *sd; |
2839 | int pulled_task = -1; | 2849 | int pulled_task = -1; |
2840 | unsigned long next_balance = jiffies + HZ; | 2850 | unsigned long next_balance = jiffies + HZ; |
2841 | 2851 | ||
2842 | for_each_domain(this_cpu, sd) { | 2852 | for_each_domain(this_cpu, sd) { |
2843 | unsigned long interval; | 2853 | unsigned long interval; |
2844 | 2854 | ||
2845 | if (!(sd->flags & SD_LOAD_BALANCE)) | 2855 | if (!(sd->flags & SD_LOAD_BALANCE)) |
2846 | continue; | 2856 | continue; |
2847 | 2857 | ||
2848 | if (sd->flags & SD_BALANCE_NEWIDLE) | 2858 | if (sd->flags & SD_BALANCE_NEWIDLE) |
2849 | /* If we've pulled tasks over stop searching: */ | 2859 | /* If we've pulled tasks over stop searching: */ |
2850 | pulled_task = load_balance_newidle(this_cpu, | 2860 | pulled_task = load_balance_newidle(this_cpu, |
2851 | this_rq, sd); | 2861 | this_rq, sd); |
2852 | 2862 | ||
2853 | interval = msecs_to_jiffies(sd->balance_interval); | 2863 | interval = msecs_to_jiffies(sd->balance_interval); |
2854 | if (time_after(next_balance, sd->last_balance + interval)) | 2864 | if (time_after(next_balance, sd->last_balance + interval)) |
2855 | next_balance = sd->last_balance + interval; | 2865 | next_balance = sd->last_balance + interval; |
2856 | if (pulled_task) | 2866 | if (pulled_task) |
2857 | break; | 2867 | break; |
2858 | } | 2868 | } |
2859 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { | 2869 | if (pulled_task || time_after(jiffies, this_rq->next_balance)) { |
2860 | /* | 2870 | /* |
2861 | * We are going idle. next_balance may be set based on | 2871 | * We are going idle. next_balance may be set based on |
2862 | * a busy processor. So reset next_balance. | 2872 | * a busy processor. So reset next_balance. |
2863 | */ | 2873 | */ |
2864 | this_rq->next_balance = next_balance; | 2874 | this_rq->next_balance = next_balance; |
2865 | } | 2875 | } |
2866 | } | 2876 | } |
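
idle_balance() is a walk up the domain hierarchy that stops at the first level anything was pulled from, while also tracking the earliest time the CPU should rebalance again. In outline, with toy types, a stub helper, and the next_balance bookkeeping omitted:

/* Outline of idle_balance()'s domain walk; toy types and a stub helper. */
struct toy_domain {
	int does_newidle;		/* SD_BALANCE_NEWIDLE set? */
	struct toy_domain *parent;
};

extern int toy_load_balance_newidle(struct toy_domain *sd);

static void toy_idle_balance(struct toy_domain *innermost)
{
	for (struct toy_domain *sd = innermost; sd; sd = sd->parent) {
		if (!sd->does_newidle)
			continue;
		if (toy_load_balance_newidle(sd))
			break;		/* pulled something: stop searching */
	}
}
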
2867 | 2877 | ||
2868 | /* | 2878 | /* |
2869 | * active_load_balance is run by migration threads. It pushes running tasks | 2879 | * active_load_balance is run by migration threads. It pushes running tasks |
2870 | * off the busiest CPU onto idle CPUs. It requires at least 1 task to be | 2880 | * off the busiest CPU onto idle CPUs. It requires at least 1 task to be |
2871 | * running on each physical CPU where possible, and avoids physical / | 2881 | * running on each physical CPU where possible, and avoids physical / |
2872 | * logical imbalances. | 2882 | * logical imbalances. |
2873 | * | 2883 | * |
2874 | * Called with busiest_rq locked. | 2884 | * Called with busiest_rq locked. |
2875 | */ | 2885 | */ |
2876 | static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | 2886 | static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) |
2877 | { | 2887 | { |
2878 | int target_cpu = busiest_rq->push_cpu; | 2888 | int target_cpu = busiest_rq->push_cpu; |
2879 | struct sched_domain *sd; | 2889 | struct sched_domain *sd; |
2880 | struct rq *target_rq; | 2890 | struct rq *target_rq; |
2881 | 2891 | ||
2882 | /* Is there any task to move? */ | 2892 | /* Is there any task to move? */ |
2883 | if (busiest_rq->nr_running <= 1) | 2893 | if (busiest_rq->nr_running <= 1) |
2884 | return; | 2894 | return; |
2885 | 2895 | ||
2886 | target_rq = cpu_rq(target_cpu); | 2896 | target_rq = cpu_rq(target_cpu); |
2887 | 2897 | ||
2888 | /* | 2898 | /* |
2889 | * This condition is "impossible"; if it occurs | 2899 | * This condition is "impossible"; if it occurs |
2890 | * we need to fix it. Originally reported by | 2900 | * we need to fix it. Originally reported by |
2891 | * Bjorn Helgaas on a 128-cpu setup. | 2901 | * Bjorn Helgaas on a 128-cpu setup. |
2892 | */ | 2902 | */ |
2893 | BUG_ON(busiest_rq == target_rq); | 2903 | BUG_ON(busiest_rq == target_rq); |
2894 | 2904 | ||
2895 | /* move a task from busiest_rq to target_rq */ | 2905 | /* move a task from busiest_rq to target_rq */ |
2896 | double_lock_balance(busiest_rq, target_rq); | 2906 | double_lock_balance(busiest_rq, target_rq); |
2897 | 2907 | ||
2898 | /* Search for an sd spanning us and the target CPU. */ | 2908 | /* Search for an sd spanning us and the target CPU. */ |
2899 | for_each_domain(target_cpu, sd) { | 2909 | for_each_domain(target_cpu, sd) { |
2900 | if ((sd->flags & SD_LOAD_BALANCE) && | 2910 | if ((sd->flags & SD_LOAD_BALANCE) && |
2901 | cpu_isset(busiest_cpu, sd->span)) | 2911 | cpu_isset(busiest_cpu, sd->span)) |
2902 | break; | 2912 | break; |
2903 | } | 2913 | } |
2904 | 2914 | ||
2905 | if (likely(sd)) { | 2915 | if (likely(sd)) { |
2906 | schedstat_inc(sd, alb_cnt); | 2916 | schedstat_inc(sd, alb_cnt); |
2907 | 2917 | ||
2908 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, | 2918 | if (move_one_task(target_rq, target_cpu, busiest_rq, |
2909 | ULONG_MAX, sd, CPU_IDLE, NULL)) | 2919 | sd, CPU_IDLE)) |
2910 | schedstat_inc(sd, alb_pushed); | 2920 | schedstat_inc(sd, alb_pushed); |
2911 | else | 2921 | else |
2912 | schedstat_inc(sd, alb_failed); | 2922 | schedstat_inc(sd, alb_failed); |
2913 | } | 2923 | } |
2914 | spin_unlock(&target_rq->lock); | 2924 | spin_unlock(&target_rq->lock); |
2915 | } | 2925 | } |
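
active_load_balance() runs on the busiest CPU's migration thread: the normal pull path cannot take the task that is currently running (can_migrate_task() rejects it), so when balancing keeps failing the busiest CPU is asked to push a task itself, and with this patch that push is expressed directly as move_one_task(). A toy outline of the decision around that call, with bitmask domain spans and stub helpers invented for illustration:

/* Toy outline of active_load_balance(); spans are bitmasks, helpers are stubs. */
struct toy_sched_domain {
	int does_load_balance;		/* SD_LOAD_BALANCE set? */
	unsigned long span;		/* bit i set => CPU i is in this domain */
	struct toy_sched_domain *parent;
};

extern int toy_move_one_task(int target_cpu, int busiest_cpu);

static void toy_active_balance(int busiest_cpu, unsigned int busiest_nr_running,
			       int target_cpu, struct toy_sched_domain *target_sd)
{
	if (busiest_nr_running <= 1)
		return;			/* nothing left to push */

	/* find a domain spanning both the busiest and the target CPU */
	for (struct toy_sched_domain *sd = target_sd; sd; sd = sd->parent) {
		if (sd->does_load_balance && (sd->span & (1UL << busiest_cpu))) {
			toy_move_one_task(target_cpu, busiest_cpu);
			break;
		}
	}
}
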
2916 | 2926 | ||
2917 | #ifdef CONFIG_NO_HZ | 2927 | #ifdef CONFIG_NO_HZ |
2918 | static struct { | 2928 | static struct { |
2919 | atomic_t load_balancer; | 2929 | atomic_t load_balancer; |
2920 | cpumask_t cpu_mask; | 2930 | cpumask_t cpu_mask; |
2921 | } nohz ____cacheline_aligned = { | 2931 | } nohz ____cacheline_aligned = { |
2922 | .load_balancer = ATOMIC_INIT(-1), | 2932 | .load_balancer = ATOMIC_INIT(-1), |
2923 | .cpu_mask = CPU_MASK_NONE, | 2933 | .cpu_mask = CPU_MASK_NONE, |
2924 | }; | 2934 | }; |
2925 | 2935 | ||
2926 | /* | 2936 | /* |
2927 | * This routine will try to nominate the ilb (idle load balancing) | 2937 | * This routine will try to nominate the ilb (idle load balancing) |
2928 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle | 2938 | * owner among the cpus whose ticks are stopped. ilb owner will do the idle |
2929 | * load balancing on behalf of all those cpus. If all the cpus in the system | 2939 | * load balancing on behalf of all those cpus. If all the cpus in the system |
2930 | * go into this tickless mode, then there will be no ilb owner (as there is | 2940 | * go into this tickless mode, then there will be no ilb owner (as there is |
2931 | * no need for one) and all the cpus will sleep till the next wakeup event | 2941 | * no need for one) and all the cpus will sleep till the next wakeup event |
2932 | * arrives... | 2942 | * arrives... |
2933 | * | 2943 | * |
2934 | * For the ilb owner, tick is not stopped. And this tick will be used | 2944 | * For the ilb owner, tick is not stopped. And this tick will be used |
2935 | * for idle load balancing. ilb owner will still be part of | 2945 | * for idle load balancing. ilb owner will still be part of |
2936 | * nohz.cpu_mask.. | 2946 | * nohz.cpu_mask.. |
2937 | * | 2947 | * |
2938 | * While stopping the tick, this cpu will become the ilb owner if there | 2948 | * While stopping the tick, this cpu will become the ilb owner if there |
2939 | * is no other owner. And will be the owner till that cpu becomes busy | 2949 | * is no other owner. And will be the owner till that cpu becomes busy |
2940 | * or if all cpus in the system stop their ticks at which point | 2950 | * or if all cpus in the system stop their ticks at which point |
2941 | * there is no need for ilb owner. | 2951 | * there is no need for ilb owner. |
2942 | * | 2952 | * |
2943 | * When the ilb owner becomes busy, it nominates another owner, during the | 2953 | * When the ilb owner becomes busy, it nominates another owner, during the |
2944 | * next busy scheduler_tick() | 2954 | * next busy scheduler_tick() |
2945 | */ | 2955 | */ |
2946 | int select_nohz_load_balancer(int stop_tick) | 2956 | int select_nohz_load_balancer(int stop_tick) |
2947 | { | 2957 | { |
2948 | int cpu = smp_processor_id(); | 2958 | int cpu = smp_processor_id(); |
2949 | 2959 | ||
2950 | if (stop_tick) { | 2960 | if (stop_tick) { |
2951 | cpu_set(cpu, nohz.cpu_mask); | 2961 | cpu_set(cpu, nohz.cpu_mask); |
2952 | cpu_rq(cpu)->in_nohz_recently = 1; | 2962 | cpu_rq(cpu)->in_nohz_recently = 1; |
2953 | 2963 | ||
2954 | /* | 2964 | /* |
2955 | * If we are going offline and still the leader, give up! | 2965 | * If we are going offline and still the leader, give up! |
2956 | */ | 2966 | */ |
2957 | if (cpu_is_offline(cpu) && | 2967 | if (cpu_is_offline(cpu) && |
2958 | atomic_read(&nohz.load_balancer) == cpu) { | 2968 | atomic_read(&nohz.load_balancer) == cpu) { |
2959 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | 2969 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) |
2960 | BUG(); | 2970 | BUG(); |
2961 | return 0; | 2971 | return 0; |
2962 | } | 2972 | } |
2963 | 2973 | ||
2964 | /* time for ilb owner also to sleep */ | 2974 | /* time for ilb owner also to sleep */ |
2965 | if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { | 2975 | if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { |
2966 | if (atomic_read(&nohz.load_balancer) == cpu) | 2976 | if (atomic_read(&nohz.load_balancer) == cpu) |
2967 | atomic_set(&nohz.load_balancer, -1); | 2977 | atomic_set(&nohz.load_balancer, -1); |
2968 | return 0; | 2978 | return 0; |
2969 | } | 2979 | } |
2970 | 2980 | ||
2971 | if (atomic_read(&nohz.load_balancer) == -1) { | 2981 | if (atomic_read(&nohz.load_balancer) == -1) { |
2972 | /* make me the ilb owner */ | 2982 | /* make me the ilb owner */ |
2973 | if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) | 2983 | if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) |
2974 | return 1; | 2984 | return 1; |
2975 | } else if (atomic_read(&nohz.load_balancer) == cpu) | 2985 | } else if (atomic_read(&nohz.load_balancer) == cpu) |
2976 | return 1; | 2986 | return 1; |
2977 | } else { | 2987 | } else { |
2978 | if (!cpu_isset(cpu, nohz.cpu_mask)) | 2988 | if (!cpu_isset(cpu, nohz.cpu_mask)) |
2979 | return 0; | 2989 | return 0; |
2980 | 2990 | ||
2981 | cpu_clear(cpu, nohz.cpu_mask); | 2991 | cpu_clear(cpu, nohz.cpu_mask); |
2982 | 2992 | ||
2983 | if (atomic_read(&nohz.load_balancer) == cpu) | 2993 | if (atomic_read(&nohz.load_balancer) == cpu) |
2984 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) | 2994 | if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) |
2985 | BUG(); | 2995 | BUG(); |
2986 | } | 2996 | } |
2987 | return 0; | 2997 | return 0; |
2988 | } | 2998 | } |
2989 | #endif | 2999 | #endif |
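The comment block above describes an ownership election: the first CPU to stop its tick claims nohz.load_balancer with an atomic compare-and-swap and hands the slot back when it goes busy or offline. A stand-alone C11 sketch of that claim/release protocol, with hypothetical names and a single-threaded driver:

#include <stdatomic.h>
#include <stdio.h>

#define NO_OWNER (-1)

static atomic_int load_balancer = NO_OWNER;

/* Try to become the idle-load-balance owner; returns 1 if this cpu now owns it. */
static int claim_ilb(int cpu)
{
    int expected = NO_OWNER;

    if (atomic_load(&load_balancer) == cpu)
        return 1;                        /* already the owner */
    return atomic_compare_exchange_strong(&load_balancer, &expected, cpu);
}

/* Give up ownership when this cpu becomes busy or goes offline. */
static void release_ilb(int cpu)
{
    int expected = cpu;

    atomic_compare_exchange_strong(&load_balancer, &expected, NO_OWNER);
}

int main(void)
{
    printf("cpu0 claims: %d\n", claim_ilb(0));   /* wins: slot was free   */
    printf("cpu1 claims: %d\n", claim_ilb(1));   /* loses: cpu0 holds it  */
    release_ilb(0);
    printf("cpu1 claims: %d\n", claim_ilb(1));   /* wins after release    */
    return 0;
}

The compare-and-swap is what lets several CPUs race for the slot without any lock: only the one whose expected value still matches wins.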
2990 | 3000 | ||
2991 | static DEFINE_SPINLOCK(balancing); | 3001 | static DEFINE_SPINLOCK(balancing); |
2992 | 3002 | ||
2993 | /* | 3003 | /* |
2994 | * It checks each scheduling domain to see if it is due to be balanced, | 3004 | * It checks each scheduling domain to see if it is due to be balanced, |
2995 | * and initiates a balancing operation if so. | 3005 | * and initiates a balancing operation if so. |
2996 | * | 3006 | * |
2997 | * Balancing parameters are set up in arch_init_sched_domains. | 3007 | * Balancing parameters are set up in arch_init_sched_domains. |
2998 | */ | 3008 | */ |
2999 | static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) | 3009 | static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) |
3000 | { | 3010 | { |
3001 | int balance = 1; | 3011 | int balance = 1; |
3002 | struct rq *rq = cpu_rq(cpu); | 3012 | struct rq *rq = cpu_rq(cpu); |
3003 | unsigned long interval; | 3013 | unsigned long interval; |
3004 | struct sched_domain *sd; | 3014 | struct sched_domain *sd; |
3005 | /* Earliest time when we have to do rebalance again */ | 3015 | /* Earliest time when we have to do rebalance again */ |
3006 | unsigned long next_balance = jiffies + 60*HZ; | 3016 | unsigned long next_balance = jiffies + 60*HZ; |
3007 | 3017 | ||
3008 | for_each_domain(cpu, sd) { | 3018 | for_each_domain(cpu, sd) { |
3009 | if (!(sd->flags & SD_LOAD_BALANCE)) | 3019 | if (!(sd->flags & SD_LOAD_BALANCE)) |
3010 | continue; | 3020 | continue; |
3011 | 3021 | ||
3012 | interval = sd->balance_interval; | 3022 | interval = sd->balance_interval; |
3013 | if (idle != CPU_IDLE) | 3023 | if (idle != CPU_IDLE) |
3014 | interval *= sd->busy_factor; | 3024 | interval *= sd->busy_factor; |
3015 | 3025 | ||
3016 | /* scale ms to jiffies */ | 3026 | /* scale ms to jiffies */ |
3017 | interval = msecs_to_jiffies(interval); | 3027 | interval = msecs_to_jiffies(interval); |
3018 | if (unlikely(!interval)) | 3028 | if (unlikely(!interval)) |
3019 | interval = 1; | 3029 | interval = 1; |
3020 | if (interval > HZ*NR_CPUS/10) | 3030 | if (interval > HZ*NR_CPUS/10) |
3021 | interval = HZ*NR_CPUS/10; | 3031 | interval = HZ*NR_CPUS/10; |
3022 | 3032 | ||
3023 | 3033 | ||
3024 | if (sd->flags & SD_SERIALIZE) { | 3034 | if (sd->flags & SD_SERIALIZE) { |
3025 | if (!spin_trylock(&balancing)) | 3035 | if (!spin_trylock(&balancing)) |
3026 | goto out; | 3036 | goto out; |
3027 | } | 3037 | } |
3028 | 3038 | ||
3029 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | 3039 | if (time_after_eq(jiffies, sd->last_balance + interval)) { |
3030 | if (load_balance(cpu, rq, sd, idle, &balance)) { | 3040 | if (load_balance(cpu, rq, sd, idle, &balance)) { |
3031 | /* | 3041 | /* |
3032 | * We've pulled tasks over so either we're no | 3042 | * We've pulled tasks over so either we're no |
3033 | * longer idle, or one of our SMT siblings is | 3043 | * longer idle, or one of our SMT siblings is |
3034 | * not idle. | 3044 | * not idle. |
3035 | */ | 3045 | */ |
3036 | idle = CPU_NOT_IDLE; | 3046 | idle = CPU_NOT_IDLE; |
3037 | } | 3047 | } |
3038 | sd->last_balance = jiffies; | 3048 | sd->last_balance = jiffies; |
3039 | } | 3049 | } |
3040 | if (sd->flags & SD_SERIALIZE) | 3050 | if (sd->flags & SD_SERIALIZE) |
3041 | spin_unlock(&balancing); | 3051 | spin_unlock(&balancing); |
3042 | out: | 3052 | out: |
3043 | if (time_after(next_balance, sd->last_balance + interval)) | 3053 | if (time_after(next_balance, sd->last_balance + interval)) |
3044 | next_balance = sd->last_balance + interval; | 3054 | next_balance = sd->last_balance + interval; |
3045 | 3055 | ||
3046 | /* | 3056 | /* |
3047 | * Stop the load balance at this level. There is another | 3057 | * Stop the load balance at this level. There is another |
3048 | * CPU in our sched group which is doing load balancing more | 3058 | * CPU in our sched group which is doing load balancing more |
3049 | * actively. | 3059 | * actively. |
3050 | */ | 3060 | */ |
3051 | if (!balance) | 3061 | if (!balance) |
3052 | break; | 3062 | break; |
3053 | } | 3063 | } |
3054 | rq->next_balance = next_balance; | 3064 | rq->next_balance = next_balance; |
3055 | } | 3065 | } |
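rebalance_domains() scales each domain's balance_interval (milliseconds) by busy_factor when the CPU is not idle, converts it to jiffies and clamps it to the range [1, HZ*NR_CPUS/10]. A small model of that arithmetic, using assumed HZ and NR_CPUS values and the usual ms*HZ/1000 approximation for msecs_to_jiffies():

#include <stdio.h>

#define HZ      250     /* assumed tick rate, for illustration only */
#define NR_CPUS 8       /* assumed CPU count, for illustration only */

static unsigned long msecs_to_jiffies(unsigned long ms)
{
    return (ms * HZ + 999) / 1000;       /* round up, as a rough model */
}

/* Model of the interval computation in rebalance_domains(). */
static unsigned long balance_interval_jiffies(unsigned long interval_ms,
                                              int cpu_is_idle,
                                              unsigned int busy_factor)
{
    unsigned long interval = interval_ms;

    if (!cpu_is_idle)
        interval *= busy_factor;         /* balance less often while busy */

    interval = msecs_to_jiffies(interval);
    if (!interval)
        interval = 1;                    /* never a zero interval */
    if (interval > HZ * NR_CPUS / 10)
        interval = HZ * NR_CPUS / 10;    /* global upper clamp */

    return interval;
}

int main(void)
{
    printf("idle: %lu jiffies\n", balance_interval_jiffies(8, 1, 32));
    printf("busy: %lu jiffies\n", balance_interval_jiffies(8, 0, 32));
    return 0;
}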
3056 | 3066 | ||
3057 | /* | 3067 | /* |
3058 | * run_rebalance_domains is triggered when needed from the scheduler tick. | 3068 | * run_rebalance_domains is triggered when needed from the scheduler tick. |
3059 | * In CONFIG_NO_HZ case, the idle load balance owner will do the | 3069 | * In CONFIG_NO_HZ case, the idle load balance owner will do the |
3060 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | 3070 | * rebalancing for all the cpus for whom scheduler ticks are stopped. |
3061 | */ | 3071 | */ |
3062 | static void run_rebalance_domains(struct softirq_action *h) | 3072 | static void run_rebalance_domains(struct softirq_action *h) |
3063 | { | 3073 | { |
3064 | int this_cpu = smp_processor_id(); | 3074 | int this_cpu = smp_processor_id(); |
3065 | struct rq *this_rq = cpu_rq(this_cpu); | 3075 | struct rq *this_rq = cpu_rq(this_cpu); |
3066 | enum cpu_idle_type idle = this_rq->idle_at_tick ? | 3076 | enum cpu_idle_type idle = this_rq->idle_at_tick ? |
3067 | CPU_IDLE : CPU_NOT_IDLE; | 3077 | CPU_IDLE : CPU_NOT_IDLE; |
3068 | 3078 | ||
3069 | rebalance_domains(this_cpu, idle); | 3079 | rebalance_domains(this_cpu, idle); |
3070 | 3080 | ||
3071 | #ifdef CONFIG_NO_HZ | 3081 | #ifdef CONFIG_NO_HZ |
3072 | /* | 3082 | /* |
3073 | * If this cpu is the owner for idle load balancing, then do the | 3083 | * If this cpu is the owner for idle load balancing, then do the |
3074 | * balancing on behalf of the other idle cpus whose ticks are | 3084 | * balancing on behalf of the other idle cpus whose ticks are |
3075 | * stopped. | 3085 | * stopped. |
3076 | */ | 3086 | */ |
3077 | if (this_rq->idle_at_tick && | 3087 | if (this_rq->idle_at_tick && |
3078 | atomic_read(&nohz.load_balancer) == this_cpu) { | 3088 | atomic_read(&nohz.load_balancer) == this_cpu) { |
3079 | cpumask_t cpus = nohz.cpu_mask; | 3089 | cpumask_t cpus = nohz.cpu_mask; |
3080 | struct rq *rq; | 3090 | struct rq *rq; |
3081 | int balance_cpu; | 3091 | int balance_cpu; |
3082 | 3092 | ||
3083 | cpu_clear(this_cpu, cpus); | 3093 | cpu_clear(this_cpu, cpus); |
3084 | for_each_cpu_mask(balance_cpu, cpus) { | 3094 | for_each_cpu_mask(balance_cpu, cpus) { |
3085 | /* | 3095 | /* |
3086 | * If this cpu gets work to do, stop the load balancing | 3096 | * If this cpu gets work to do, stop the load balancing |
3087 | * work being done for other cpus. Next load | 3097 | * work being done for other cpus. Next load |
3088 | * balancing owner will pick it up. | 3098 | * balancing owner will pick it up. |
3089 | */ | 3099 | */ |
3090 | if (need_resched()) | 3100 | if (need_resched()) |
3091 | break; | 3101 | break; |
3092 | 3102 | ||
3093 | rebalance_domains(balance_cpu, SCHED_IDLE); | 3103 | rebalance_domains(balance_cpu, SCHED_IDLE); |
3094 | 3104 | ||
3095 | rq = cpu_rq(balance_cpu); | 3105 | rq = cpu_rq(balance_cpu); |
3096 | if (time_after(this_rq->next_balance, rq->next_balance)) | 3106 | if (time_after(this_rq->next_balance, rq->next_balance)) |
3097 | this_rq->next_balance = rq->next_balance; | 3107 | this_rq->next_balance = rq->next_balance; |
3098 | } | 3108 | } |
3099 | } | 3109 | } |
3100 | #endif | 3110 | #endif |
3101 | } | 3111 | } |
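In the CONFIG_NO_HZ branch above, the ilb owner walks every other tick-stopped CPU, balances on its behalf, bails out as soon as it has work of its own (need_resched()), and pulls each remote runqueue's next_balance forward into its own. A user-space sketch of that mask walk and deadline propagation, using a plain bitmask in place of cpumask_t (names hypothetical):

#include <stdio.h>
#include <stdint.h>

struct toy_rq { unsigned long next_balance; };

/* Walk every bit set in 'mask' except 'self'; keep the earliest deadline. */
static unsigned long balance_on_behalf(uint64_t mask, int self,
                                       struct toy_rq *rqs,
                                       unsigned long my_next_balance)
{
    mask &= ~(1ULL << self);                 /* skip ourselves */

    while (mask) {
        int cpu = __builtin_ctzll(mask);     /* next set bit */
        mask &= mask - 1;

        /* ... rebalance_domains(cpu, ...) would run here ... */

        if (rqs[cpu].next_balance < my_next_balance)
            my_next_balance = rqs[cpu].next_balance;
    }
    return my_next_balance;
}

int main(void)
{
    struct toy_rq rqs[4] = { {100}, {40}, {70}, {90} };
    /* CPUs 1..3 have stopped their ticks; CPU 0 is the ilb owner. */
    unsigned long next = balance_on_behalf(0xFULL, 0, rqs, rqs[0].next_balance);

    printf("owner's next_balance pulled forward to %lu\n", next);
    return 0;
}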
3102 | 3112 | ||
3103 | /* | 3113 | /* |
3104 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. | 3114 | * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. |
3105 | * | 3115 | * |
3106 | * In case of CONFIG_NO_HZ, this is the place where we nominate a new | 3116 | * In case of CONFIG_NO_HZ, this is the place where we nominate a new |
3107 | * idle load balancing owner or decide to stop the periodic load balancing, | 3117 | * idle load balancing owner or decide to stop the periodic load balancing, |
3108 | * if the whole system is idle. | 3118 | * if the whole system is idle. |
3109 | */ | 3119 | */ |
3110 | static inline void trigger_load_balance(struct rq *rq, int cpu) | 3120 | static inline void trigger_load_balance(struct rq *rq, int cpu) |
3111 | { | 3121 | { |
3112 | #ifdef CONFIG_NO_HZ | 3122 | #ifdef CONFIG_NO_HZ |
3113 | /* | 3123 | /* |
3114 | * If we were in the nohz mode recently and busy at the current | 3124 | * If we were in the nohz mode recently and busy at the current |
3115 | * scheduler tick, then check if we need to nominate new idle | 3125 | * scheduler tick, then check if we need to nominate new idle |
3116 | * load balancer. | 3126 | * load balancer. |
3117 | */ | 3127 | */ |
3118 | if (rq->in_nohz_recently && !rq->idle_at_tick) { | 3128 | if (rq->in_nohz_recently && !rq->idle_at_tick) { |
3119 | rq->in_nohz_recently = 0; | 3129 | rq->in_nohz_recently = 0; |
3120 | 3130 | ||
3121 | if (atomic_read(&nohz.load_balancer) == cpu) { | 3131 | if (atomic_read(&nohz.load_balancer) == cpu) { |
3122 | cpu_clear(cpu, nohz.cpu_mask); | 3132 | cpu_clear(cpu, nohz.cpu_mask); |
3123 | atomic_set(&nohz.load_balancer, -1); | 3133 | atomic_set(&nohz.load_balancer, -1); |
3124 | } | 3134 | } |
3125 | 3135 | ||
3126 | if (atomic_read(&nohz.load_balancer) == -1) { | 3136 | if (atomic_read(&nohz.load_balancer) == -1) { |
3127 | /* | 3137 | /* |
3128 | * simple selection for now: Nominate the | 3138 | * simple selection for now: Nominate the |
3129 | * first cpu in the nohz list to be the next | 3139 | * first cpu in the nohz list to be the next |
3130 | * ilb owner. | 3140 | * ilb owner. |
3131 | * | 3141 | * |
3132 | * TBD: Traverse the sched domains and nominate | 3142 | * TBD: Traverse the sched domains and nominate |
3133 | * the nearest cpu in the nohz.cpu_mask. | 3143 | * the nearest cpu in the nohz.cpu_mask. |
3134 | */ | 3144 | */ |
3135 | int ilb = first_cpu(nohz.cpu_mask); | 3145 | int ilb = first_cpu(nohz.cpu_mask); |
3136 | 3146 | ||
3137 | if (ilb != NR_CPUS) | 3147 | if (ilb != NR_CPUS) |
3138 | resched_cpu(ilb); | 3148 | resched_cpu(ilb); |
3139 | } | 3149 | } |
3140 | } | 3150 | } |
3141 | 3151 | ||
3142 | /* | 3152 | /* |
3143 | * If this cpu is idle and doing idle load balancing for all the | 3153 | * If this cpu is idle and doing idle load balancing for all the |
3144 | * cpus with ticks stopped, is it time for that to stop? | 3154 | * cpus with ticks stopped, is it time for that to stop? |
3145 | */ | 3155 | */ |
3146 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && | 3156 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && |
3147 | cpus_weight(nohz.cpu_mask) == num_online_cpus()) { | 3157 | cpus_weight(nohz.cpu_mask) == num_online_cpus()) { |
3148 | resched_cpu(cpu); | 3158 | resched_cpu(cpu); |
3149 | return; | 3159 | return; |
3150 | } | 3160 | } |
3151 | 3161 | ||
3152 | /* | 3162 | /* |
3153 | * If this cpu is idle and the idle load balancing is done by | 3163 | * If this cpu is idle and the idle load balancing is done by |
3154 | * someone else, then there is no need to raise the SCHED_SOFTIRQ | 3164 | * someone else, then there is no need to raise the SCHED_SOFTIRQ
3155 | */ | 3165 | */ |
3156 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && | 3166 | if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && |
3157 | cpu_isset(cpu, nohz.cpu_mask)) | 3167 | cpu_isset(cpu, nohz.cpu_mask)) |
3158 | return; | 3168 | return; |
3159 | #endif | 3169 | #endif |
3160 | if (time_after_eq(jiffies, rq->next_balance)) | 3170 | if (time_after_eq(jiffies, rq->next_balance)) |
3161 | raise_softirq(SCHED_SOFTIRQ); | 3171 | raise_softirq(SCHED_SOFTIRQ); |
3162 | } | 3172 | } |
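trigger_load_balance() gates the softirq on time_after_eq(jiffies, rq->next_balance). The kernel's time_after*() helpers are written so that the comparison still works after jiffies wraps around; a simplified sketch of that signed-difference idiom (without the kernel's type checking):

#include <stdio.h>

/* Wrap-safe comparisons: cast the difference to signed and test its sign. */
#define time_after_eq(a, b)  ((long)((a) - (b)) >= 0)
#define time_after(a, b)     ((long)((b) - (a)) < 0)

int main(void)
{
    unsigned long next_balance = (unsigned long)-5;   /* just before wrap */
    unsigned long jiffies      = 3;                   /* just after wrap  */

    /* A naive 'jiffies >= next_balance' says "not yet"; the idiom says "due". */
    printf("naive: %d  wrap-safe: %d\n",
           jiffies >= next_balance, time_after_eq(jiffies, next_balance));
    return 0;
}

The unsigned subtraction turns a small forward step across the wrap point into a small positive delta, so the deadline still fires on time.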
3163 | 3173 | ||
3164 | #else /* CONFIG_SMP */ | 3174 | #else /* CONFIG_SMP */ |
3165 | 3175 | ||
3166 | /* | 3176 | /* |
3167 | * on UP we do not need to balance between CPUs: | 3177 | * on UP we do not need to balance between CPUs: |
3168 | */ | 3178 | */ |
3169 | static inline void idle_balance(int cpu, struct rq *rq) | 3179 | static inline void idle_balance(int cpu, struct rq *rq) |
3170 | { | 3180 | { |
3171 | } | 3181 | } |
3172 | 3182 | ||
3173 | /* Avoid "used but not defined" warning on UP */ | 3183 | /* Avoid "used but not defined" warning on UP */ |
3174 | static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | 3184 | static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
3175 | unsigned long max_nr_move, unsigned long max_load_move, | 3185 | unsigned long max_nr_move, unsigned long max_load_move, |
3176 | struct sched_domain *sd, enum cpu_idle_type idle, | 3186 | struct sched_domain *sd, enum cpu_idle_type idle, |
3177 | int *all_pinned, unsigned long *load_moved, | 3187 | int *all_pinned, unsigned long *load_moved, |
3178 | int this_best_prio, int best_prio, int best_prio_seen, | 3188 | int this_best_prio, int best_prio, int best_prio_seen, |
3179 | struct rq_iterator *iterator) | 3189 | struct rq_iterator *iterator) |
3180 | { | 3190 | { |
3181 | *load_moved = 0; | 3191 | *load_moved = 0; |
3182 | 3192 | ||
3183 | return 0; | 3193 | return 0; |
3184 | } | 3194 | } |
3185 | 3195 | ||
3186 | #endif | 3196 | #endif |
3187 | 3197 | ||
3188 | DEFINE_PER_CPU(struct kernel_stat, kstat); | 3198 | DEFINE_PER_CPU(struct kernel_stat, kstat); |
3189 | 3199 | ||
3190 | EXPORT_PER_CPU_SYMBOL(kstat); | 3200 | EXPORT_PER_CPU_SYMBOL(kstat); |
3191 | 3201 | ||
3192 | /* | 3202 | /* |
3193 | * Return p->sum_exec_runtime plus any more ns on the sched_clock | 3203 | * Return p->sum_exec_runtime plus any more ns on the sched_clock |
3194 | * that have not yet been banked in case the task is currently running. | 3204 | * that have not yet been banked in case the task is currently running. |
3195 | */ | 3205 | */ |
3196 | unsigned long long task_sched_runtime(struct task_struct *p) | 3206 | unsigned long long task_sched_runtime(struct task_struct *p) |
3197 | { | 3207 | { |
3198 | unsigned long flags; | 3208 | unsigned long flags; |
3199 | u64 ns, delta_exec; | 3209 | u64 ns, delta_exec; |
3200 | struct rq *rq; | 3210 | struct rq *rq; |
3201 | 3211 | ||
3202 | rq = task_rq_lock(p, &flags); | 3212 | rq = task_rq_lock(p, &flags); |
3203 | ns = p->se.sum_exec_runtime; | 3213 | ns = p->se.sum_exec_runtime; |
3204 | if (rq->curr == p) { | 3214 | if (rq->curr == p) { |
3205 | delta_exec = rq_clock(rq) - p->se.exec_start; | 3215 | delta_exec = rq_clock(rq) - p->se.exec_start; |
3206 | if ((s64)delta_exec > 0) | 3216 | if ((s64)delta_exec > 0) |
3207 | ns += delta_exec; | 3217 | ns += delta_exec; |
3208 | } | 3218 | } |
3209 | task_rq_unlock(rq, &flags); | 3219 | task_rq_unlock(rq, &flags); |
3210 | 3220 | ||
3211 | return ns; | 3221 | return ns; |
3212 | } | 3222 | } |
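task_sched_runtime() returns the banked sum_exec_runtime plus, when the task is currently on a CPU, the delta since exec_start that has not been banked yet. The same accumulate-plus-in-flight-delta pattern in user space, stopwatch style, with a monotonic clock (struct and field names hypothetical):

#include <stdio.h>
#include <stdint.h>
#include <time.h>

struct stopwatch {
    uint64_t banked_ns;   /* like se.sum_exec_runtime */
    uint64_t start_ns;    /* like se.exec_start       */
    int      running;     /* like "rq->curr == p"     */
};

static uint64_t now_ns(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* Banked time plus the not-yet-banked slice if currently running. */
static uint64_t total_ns(const struct stopwatch *w)
{
    uint64_t ns = w->banked_ns;

    if (w->running) {
        uint64_t delta = now_ns() - w->start_ns;
        if ((int64_t)delta > 0)          /* guard, as the kernel does for s64 */
            ns += delta;
    }
    return ns;
}

int main(void)
{
    struct stopwatch w = { .banked_ns = 500, .start_ns = now_ns(), .running = 1 };

    printf("runtime so far: %llu ns\n", (unsigned long long)total_ns(&w));
    return 0;
}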
3213 | 3223 | ||
3214 | /* | 3224 | /* |
3215 | * Account user cpu time to a process. | 3225 | * Account user cpu time to a process. |
3216 | * @p: the process that the cpu time gets accounted to | 3226 | * @p: the process that the cpu time gets accounted to |
3217 | * @hardirq_offset: the offset to subtract from hardirq_count() | 3227 | * @hardirq_offset: the offset to subtract from hardirq_count() |
3218 | * @cputime: the cpu time spent in user space since the last update | 3228 | * @cputime: the cpu time spent in user space since the last update |
3219 | */ | 3229 | */ |
3220 | void account_user_time(struct task_struct *p, cputime_t cputime) | 3230 | void account_user_time(struct task_struct *p, cputime_t cputime) |
3221 | { | 3231 | { |
3222 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3232 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3223 | cputime64_t tmp; | 3233 | cputime64_t tmp; |
3224 | 3234 | ||
3225 | p->utime = cputime_add(p->utime, cputime); | 3235 | p->utime = cputime_add(p->utime, cputime); |
3226 | 3236 | ||
3227 | /* Add user time to cpustat. */ | 3237 | /* Add user time to cpustat. */ |
3228 | tmp = cputime_to_cputime64(cputime); | 3238 | tmp = cputime_to_cputime64(cputime); |
3229 | if (TASK_NICE(p) > 0) | 3239 | if (TASK_NICE(p) > 0) |
3230 | cpustat->nice = cputime64_add(cpustat->nice, tmp); | 3240 | cpustat->nice = cputime64_add(cpustat->nice, tmp); |
3231 | else | 3241 | else |
3232 | cpustat->user = cputime64_add(cpustat->user, tmp); | 3242 | cpustat->user = cputime64_add(cpustat->user, tmp); |
3233 | } | 3243 | } |
3234 | 3244 | ||
3235 | /* | 3245 | /* |
3236 | * Account system cpu time to a process. | 3246 | * Account system cpu time to a process. |
3237 | * @p: the process that the cpu time gets accounted to | 3247 | * @p: the process that the cpu time gets accounted to |
3238 | * @hardirq_offset: the offset to subtract from hardirq_count() | 3248 | * @hardirq_offset: the offset to subtract from hardirq_count() |
3239 | * @cputime: the cpu time spent in kernel space since the last update | 3249 | * @cputime: the cpu time spent in kernel space since the last update |
3240 | */ | 3250 | */ |
3241 | void account_system_time(struct task_struct *p, int hardirq_offset, | 3251 | void account_system_time(struct task_struct *p, int hardirq_offset, |
3242 | cputime_t cputime) | 3252 | cputime_t cputime) |
3243 | { | 3253 | { |
3244 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3254 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3245 | struct rq *rq = this_rq(); | 3255 | struct rq *rq = this_rq(); |
3246 | cputime64_t tmp; | 3256 | cputime64_t tmp; |
3247 | 3257 | ||
3248 | p->stime = cputime_add(p->stime, cputime); | 3258 | p->stime = cputime_add(p->stime, cputime); |
3249 | 3259 | ||
3250 | /* Add system time to cpustat. */ | 3260 | /* Add system time to cpustat. */ |
3251 | tmp = cputime_to_cputime64(cputime); | 3261 | tmp = cputime_to_cputime64(cputime); |
3252 | if (hardirq_count() - hardirq_offset) | 3262 | if (hardirq_count() - hardirq_offset) |
3253 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | 3263 | cpustat->irq = cputime64_add(cpustat->irq, tmp); |
3254 | else if (softirq_count()) | 3264 | else if (softirq_count()) |
3255 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | 3265 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); |
3256 | else if (p != rq->idle) | 3266 | else if (p != rq->idle) |
3257 | cpustat->system = cputime64_add(cpustat->system, tmp); | 3267 | cpustat->system = cputime64_add(cpustat->system, tmp); |
3258 | else if (atomic_read(&rq->nr_iowait) > 0) | 3268 | else if (atomic_read(&rq->nr_iowait) > 0) |
3259 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); | 3269 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); |
3260 | else | 3270 | else |
3261 | cpustat->idle = cputime64_add(cpustat->idle, tmp); | 3271 | cpustat->idle = cputime64_add(cpustat->idle, tmp); |
3262 | /* Account for system time used */ | 3272 | /* Account for system time used */ |
3263 | acct_update_integrals(p); | 3273 | acct_update_integrals(p); |
3264 | } | 3274 | } |
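account_system_time() files a tick's worth of time into exactly one bucket, checked in priority order: hardirq, then softirq, then ordinary system time, and iowait or idle only for the idle task. A compact sketch of that classification cascade (the enum and flags below are illustrative, not the kernel's):

#include <stdio.h>

enum bucket { B_IRQ, B_SOFTIRQ, B_SYSTEM, B_IOWAIT, B_IDLE };

static const char *name[] = { "irq", "softirq", "system", "iowait", "idle" };

/* Mirrors the if/else ladder in account_system_time(). */
static enum bucket classify(int in_hardirq, int in_softirq,
                            int is_idle_task, int nr_iowait)
{
    if (in_hardirq)
        return B_IRQ;
    if (in_softirq)
        return B_SOFTIRQ;
    if (!is_idle_task)
        return B_SYSTEM;
    if (nr_iowait > 0)
        return B_IOWAIT;
    return B_IDLE;
}

int main(void)
{
    printf("%s\n", name[classify(0, 1, 0, 0)]);   /* softirq */
    printf("%s\n", name[classify(0, 0, 1, 2)]);   /* iowait  */
    printf("%s\n", name[classify(0, 0, 0, 0)]);   /* system  */
    return 0;
}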
3265 | 3275 | ||
3266 | /* | 3276 | /* |
3267 | * Account for involuntary wait time. | 3277 | * Account for involuntary wait time. |
3268 | * @p: the process from which the cpu time has been stolen | 3278 | * @p: the process from which the cpu time has been stolen |
3269 | * @steal: the cpu time spent in involuntary wait | 3279 | * @steal: the cpu time spent in involuntary wait |
3270 | */ | 3280 | */ |
3271 | void account_steal_time(struct task_struct *p, cputime_t steal) | 3281 | void account_steal_time(struct task_struct *p, cputime_t steal) |
3272 | { | 3282 | { |
3273 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3283 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3274 | cputime64_t tmp = cputime_to_cputime64(steal); | 3284 | cputime64_t tmp = cputime_to_cputime64(steal); |
3275 | struct rq *rq = this_rq(); | 3285 | struct rq *rq = this_rq(); |
3276 | 3286 | ||
3277 | if (p == rq->idle) { | 3287 | if (p == rq->idle) { |
3278 | p->stime = cputime_add(p->stime, steal); | 3288 | p->stime = cputime_add(p->stime, steal); |
3279 | if (atomic_read(&rq->nr_iowait) > 0) | 3289 | if (atomic_read(&rq->nr_iowait) > 0) |
3280 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); | 3290 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); |
3281 | else | 3291 | else |
3282 | cpustat->idle = cputime64_add(cpustat->idle, tmp); | 3292 | cpustat->idle = cputime64_add(cpustat->idle, tmp); |
3283 | } else | 3293 | } else |
3284 | cpustat->steal = cputime64_add(cpustat->steal, tmp); | 3294 | cpustat->steal = cputime64_add(cpustat->steal, tmp); |
3285 | } | 3295 | } |
3286 | 3296 | ||
3287 | /* | 3297 | /* |
3288 | * This function gets called by the timer code, with HZ frequency. | 3298 | * This function gets called by the timer code, with HZ frequency. |
3289 | * We call it with interrupts disabled. | 3299 | * We call it with interrupts disabled. |
3290 | * | 3300 | * |
3291 | * It also gets called by the fork code, when changing the parent's | 3301 | * It also gets called by the fork code, when changing the parent's |
3292 | * timeslices. | 3302 | * timeslices. |
3293 | */ | 3303 | */ |
3294 | void scheduler_tick(void) | 3304 | void scheduler_tick(void) |
3295 | { | 3305 | { |
3296 | int cpu = smp_processor_id(); | 3306 | int cpu = smp_processor_id(); |
3297 | struct rq *rq = cpu_rq(cpu); | 3307 | struct rq *rq = cpu_rq(cpu); |
3298 | struct task_struct *curr = rq->curr; | 3308 | struct task_struct *curr = rq->curr; |
3299 | 3309 | ||
3300 | spin_lock(&rq->lock); | 3310 | spin_lock(&rq->lock); |
3301 | update_cpu_load(rq); | 3311 | update_cpu_load(rq); |
3302 | if (curr != rq->idle) /* FIXME: needed? */ | 3312 | if (curr != rq->idle) /* FIXME: needed? */ |
3303 | curr->sched_class->task_tick(rq, curr); | 3313 | curr->sched_class->task_tick(rq, curr); |
3304 | spin_unlock(&rq->lock); | 3314 | spin_unlock(&rq->lock); |
3305 | 3315 | ||
3306 | #ifdef CONFIG_SMP | 3316 | #ifdef CONFIG_SMP |
3307 | rq->idle_at_tick = idle_cpu(cpu); | 3317 | rq->idle_at_tick = idle_cpu(cpu); |
3308 | trigger_load_balance(rq, cpu); | 3318 | trigger_load_balance(rq, cpu); |
3309 | #endif | 3319 | #endif |
3310 | } | 3320 | } |
3311 | 3321 | ||
3312 | #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) | 3322 | #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) |
3313 | 3323 | ||
3314 | void fastcall add_preempt_count(int val) | 3324 | void fastcall add_preempt_count(int val) |
3315 | { | 3325 | { |
3316 | /* | 3326 | /* |
3317 | * Underflow? | 3327 | * Underflow? |
3318 | */ | 3328 | */ |
3319 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) | 3329 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) |
3320 | return; | 3330 | return; |
3321 | preempt_count() += val; | 3331 | preempt_count() += val; |
3322 | /* | 3332 | /* |
3323 | * Spinlock count overflowing soon? | 3333 | * Spinlock count overflowing soon? |
3324 | */ | 3334 | */ |
3325 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= | 3335 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= |
3326 | PREEMPT_MASK - 10); | 3336 | PREEMPT_MASK - 10); |
3327 | } | 3337 | } |
3328 | EXPORT_SYMBOL(add_preempt_count); | 3338 | EXPORT_SYMBOL(add_preempt_count); |
3329 | 3339 | ||
3330 | void fastcall sub_preempt_count(int val) | 3340 | void fastcall sub_preempt_count(int val) |
3331 | { | 3341 | { |
3332 | /* | 3342 | /* |
3333 | * Underflow? | 3343 | * Underflow? |
3334 | */ | 3344 | */ |
3335 | if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) | 3345 | if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) |
3336 | return; | 3346 | return; |
3337 | /* | 3347 | /* |
3338 | * Is the spinlock portion underflowing? | 3348 | * Is the spinlock portion underflowing? |
3339 | */ | 3349 | */ |
3340 | if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && | 3350 | if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && |
3341 | !(preempt_count() & PREEMPT_MASK))) | 3351 | !(preempt_count() & PREEMPT_MASK))) |
3342 | return; | 3352 | return; |
3343 | 3353 | ||
3344 | preempt_count() -= val; | 3354 | preempt_count() -= val; |
3345 | } | 3355 | } |
3346 | EXPORT_SYMBOL(sub_preempt_count); | 3356 | EXPORT_SYMBOL(sub_preempt_count); |
3347 | 3357 | ||
3348 | #endif | 3358 | #endif |
3349 | 3359 | ||
3350 | /* | 3360 | /* |
3351 | * Print scheduling while atomic bug: | 3361 | * Print scheduling while atomic bug: |
3352 | */ | 3362 | */ |
3353 | static noinline void __schedule_bug(struct task_struct *prev) | 3363 | static noinline void __schedule_bug(struct task_struct *prev) |
3354 | { | 3364 | { |
3355 | printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n", | 3365 | printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n", |
3356 | prev->comm, preempt_count(), prev->pid); | 3366 | prev->comm, preempt_count(), prev->pid); |
3357 | debug_show_held_locks(prev); | 3367 | debug_show_held_locks(prev); |
3358 | if (irqs_disabled()) | 3368 | if (irqs_disabled()) |
3359 | print_irqtrace_events(prev); | 3369 | print_irqtrace_events(prev); |
3360 | dump_stack(); | 3370 | dump_stack(); |
3361 | } | 3371 | } |
3362 | 3372 | ||
3363 | /* | 3373 | /* |
3364 | * Various schedule()-time debugging checks and statistics: | 3374 | * Various schedule()-time debugging checks and statistics: |
3365 | */ | 3375 | */ |
3366 | static inline void schedule_debug(struct task_struct *prev) | 3376 | static inline void schedule_debug(struct task_struct *prev) |
3367 | { | 3377 | { |
3368 | /* | 3378 | /* |
3369 | * Test if we are atomic. Since do_exit() needs to call into | 3379 | * Test if we are atomic. Since do_exit() needs to call into |
3370 | * schedule() atomically, we ignore that path for now. | 3380 | * schedule() atomically, we ignore that path for now. |
3371 | * Otherwise, whine if we are scheduling when we should not be. | 3381 | * Otherwise, whine if we are scheduling when we should not be. |
3372 | */ | 3382 | */ |
3373 | if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state)) | 3383 | if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state)) |
3374 | __schedule_bug(prev); | 3384 | __schedule_bug(prev); |
3375 | 3385 | ||
3376 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 3386 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
3377 | 3387 | ||
3378 | schedstat_inc(this_rq(), sched_cnt); | 3388 | schedstat_inc(this_rq(), sched_cnt); |
3379 | } | 3389 | } |
3380 | 3390 | ||
3381 | /* | 3391 | /* |
3382 | * Pick up the highest-prio task: | 3392 | * Pick up the highest-prio task: |
3383 | */ | 3393 | */ |
3384 | static inline struct task_struct * | 3394 | static inline struct task_struct * |
3385 | pick_next_task(struct rq *rq, struct task_struct *prev, u64 now) | 3395 | pick_next_task(struct rq *rq, struct task_struct *prev, u64 now) |
3386 | { | 3396 | { |
3387 | struct sched_class *class; | 3397 | struct sched_class *class; |
3388 | struct task_struct *p; | 3398 | struct task_struct *p; |
3389 | 3399 | ||
3390 | /* | 3400 | /* |
3391 | * Optimization: we know that if all tasks are in | 3401 | * Optimization: we know that if all tasks are in |
3392 | * the fair class we can call that function directly: | 3402 | * the fair class we can call that function directly: |
3393 | */ | 3403 | */ |
3394 | if (likely(rq->nr_running == rq->cfs.nr_running)) { | 3404 | if (likely(rq->nr_running == rq->cfs.nr_running)) { |
3395 | p = fair_sched_class.pick_next_task(rq, now); | 3405 | p = fair_sched_class.pick_next_task(rq, now); |
3396 | if (likely(p)) | 3406 | if (likely(p)) |
3397 | return p; | 3407 | return p; |
3398 | } | 3408 | } |
3399 | 3409 | ||
3400 | class = sched_class_highest; | 3410 | class = sched_class_highest; |
3401 | for ( ; ; ) { | 3411 | for ( ; ; ) { |
3402 | p = class->pick_next_task(rq, now); | 3412 | p = class->pick_next_task(rq, now); |
3403 | if (p) | 3413 | if (p) |
3404 | return p; | 3414 | return p; |
3405 | /* | 3415 | /* |
3406 | * Will never be NULL as the idle class always | 3416 | * Will never be NULL as the idle class always |
3407 | * returns a non-NULL p: | 3417 | * returns a non-NULL p: |
3408 | */ | 3418 | */ |
3409 | class = class->next; | 3419 | class = class->next; |
3410 | } | 3420 | } |
3411 | } | 3421 | } |
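pick_next_task() above encodes two ideas: a fast path when every runnable task belongs to the fair class, and otherwise a walk down the class list from highest priority until a class supplies a task, with the idle class guaranteeing an answer. A toy model of that walk with function pointers (all names hypothetical):

#include <stdio.h>
#include <stddef.h>

struct task { const char *name; };

struct sched_class {
    const struct sched_class *next;              /* next lower-priority class */
    struct task *(*pick_next_task)(void);
};

static struct task idle_task = { "swapper" };

static struct task *pick_rt(void)   { return NULL; }        /* nothing queued */
static struct task *pick_fair(void) { return NULL; }        /* nothing queued */
static struct task *pick_idle(void) { return &idle_task; }  /* never fails    */

static const struct sched_class idle_class = { NULL,        pick_idle };
static const struct sched_class fair_class = { &idle_class, pick_fair };
static const struct sched_class rt_class   = { &fair_class, pick_rt   };

/* Walk from the highest-priority class; the idle class guarantees an answer. */
static struct task *pick_next(void)
{
    const struct sched_class *class = &rt_class;

    for (;;) {
        struct task *p = class->pick_next_task();
        if (p)
            return p;
        class = class->next;
    }
}

int main(void)
{
    printf("picked: %s\n", pick_next()->name);
    return 0;
}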
3412 | 3422 | ||
3413 | /* | 3423 | /* |
3414 | * schedule() is the main scheduler function. | 3424 | * schedule() is the main scheduler function. |
3415 | */ | 3425 | */ |
3416 | asmlinkage void __sched schedule(void) | 3426 | asmlinkage void __sched schedule(void) |
3417 | { | 3427 | { |
3418 | struct task_struct *prev, *next; | 3428 | struct task_struct *prev, *next; |
3419 | long *switch_count; | 3429 | long *switch_count; |
3420 | struct rq *rq; | 3430 | struct rq *rq; |
3421 | u64 now; | 3431 | u64 now; |
3422 | int cpu; | 3432 | int cpu; |
3423 | 3433 | ||
3424 | need_resched: | 3434 | need_resched: |
3425 | preempt_disable(); | 3435 | preempt_disable(); |
3426 | cpu = smp_processor_id(); | 3436 | cpu = smp_processor_id(); |
3427 | rq = cpu_rq(cpu); | 3437 | rq = cpu_rq(cpu); |
3428 | rcu_qsctr_inc(cpu); | 3438 | rcu_qsctr_inc(cpu); |
3429 | prev = rq->curr; | 3439 | prev = rq->curr; |
3430 | switch_count = &prev->nivcsw; | 3440 | switch_count = &prev->nivcsw; |
3431 | 3441 | ||
3432 | release_kernel_lock(prev); | 3442 | release_kernel_lock(prev); |
3433 | need_resched_nonpreemptible: | 3443 | need_resched_nonpreemptible: |
3434 | 3444 | ||
3435 | schedule_debug(prev); | 3445 | schedule_debug(prev); |
3436 | 3446 | ||
3437 | spin_lock_irq(&rq->lock); | 3447 | spin_lock_irq(&rq->lock); |
3438 | clear_tsk_need_resched(prev); | 3448 | clear_tsk_need_resched(prev); |
3439 | 3449 | ||
3440 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3450 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
3441 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && | 3451 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && |
3442 | unlikely(signal_pending(prev)))) { | 3452 | unlikely(signal_pending(prev)))) { |
3443 | prev->state = TASK_RUNNING; | 3453 | prev->state = TASK_RUNNING; |
3444 | } else { | 3454 | } else { |
3445 | deactivate_task(rq, prev, 1); | 3455 | deactivate_task(rq, prev, 1); |
3446 | } | 3456 | } |
3447 | switch_count = &prev->nvcsw; | 3457 | switch_count = &prev->nvcsw; |
3448 | } | 3458 | } |
3449 | 3459 | ||
3450 | if (unlikely(!rq->nr_running)) | 3460 | if (unlikely(!rq->nr_running)) |
3451 | idle_balance(cpu, rq); | 3461 | idle_balance(cpu, rq); |
3452 | 3462 | ||
3453 | now = __rq_clock(rq); | 3463 | now = __rq_clock(rq); |
3454 | prev->sched_class->put_prev_task(rq, prev, now); | 3464 | prev->sched_class->put_prev_task(rq, prev, now); |
3455 | next = pick_next_task(rq, prev, now); | 3465 | next = pick_next_task(rq, prev, now); |
3456 | 3466 | ||
3457 | sched_info_switch(prev, next); | 3467 | sched_info_switch(prev, next); |
3458 | 3468 | ||
3459 | if (likely(prev != next)) { | 3469 | if (likely(prev != next)) { |
3460 | rq->nr_switches++; | 3470 | rq->nr_switches++; |
3461 | rq->curr = next; | 3471 | rq->curr = next; |
3462 | ++*switch_count; | 3472 | ++*switch_count; |
3463 | 3473 | ||
3464 | context_switch(rq, prev, next); /* unlocks the rq */ | 3474 | context_switch(rq, prev, next); /* unlocks the rq */ |
3465 | } else | 3475 | } else |
3466 | spin_unlock_irq(&rq->lock); | 3476 | spin_unlock_irq(&rq->lock); |
3467 | 3477 | ||
3468 | if (unlikely(reacquire_kernel_lock(current) < 0)) { | 3478 | if (unlikely(reacquire_kernel_lock(current) < 0)) { |
3469 | cpu = smp_processor_id(); | 3479 | cpu = smp_processor_id(); |
3470 | rq = cpu_rq(cpu); | 3480 | rq = cpu_rq(cpu); |
3471 | goto need_resched_nonpreemptible; | 3481 | goto need_resched_nonpreemptible; |
3472 | } | 3482 | } |
3473 | preempt_enable_no_resched(); | 3483 | preempt_enable_no_resched(); |
3474 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3484 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
3475 | goto need_resched; | 3485 | goto need_resched; |
3476 | } | 3486 | } |
3477 | EXPORT_SYMBOL(schedule); | 3487 | EXPORT_SYMBOL(schedule); |
3478 | 3488 | ||
3479 | #ifdef CONFIG_PREEMPT | 3489 | #ifdef CONFIG_PREEMPT |
3480 | /* | 3490 | /* |
3481 | * this is the entry point to schedule() from in-kernel preemption | 3491 | * this is the entry point to schedule() from in-kernel preemption |
3482 | * off of preempt_enable. Kernel preemptions off return from interrupt | 3492 | * off of preempt_enable. Kernel preemptions off return from interrupt |
3483 | * occur there and call schedule directly. | 3493 | * occur there and call schedule directly. |
3484 | */ | 3494 | */ |
3485 | asmlinkage void __sched preempt_schedule(void) | 3495 | asmlinkage void __sched preempt_schedule(void) |
3486 | { | 3496 | { |
3487 | struct thread_info *ti = current_thread_info(); | 3497 | struct thread_info *ti = current_thread_info(); |
3488 | #ifdef CONFIG_PREEMPT_BKL | 3498 | #ifdef CONFIG_PREEMPT_BKL |
3489 | struct task_struct *task = current; | 3499 | struct task_struct *task = current; |
3490 | int saved_lock_depth; | 3500 | int saved_lock_depth; |
3491 | #endif | 3501 | #endif |
3492 | /* | 3502 | /* |
3493 | * If there is a non-zero preempt_count or interrupts are disabled, | 3503 | * If there is a non-zero preempt_count or interrupts are disabled, |
3494 | * we do not want to preempt the current task. Just return.. | 3504 | * we do not want to preempt the current task. Just return.. |
3495 | */ | 3505 | */ |
3496 | if (likely(ti->preempt_count || irqs_disabled())) | 3506 | if (likely(ti->preempt_count || irqs_disabled())) |
3497 | return; | 3507 | return; |
3498 | 3508 | ||
3499 | need_resched: | 3509 | need_resched: |
3500 | add_preempt_count(PREEMPT_ACTIVE); | 3510 | add_preempt_count(PREEMPT_ACTIVE); |
3501 | /* | 3511 | /* |
3502 | * We keep the big kernel semaphore locked, but we | 3512 | * We keep the big kernel semaphore locked, but we |
3503 | * clear ->lock_depth so that schedule() doesn't | 3513 | * clear ->lock_depth so that schedule() doesn't
3504 | * auto-release the semaphore: | 3514 | * auto-release the semaphore: |
3505 | */ | 3515 | */ |
3506 | #ifdef CONFIG_PREEMPT_BKL | 3516 | #ifdef CONFIG_PREEMPT_BKL |
3507 | saved_lock_depth = task->lock_depth; | 3517 | saved_lock_depth = task->lock_depth; |
3508 | task->lock_depth = -1; | 3518 | task->lock_depth = -1; |
3509 | #endif | 3519 | #endif |
3510 | schedule(); | 3520 | schedule(); |
3511 | #ifdef CONFIG_PREEMPT_BKL | 3521 | #ifdef CONFIG_PREEMPT_BKL |
3512 | task->lock_depth = saved_lock_depth; | 3522 | task->lock_depth = saved_lock_depth; |
3513 | #endif | 3523 | #endif |
3514 | sub_preempt_count(PREEMPT_ACTIVE); | 3524 | sub_preempt_count(PREEMPT_ACTIVE); |
3515 | 3525 | ||
3516 | /* we could miss a preemption opportunity between schedule and now */ | 3526 | /* we could miss a preemption opportunity between schedule and now */ |
3517 | barrier(); | 3527 | barrier(); |
3518 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3528 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
3519 | goto need_resched; | 3529 | goto need_resched; |
3520 | } | 3530 | } |
3521 | EXPORT_SYMBOL(preempt_schedule); | 3531 | EXPORT_SYMBOL(preempt_schedule); |
3522 | 3532 | ||
3523 | /* | 3533 | /* |
3524 | * this is the entry point to schedule() from kernel preemption | 3534 | * this is the entry point to schedule() from kernel preemption |
3525 | * off of irq context. | 3535 | * off of irq context. |
3526 | * Note that this is called and returns with irqs disabled. This will | 3536 | * Note that this is called and returns with irqs disabled. This will
3527 | * protect us against recursive calling from irq. | 3537 | * protect us against recursive calling from irq. |
3528 | */ | 3538 | */ |
3529 | asmlinkage void __sched preempt_schedule_irq(void) | 3539 | asmlinkage void __sched preempt_schedule_irq(void) |
3530 | { | 3540 | { |
3531 | struct thread_info *ti = current_thread_info(); | 3541 | struct thread_info *ti = current_thread_info(); |
3532 | #ifdef CONFIG_PREEMPT_BKL | 3542 | #ifdef CONFIG_PREEMPT_BKL |
3533 | struct task_struct *task = current; | 3543 | struct task_struct *task = current; |
3534 | int saved_lock_depth; | 3544 | int saved_lock_depth; |
3535 | #endif | 3545 | #endif |
3536 | /* Catch callers which need to be fixed */ | 3546 | /* Catch callers which need to be fixed */ |
3537 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 3547 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
3538 | 3548 | ||
3539 | need_resched: | 3549 | need_resched: |
3540 | add_preempt_count(PREEMPT_ACTIVE); | 3550 | add_preempt_count(PREEMPT_ACTIVE); |
3541 | /* | 3551 | /* |
3542 | * We keep the big kernel semaphore locked, but we | 3552 | * We keep the big kernel semaphore locked, but we |
3543 | * clear ->lock_depth so that schedule() doesn't | 3553 | * clear ->lock_depth so that schedule() doesn't
3544 | * auto-release the semaphore: | 3554 | * auto-release the semaphore: |
3545 | */ | 3555 | */ |
3546 | #ifdef CONFIG_PREEMPT_BKL | 3556 | #ifdef CONFIG_PREEMPT_BKL |
3547 | saved_lock_depth = task->lock_depth; | 3557 | saved_lock_depth = task->lock_depth; |
3548 | task->lock_depth = -1; | 3558 | task->lock_depth = -1; |
3549 | #endif | 3559 | #endif |
3550 | local_irq_enable(); | 3560 | local_irq_enable(); |
3551 | schedule(); | 3561 | schedule(); |
3552 | local_irq_disable(); | 3562 | local_irq_disable(); |
3553 | #ifdef CONFIG_PREEMPT_BKL | 3563 | #ifdef CONFIG_PREEMPT_BKL |
3554 | task->lock_depth = saved_lock_depth; | 3564 | task->lock_depth = saved_lock_depth; |
3555 | #endif | 3565 | #endif |
3556 | sub_preempt_count(PREEMPT_ACTIVE); | 3566 | sub_preempt_count(PREEMPT_ACTIVE); |
3557 | 3567 | ||
3558 | /* we could miss a preemption opportunity between schedule and now */ | 3568 | /* we could miss a preemption opportunity between schedule and now */ |
3559 | barrier(); | 3569 | barrier(); |
3560 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3570 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
3561 | goto need_resched; | 3571 | goto need_resched; |
3562 | } | 3572 | } |
3563 | 3573 | ||
3564 | #endif /* CONFIG_PREEMPT */ | 3574 | #endif /* CONFIG_PREEMPT */ |
3565 | 3575 | ||
3566 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, | 3576 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, |
3567 | void *key) | 3577 | void *key) |
3568 | { | 3578 | { |
3569 | return try_to_wake_up(curr->private, mode, sync); | 3579 | return try_to_wake_up(curr->private, mode, sync); |
3570 | } | 3580 | } |
3571 | EXPORT_SYMBOL(default_wake_function); | 3581 | EXPORT_SYMBOL(default_wake_function); |
3572 | 3582 | ||
3573 | /* | 3583 | /* |
3574 | * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just | 3584 | * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just |
3575 | * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve | 3585 | * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve |
3576 | * number) then we wake all the non-exclusive tasks and one exclusive task. | 3586 | * number) then we wake all the non-exclusive tasks and one exclusive task. |
3577 | * | 3587 | * |
3578 | * There are circumstances in which we can try to wake a task which has already | 3588 | * There are circumstances in which we can try to wake a task which has already |
3579 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns | 3589 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns |
3580 | * zero in this (rare) case, and we handle it by continuing to scan the queue. | 3590 | * zero in this (rare) case, and we handle it by continuing to scan the queue. |
3581 | */ | 3591 | */ |
3582 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | 3592 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, |
3583 | int nr_exclusive, int sync, void *key) | 3593 | int nr_exclusive, int sync, void *key) |
3584 | { | 3594 | { |
3585 | struct list_head *tmp, *next; | 3595 | struct list_head *tmp, *next; |
3586 | 3596 | ||
3587 | list_for_each_safe(tmp, next, &q->task_list) { | 3597 | list_for_each_safe(tmp, next, &q->task_list) { |
3588 | wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); | 3598 | wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); |
3589 | unsigned flags = curr->flags; | 3599 | unsigned flags = curr->flags; |
3590 | 3600 | ||
3591 | if (curr->func(curr, mode, sync, key) && | 3601 | if (curr->func(curr, mode, sync, key) && |
3592 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) | 3602 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) |
3593 | break; | 3603 | break; |
3594 | } | 3604 | } |
3595 | } | 3605 | } |
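The comment before __wake_up_common() spells out the policy: wake every non-exclusive waiter, but stop once nr_exclusive exclusive waiters have actually been woken, where a waiter whose callback returns 0 does not count. A small sketch of that rule over a plain array instead of a wait queue (fields hypothetical):

#include <stdio.h>

#define WQ_FLAG_EXCLUSIVE 0x01

struct waiter {
    const char *name;
    unsigned    flags;
    int         wakeable;   /* models the wakeup callback's return value */
};

/* Wake everything non-exclusive plus up to nr_exclusive exclusive waiters. */
static void wake_up_common(struct waiter *w, int n, int nr_exclusive)
{
    for (int i = 0; i < n; i++) {
        int woke = w[i].wakeable;        /* curr->func(...) in the kernel */

        if (woke)
            printf("woke %s\n", w[i].name);
        if (woke && (w[i].flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
            break;
    }
}

int main(void)
{
    struct waiter q[] = {
        { "reader-a", 0,                 1 },
        { "writer-1", WQ_FLAG_EXCLUSIVE, 0 },   /* already running: skipped */
        { "writer-2", WQ_FLAG_EXCLUSIVE, 1 },   /* consumes the one slot    */
        { "writer-3", WQ_FLAG_EXCLUSIVE, 1 },   /* not reached              */
    };
    wake_up_common(q, 4, 1);
    return 0;
}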
3596 | 3606 | ||
3597 | /** | 3607 | /** |
3598 | * __wake_up - wake up threads blocked on a waitqueue. | 3608 | * __wake_up - wake up threads blocked on a waitqueue. |
3599 | * @q: the waitqueue | 3609 | * @q: the waitqueue |
3600 | * @mode: which threads | 3610 | * @mode: which threads |
3601 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | 3611 | * @nr_exclusive: how many wake-one or wake-many threads to wake up |
3602 | * @key: is directly passed to the wakeup function | 3612 | * @key: is directly passed to the wakeup function |
3603 | */ | 3613 | */ |
3604 | void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, | 3614 | void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, |
3605 | int nr_exclusive, void *key) | 3615 | int nr_exclusive, void *key) |
3606 | { | 3616 | { |
3607 | unsigned long flags; | 3617 | unsigned long flags; |
3608 | 3618 | ||
3609 | spin_lock_irqsave(&q->lock, flags); | 3619 | spin_lock_irqsave(&q->lock, flags); |
3610 | __wake_up_common(q, mode, nr_exclusive, 0, key); | 3620 | __wake_up_common(q, mode, nr_exclusive, 0, key); |
3611 | spin_unlock_irqrestore(&q->lock, flags); | 3621 | spin_unlock_irqrestore(&q->lock, flags); |
3612 | } | 3622 | } |
3613 | EXPORT_SYMBOL(__wake_up); | 3623 | EXPORT_SYMBOL(__wake_up); |
3614 | 3624 | ||
3615 | /* | 3625 | /* |
3616 | * Same as __wake_up but called with the spinlock in wait_queue_head_t held. | 3626 | * Same as __wake_up but called with the spinlock in wait_queue_head_t held. |
3617 | */ | 3627 | */ |
3618 | void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) | 3628 | void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) |
3619 | { | 3629 | { |
3620 | __wake_up_common(q, mode, 1, 0, NULL); | 3630 | __wake_up_common(q, mode, 1, 0, NULL); |
3621 | } | 3631 | } |
3622 | 3632 | ||
3623 | /** | 3633 | /** |
3624 | * __wake_up_sync - wake up threads blocked on a waitqueue. | 3634 | * __wake_up_sync - wake up threads blocked on a waitqueue. |
3625 | * @q: the waitqueue | 3635 | * @q: the waitqueue |
3626 | * @mode: which threads | 3636 | * @mode: which threads |
3627 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | 3637 | * @nr_exclusive: how many wake-one or wake-many threads to wake up |
3628 | * | 3638 | * |
3629 | * The sync wakeup differs in that the waker knows that it will schedule | 3639 | * The sync wakeup differs in that the waker knows that it will schedule
3630 | * away soon, so while the target thread will be woken up, it will not | 3640 | * away soon, so while the target thread will be woken up, it will not |
3631 | * be migrated to another CPU - ie. the two threads are 'synchronized' | 3641 | * be migrated to another CPU - ie. the two threads are 'synchronized' |
3632 | * with each other. This can prevent needless bouncing between CPUs. | 3642 | * with each other. This can prevent needless bouncing between CPUs. |
3633 | * | 3643 | * |
3634 | * On UP it can prevent extra preemption. | 3644 | * On UP it can prevent extra preemption. |
3635 | */ | 3645 | */ |
3636 | void fastcall | 3646 | void fastcall |
3637 | __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) | 3647 | __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) |
3638 | { | 3648 | { |
3639 | unsigned long flags; | 3649 | unsigned long flags; |
3640 | int sync = 1; | 3650 | int sync = 1; |
3641 | 3651 | ||
3642 | if (unlikely(!q)) | 3652 | if (unlikely(!q)) |
3643 | return; | 3653 | return; |
3644 | 3654 | ||
3645 | if (unlikely(!nr_exclusive)) | 3655 | if (unlikely(!nr_exclusive)) |
3646 | sync = 0; | 3656 | sync = 0; |
3647 | 3657 | ||
3648 | spin_lock_irqsave(&q->lock, flags); | 3658 | spin_lock_irqsave(&q->lock, flags); |
3649 | __wake_up_common(q, mode, nr_exclusive, sync, NULL); | 3659 | __wake_up_common(q, mode, nr_exclusive, sync, NULL); |
3650 | spin_unlock_irqrestore(&q->lock, flags); | 3660 | spin_unlock_irqrestore(&q->lock, flags); |
3651 | } | 3661 | } |
3652 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ | 3662 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ |
3653 | 3663 | ||
3654 | void fastcall complete(struct completion *x) | 3664 | void fastcall complete(struct completion *x) |
3655 | { | 3665 | { |
3656 | unsigned long flags; | 3666 | unsigned long flags; |
3657 | 3667 | ||
3658 | spin_lock_irqsave(&x->wait.lock, flags); | 3668 | spin_lock_irqsave(&x->wait.lock, flags); |
3659 | x->done++; | 3669 | x->done++; |
3660 | __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, | 3670 | __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, |
3661 | 1, 0, NULL); | 3671 | 1, 0, NULL); |
3662 | spin_unlock_irqrestore(&x->wait.lock, flags); | 3672 | spin_unlock_irqrestore(&x->wait.lock, flags); |
3663 | } | 3673 | } |
3664 | EXPORT_SYMBOL(complete); | 3674 | EXPORT_SYMBOL(complete); |
3665 | 3675 | ||
3666 | void fastcall complete_all(struct completion *x) | 3676 | void fastcall complete_all(struct completion *x) |
3667 | { | 3677 | { |
3668 | unsigned long flags; | 3678 | unsigned long flags; |
3669 | 3679 | ||
3670 | spin_lock_irqsave(&x->wait.lock, flags); | 3680 | spin_lock_irqsave(&x->wait.lock, flags); |
3671 | x->done += UINT_MAX/2; | 3681 | x->done += UINT_MAX/2; |
3672 | __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, | 3682 | __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, |
3673 | 0, 0, NULL); | 3683 | 0, 0, NULL); |
3674 | spin_unlock_irqrestore(&x->wait.lock, flags); | 3684 | spin_unlock_irqrestore(&x->wait.lock, flags); |
3675 | } | 3685 | } |
3676 | EXPORT_SYMBOL(complete_all); | 3686 | EXPORT_SYMBOL(complete_all); |
3677 | 3687 | ||
3678 | void fastcall __sched wait_for_completion(struct completion *x) | 3688 | void fastcall __sched wait_for_completion(struct completion *x) |
3679 | { | 3689 | { |
3680 | might_sleep(); | 3690 | might_sleep(); |
3681 | 3691 | ||
3682 | spin_lock_irq(&x->wait.lock); | 3692 | spin_lock_irq(&x->wait.lock); |
3683 | if (!x->done) { | 3693 | if (!x->done) { |
3684 | DECLARE_WAITQUEUE(wait, current); | 3694 | DECLARE_WAITQUEUE(wait, current); |
3685 | 3695 | ||
3686 | wait.flags |= WQ_FLAG_EXCLUSIVE; | 3696 | wait.flags |= WQ_FLAG_EXCLUSIVE; |
3687 | __add_wait_queue_tail(&x->wait, &wait); | 3697 | __add_wait_queue_tail(&x->wait, &wait); |
3688 | do { | 3698 | do { |
3689 | __set_current_state(TASK_UNINTERRUPTIBLE); | 3699 | __set_current_state(TASK_UNINTERRUPTIBLE); |
3690 | spin_unlock_irq(&x->wait.lock); | 3700 | spin_unlock_irq(&x->wait.lock); |
3691 | schedule(); | 3701 | schedule(); |
3692 | spin_lock_irq(&x->wait.lock); | 3702 | spin_lock_irq(&x->wait.lock); |
3693 | } while (!x->done); | 3703 | } while (!x->done); |
3694 | __remove_wait_queue(&x->wait, &wait); | 3704 | __remove_wait_queue(&x->wait, &wait); |
3695 | } | 3705 | } |
3696 | x->done--; | 3706 | x->done--; |
3697 | spin_unlock_irq(&x->wait.lock); | 3707 | spin_unlock_irq(&x->wait.lock); |
3698 | } | 3708 | } |
3699 | EXPORT_SYMBOL(wait_for_completion); | 3709 | EXPORT_SYMBOL(wait_for_completion); |
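wait_for_completion() above is the counted-completion pattern: a done count protected by the wait-queue lock, waiters sleeping until it becomes non-zero and then consuming one count. A user-space rendering of the same shape with POSIX threads (this is not the kernel's implementation; build with -pthread):

#include <pthread.h>
#include <stdio.h>

struct completion {
    int             done;
    pthread_mutex_t lock;
    pthread_cond_t  wait;
};

static void complete(struct completion *x)
{
    pthread_mutex_lock(&x->lock);
    x->done++;                               /* bank one completion */
    pthread_cond_signal(&x->wait);
    pthread_mutex_unlock(&x->lock);
}

static void wait_for_completion(struct completion *x)
{
    pthread_mutex_lock(&x->lock);
    while (!x->done)                         /* sleep until someone completes */
        pthread_cond_wait(&x->wait, &x->lock);
    x->done--;                               /* consume exactly one count */
    pthread_mutex_unlock(&x->lock);
}

static struct completion c = {
    0, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER
};

static void *worker(void *arg)
{
    (void)arg;
    complete(&c);                            /* signal that the work is done */
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, worker, NULL);
    wait_for_completion(&c);
    pthread_join(t, NULL);
    puts("work completed");
    return 0;
}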
3700 | 3710 | ||
3701 | unsigned long fastcall __sched | 3711 | unsigned long fastcall __sched |
3702 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | 3712 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) |
3703 | { | 3713 | { |
3704 | might_sleep(); | 3714 | might_sleep(); |
3705 | 3715 | ||
3706 | spin_lock_irq(&x->wait.lock); | 3716 | spin_lock_irq(&x->wait.lock); |
3707 | if (!x->done) { | 3717 | if (!x->done) { |
3708 | DECLARE_WAITQUEUE(wait, current); | 3718 | DECLARE_WAITQUEUE(wait, current); |
3709 | 3719 | ||
3710 | wait.flags |= WQ_FLAG_EXCLUSIVE; | 3720 | wait.flags |= WQ_FLAG_EXCLUSIVE; |
3711 | __add_wait_queue_tail(&x->wait, &wait); | 3721 | __add_wait_queue_tail(&x->wait, &wait); |
3712 | do { | 3722 | do { |
3713 | __set_current_state(TASK_UNINTERRUPTIBLE); | 3723 | __set_current_state(TASK_UNINTERRUPTIBLE); |
3714 | spin_unlock_irq(&x->wait.lock); | 3724 | spin_unlock_irq(&x->wait.lock); |
3715 | timeout = schedule_timeout(timeout); | 3725 | timeout = schedule_timeout(timeout); |
3716 | spin_lock_irq(&x->wait.lock); | 3726 | spin_lock_irq(&x->wait.lock); |
3717 | if (!timeout) { | 3727 | if (!timeout) { |
3718 | __remove_wait_queue(&x->wait, &wait); | 3728 | __remove_wait_queue(&x->wait, &wait); |
3719 | goto out; | 3729 | goto out; |
3720 | } | 3730 | } |
3721 | } while (!x->done); | 3731 | } while (!x->done); |
3722 | __remove_wait_queue(&x->wait, &wait); | 3732 | __remove_wait_queue(&x->wait, &wait); |
3723 | } | 3733 | } |
3724 | x->done--; | 3734 | x->done--; |
3725 | out: | 3735 | out: |
3726 | spin_unlock_irq(&x->wait.lock); | 3736 | spin_unlock_irq(&x->wait.lock); |
3727 | return timeout; | 3737 | return timeout; |
3728 | } | 3738 | } |
3729 | EXPORT_SYMBOL(wait_for_completion_timeout); | 3739 | EXPORT_SYMBOL(wait_for_completion_timeout); |
3730 | 3740 | ||
3731 | int fastcall __sched wait_for_completion_interruptible(struct completion *x) | 3741 | int fastcall __sched wait_for_completion_interruptible(struct completion *x) |
3732 | { | 3742 | { |
3733 | int ret = 0; | 3743 | int ret = 0; |
3734 | 3744 | ||
3735 | might_sleep(); | 3745 | might_sleep(); |
3736 | 3746 | ||
3737 | spin_lock_irq(&x->wait.lock); | 3747 | spin_lock_irq(&x->wait.lock); |
3738 | if (!x->done) { | 3748 | if (!x->done) { |
3739 | DECLARE_WAITQUEUE(wait, current); | 3749 | DECLARE_WAITQUEUE(wait, current); |
3740 | 3750 | ||
3741 | wait.flags |= WQ_FLAG_EXCLUSIVE; | 3751 | wait.flags |= WQ_FLAG_EXCLUSIVE; |
3742 | __add_wait_queue_tail(&x->wait, &wait); | 3752 | __add_wait_queue_tail(&x->wait, &wait); |
3743 | do { | 3753 | do { |
3744 | if (signal_pending(current)) { | 3754 | if (signal_pending(current)) { |
3745 | ret = -ERESTARTSYS; | 3755 | ret = -ERESTARTSYS; |
3746 | __remove_wait_queue(&x->wait, &wait); | 3756 | __remove_wait_queue(&x->wait, &wait); |
3747 | goto out; | 3757 | goto out; |
3748 | } | 3758 | } |
3749 | __set_current_state(TASK_INTERRUPTIBLE); | 3759 | __set_current_state(TASK_INTERRUPTIBLE); |
3750 | spin_unlock_irq(&x->wait.lock); | 3760 | spin_unlock_irq(&x->wait.lock); |
3751 | schedule(); | 3761 | schedule(); |
3752 | spin_lock_irq(&x->wait.lock); | 3762 | spin_lock_irq(&x->wait.lock); |
3753 | } while (!x->done); | 3763 | } while (!x->done); |
3754 | __remove_wait_queue(&x->wait, &wait); | 3764 | __remove_wait_queue(&x->wait, &wait); |
3755 | } | 3765 | } |
3756 | x->done--; | 3766 | x->done--; |
3757 | out: | 3767 | out: |
3758 | spin_unlock_irq(&x->wait.lock); | 3768 | spin_unlock_irq(&x->wait.lock); |
3759 | 3769 | ||
3760 | return ret; | 3770 | return ret; |
3761 | } | 3771 | } |
3762 | EXPORT_SYMBOL(wait_for_completion_interruptible); | 3772 | EXPORT_SYMBOL(wait_for_completion_interruptible); |
3763 | 3773 | ||
3764 | unsigned long fastcall __sched | 3774 | unsigned long fastcall __sched |
3765 | wait_for_completion_interruptible_timeout(struct completion *x, | 3775 | wait_for_completion_interruptible_timeout(struct completion *x, |
3766 | unsigned long timeout) | 3776 | unsigned long timeout) |
3767 | { | 3777 | { |
3768 | might_sleep(); | 3778 | might_sleep(); |
3769 | 3779 | ||
3770 | spin_lock_irq(&x->wait.lock); | 3780 | spin_lock_irq(&x->wait.lock); |
3771 | if (!x->done) { | 3781 | if (!x->done) { |
3772 | DECLARE_WAITQUEUE(wait, current); | 3782 | DECLARE_WAITQUEUE(wait, current); |
3773 | 3783 | ||
3774 | wait.flags |= WQ_FLAG_EXCLUSIVE; | 3784 | wait.flags |= WQ_FLAG_EXCLUSIVE; |
3775 | __add_wait_queue_tail(&x->wait, &wait); | 3785 | __add_wait_queue_tail(&x->wait, &wait); |
3776 | do { | 3786 | do { |
3777 | if (signal_pending(current)) { | 3787 | if (signal_pending(current)) { |
3778 | timeout = -ERESTARTSYS; | 3788 | timeout = -ERESTARTSYS; |
3779 | __remove_wait_queue(&x->wait, &wait); | 3789 | __remove_wait_queue(&x->wait, &wait); |
3780 | goto out; | 3790 | goto out; |
3781 | } | 3791 | } |
3782 | __set_current_state(TASK_INTERRUPTIBLE); | 3792 | __set_current_state(TASK_INTERRUPTIBLE); |
3783 | spin_unlock_irq(&x->wait.lock); | 3793 | spin_unlock_irq(&x->wait.lock); |
3784 | timeout = schedule_timeout(timeout); | 3794 | timeout = schedule_timeout(timeout); |
3785 | spin_lock_irq(&x->wait.lock); | 3795 | spin_lock_irq(&x->wait.lock); |
3786 | if (!timeout) { | 3796 | if (!timeout) { |
3787 | __remove_wait_queue(&x->wait, &wait); | 3797 | __remove_wait_queue(&x->wait, &wait); |
3788 | goto out; | 3798 | goto out; |
3789 | } | 3799 | } |
3790 | } while (!x->done); | 3800 | } while (!x->done); |
3791 | __remove_wait_queue(&x->wait, &wait); | 3801 | __remove_wait_queue(&x->wait, &wait); |
3792 | } | 3802 | } |
3793 | x->done--; | 3803 | x->done--; |
3794 | out: | 3804 | out: |
3795 | spin_unlock_irq(&x->wait.lock); | 3805 | spin_unlock_irq(&x->wait.lock); |
3796 | return timeout; | 3806 | return timeout; |
3797 | } | 3807 | } |
3798 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | 3808 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); |
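[editor's note] The two completion variants above differ only in how they report why the wait ended: the plain interruptible version returns -ERESTARTSYS on a signal, while the timeout version returns the remaining jiffies (0 on timeout, negative on a signal). A minimal caller sketch, assuming a hypothetical driver with an IRQ handler my_irq and a completion my_done (names are illustrative, not from this patch):

	#include <linux/completion.h>
	#include <linux/interrupt.h>
	#include <linux/errno.h>
	#include <linux/sched.h>

	static DECLARE_COMPLETION(my_done);

	static irqreturn_t my_irq(int irq, void *dev_id)
	{
		complete(&my_done);		/* bump ->done and wake a waiter */
		return IRQ_HANDLED;
	}

	static int my_wait(void)
	{
		long left;

		/* sleep at most one second; a signal aborts the wait */
		left = wait_for_completion_interruptible_timeout(&my_done, HZ);
		if (left == 0)
			return -ETIMEDOUT;	/* timed out, never completed */
		if (left < 0)
			return left;		/* -ERESTARTSYS: interrupted */
		return 0;			/* completed, 'left' jiffies to spare */
	}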
3799 | 3809 | ||
3800 | static inline void | 3810 | static inline void |
3801 | sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) | 3811 | sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) |
3802 | { | 3812 | { |
3803 | spin_lock_irqsave(&q->lock, *flags); | 3813 | spin_lock_irqsave(&q->lock, *flags); |
3804 | __add_wait_queue(q, wait); | 3814 | __add_wait_queue(q, wait); |
3805 | spin_unlock(&q->lock); | 3815 | spin_unlock(&q->lock); |
3806 | } | 3816 | } |
3807 | 3817 | ||
3808 | static inline void | 3818 | static inline void |
3809 | sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) | 3819 | sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags) |
3810 | { | 3820 | { |
3811 | spin_lock_irq(&q->lock); | 3821 | spin_lock_irq(&q->lock); |
3812 | __remove_wait_queue(q, wait); | 3822 | __remove_wait_queue(q, wait); |
3813 | spin_unlock_irqrestore(&q->lock, *flags); | 3823 | spin_unlock_irqrestore(&q->lock, *flags); |
3814 | } | 3824 | } |
3815 | 3825 | ||
3816 | void __sched interruptible_sleep_on(wait_queue_head_t *q) | 3826 | void __sched interruptible_sleep_on(wait_queue_head_t *q) |
3817 | { | 3827 | { |
3818 | unsigned long flags; | 3828 | unsigned long flags; |
3819 | wait_queue_t wait; | 3829 | wait_queue_t wait; |
3820 | 3830 | ||
3821 | init_waitqueue_entry(&wait, current); | 3831 | init_waitqueue_entry(&wait, current); |
3822 | 3832 | ||
3823 | current->state = TASK_INTERRUPTIBLE; | 3833 | current->state = TASK_INTERRUPTIBLE; |
3824 | 3834 | ||
3825 | sleep_on_head(q, &wait, &flags); | 3835 | sleep_on_head(q, &wait, &flags); |
3826 | schedule(); | 3836 | schedule(); |
3827 | sleep_on_tail(q, &wait, &flags); | 3837 | sleep_on_tail(q, &wait, &flags); |
3828 | } | 3838 | } |
3829 | EXPORT_SYMBOL(interruptible_sleep_on); | 3839 | EXPORT_SYMBOL(interruptible_sleep_on); |
3830 | 3840 | ||
3831 | long __sched | 3841 | long __sched |
3832 | interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) | 3842 | interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) |
3833 | { | 3843 | { |
3834 | unsigned long flags; | 3844 | unsigned long flags; |
3835 | wait_queue_t wait; | 3845 | wait_queue_t wait; |
3836 | 3846 | ||
3837 | init_waitqueue_entry(&wait, current); | 3847 | init_waitqueue_entry(&wait, current); |
3838 | 3848 | ||
3839 | current->state = TASK_INTERRUPTIBLE; | 3849 | current->state = TASK_INTERRUPTIBLE; |
3840 | 3850 | ||
3841 | sleep_on_head(q, &wait, &flags); | 3851 | sleep_on_head(q, &wait, &flags); |
3842 | timeout = schedule_timeout(timeout); | 3852 | timeout = schedule_timeout(timeout); |
3843 | sleep_on_tail(q, &wait, &flags); | 3853 | sleep_on_tail(q, &wait, &flags); |
3844 | 3854 | ||
3845 | return timeout; | 3855 | return timeout; |
3846 | } | 3856 | } |
3847 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); | 3857 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); |
3848 | 3858 | ||
3849 | void __sched sleep_on(wait_queue_head_t *q) | 3859 | void __sched sleep_on(wait_queue_head_t *q) |
3850 | { | 3860 | { |
3851 | unsigned long flags; | 3861 | unsigned long flags; |
3852 | wait_queue_t wait; | 3862 | wait_queue_t wait; |
3853 | 3863 | ||
3854 | init_waitqueue_entry(&wait, current); | 3864 | init_waitqueue_entry(&wait, current); |
3855 | 3865 | ||
3856 | current->state = TASK_UNINTERRUPTIBLE; | 3866 | current->state = TASK_UNINTERRUPTIBLE; |
3857 | 3867 | ||
3858 | sleep_on_head(q, &wait, &flags); | 3868 | sleep_on_head(q, &wait, &flags); |
3859 | schedule(); | 3869 | schedule(); |
3860 | sleep_on_tail(q, &wait, &flags); | 3870 | sleep_on_tail(q, &wait, &flags); |
3861 | } | 3871 | } |
3862 | EXPORT_SYMBOL(sleep_on); | 3872 | EXPORT_SYMBOL(sleep_on); |
3863 | 3873 | ||
3864 | long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) | 3874 | long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) |
3865 | { | 3875 | { |
3866 | unsigned long flags; | 3876 | unsigned long flags; |
3867 | wait_queue_t wait; | 3877 | wait_queue_t wait; |
3868 | 3878 | ||
3869 | init_waitqueue_entry(&wait, current); | 3879 | init_waitqueue_entry(&wait, current); |
3870 | 3880 | ||
3871 | current->state = TASK_UNINTERRUPTIBLE; | 3881 | current->state = TASK_UNINTERRUPTIBLE; |
3872 | 3882 | ||
3873 | sleep_on_head(q, &wait, &flags); | 3883 | sleep_on_head(q, &wait, &flags); |
3874 | timeout = schedule_timeout(timeout); | 3884 | timeout = schedule_timeout(timeout); |
3875 | sleep_on_tail(q, &wait, &flags); | 3885 | sleep_on_tail(q, &wait, &flags); |
3876 | 3886 | ||
3877 | return timeout; | 3887 | return timeout; |
3878 | } | 3888 | } |
3879 | EXPORT_SYMBOL(sleep_on_timeout); | 3889 | EXPORT_SYMBOL(sleep_on_timeout); |
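[editor's note] The sleep_on() family above simply parks the caller on a wait queue until someone calls wake_up() on the same queue (or the timeout fires); there is no condition re-check, so it is only safe when the wakeup cannot race with going to sleep. A rough pairing sketch, with a hypothetical queue name:

	#include <linux/wait.h>
	#include <linux/sched.h>

	static DECLARE_WAIT_QUEUE_HEAD(my_wq);

	static void consumer(void)
	{
		/* returns 0 on timeout, otherwise the jiffies left when woken */
		long left = interruptible_sleep_on_timeout(&my_wq, 5 * HZ);
		(void)left;
	}

	static void producer(void)
	{
		wake_up(&my_wq);		/* releases a sleeper parked above */
	}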
3880 | 3890 | ||
3881 | #ifdef CONFIG_RT_MUTEXES | 3891 | #ifdef CONFIG_RT_MUTEXES |
3882 | 3892 | ||
3883 | /* | 3893 | /* |
3884 | * rt_mutex_setprio - set the current priority of a task | 3894 | * rt_mutex_setprio - set the current priority of a task |
3885 | * @p: task | 3895 | * @p: task |
3886 | * @prio: prio value (kernel-internal form) | 3896 | * @prio: prio value (kernel-internal form) |
3887 | * | 3897 | * |
3888 | * This function changes the 'effective' priority of a task. It does | 3898 | * This function changes the 'effective' priority of a task. It does |
3889 | * not touch ->normal_prio like __setscheduler(). | 3899 | * not touch ->normal_prio like __setscheduler(). |
3890 | * | 3900 | * |
3891 | * Used by the rt_mutex code to implement priority inheritance logic. | 3901 | * Used by the rt_mutex code to implement priority inheritance logic. |
3892 | */ | 3902 | */ |
3893 | void rt_mutex_setprio(struct task_struct *p, int prio) | 3903 | void rt_mutex_setprio(struct task_struct *p, int prio) |
3894 | { | 3904 | { |
3895 | unsigned long flags; | 3905 | unsigned long flags; |
3896 | int oldprio, on_rq; | 3906 | int oldprio, on_rq; |
3897 | struct rq *rq; | 3907 | struct rq *rq; |
3898 | u64 now; | 3908 | u64 now; |
3899 | 3909 | ||
3900 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 3910 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
3901 | 3911 | ||
3902 | rq = task_rq_lock(p, &flags); | 3912 | rq = task_rq_lock(p, &flags); |
3903 | now = rq_clock(rq); | 3913 | now = rq_clock(rq); |
3904 | 3914 | ||
3905 | oldprio = p->prio; | 3915 | oldprio = p->prio; |
3906 | on_rq = p->se.on_rq; | 3916 | on_rq = p->se.on_rq; |
3907 | if (on_rq) | 3917 | if (on_rq) |
3908 | dequeue_task(rq, p, 0, now); | 3918 | dequeue_task(rq, p, 0, now); |
3909 | 3919 | ||
3910 | if (rt_prio(prio)) | 3920 | if (rt_prio(prio)) |
3911 | p->sched_class = &rt_sched_class; | 3921 | p->sched_class = &rt_sched_class; |
3912 | else | 3922 | else |
3913 | p->sched_class = &fair_sched_class; | 3923 | p->sched_class = &fair_sched_class; |
3914 | 3924 | ||
3915 | p->prio = prio; | 3925 | p->prio = prio; |
3916 | 3926 | ||
3917 | if (on_rq) { | 3927 | if (on_rq) { |
3918 | enqueue_task(rq, p, 0, now); | 3928 | enqueue_task(rq, p, 0, now); |
3919 | /* | 3929 | /* |
3920 | * Reschedule if we are currently running on this runqueue and | 3930 | * Reschedule if we are currently running on this runqueue and |
3921 | * our priority decreased, or if we are not currently running on | 3931 | * our priority decreased, or if we are not currently running on |
3922 | * this runqueue and our priority is higher than the current's | 3932 | * this runqueue and our priority is higher than the current's |
3923 | */ | 3933 | */ |
3924 | if (task_running(rq, p)) { | 3934 | if (task_running(rq, p)) { |
3925 | if (p->prio > oldprio) | 3935 | if (p->prio > oldprio) |
3926 | resched_task(rq->curr); | 3936 | resched_task(rq->curr); |
3927 | } else { | 3937 | } else { |
3928 | check_preempt_curr(rq, p); | 3938 | check_preempt_curr(rq, p); |
3929 | } | 3939 | } |
3930 | } | 3940 | } |
3931 | task_rq_unlock(rq, &flags); | 3941 | task_rq_unlock(rq, &flags); |
3932 | } | 3942 | } |
3933 | 3943 | ||
3934 | #endif | 3944 | #endif |
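[editor's note] A worked example of the boost rt_mutex_setprio() performs, using the kernel-internal prio scale (lower is better, RT priorities occupy 0..MAX_RT_PRIO-1); the numbers illustrate the usual mapping and are not taken from this hunk: if a nice-0 task (prio 120) owns an rt_mutex that an RT waiter of rt_priority 10 (prio MAX_RT_PRIO-1-10 = 89) blocks on, the PI code calls rt_mutex_setprio(owner, 89). The owner is dequeued, switched to &rt_sched_class, requeued at prio 89, and set back to 120 by another rt_mutex_setprio() call once the lock is released.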
3935 | 3945 | ||
3936 | void set_user_nice(struct task_struct *p, long nice) | 3946 | void set_user_nice(struct task_struct *p, long nice) |
3937 | { | 3947 | { |
3938 | int old_prio, delta, on_rq; | 3948 | int old_prio, delta, on_rq; |
3939 | unsigned long flags; | 3949 | unsigned long flags; |
3940 | struct rq *rq; | 3950 | struct rq *rq; |
3941 | u64 now; | 3951 | u64 now; |
3942 | 3952 | ||
3943 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) | 3953 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) |
3944 | return; | 3954 | return; |
3945 | /* | 3955 | /* |
3946 | * We have to be careful, if called from sys_setpriority(), | 3956 | * We have to be careful, if called from sys_setpriority(), |
3947 | * the task might be in the middle of scheduling on another CPU. | 3957 | * the task might be in the middle of scheduling on another CPU. |
3948 | */ | 3958 | */ |
3949 | rq = task_rq_lock(p, &flags); | 3959 | rq = task_rq_lock(p, &flags); |
3950 | now = rq_clock(rq); | 3960 | now = rq_clock(rq); |
3951 | /* | 3961 | /* |
3952 | * The RT priorities are set via sched_setscheduler(), but we still | 3962 | * The RT priorities are set via sched_setscheduler(), but we still |
3953 | * allow the 'normal' nice value to be set - but as expected | 3963 | * allow the 'normal' nice value to be set - but as expected |
3954 | * it wont have any effect on scheduling until the task is | 3964 | * it wont have any effect on scheduling until the task is |
3955 | * SCHED_FIFO/SCHED_RR: | 3965 | * SCHED_FIFO/SCHED_RR: |
3956 | */ | 3966 | */ |
3957 | if (task_has_rt_policy(p)) { | 3967 | if (task_has_rt_policy(p)) { |
3958 | p->static_prio = NICE_TO_PRIO(nice); | 3968 | p->static_prio = NICE_TO_PRIO(nice); |
3959 | goto out_unlock; | 3969 | goto out_unlock; |
3960 | } | 3970 | } |
3961 | on_rq = p->se.on_rq; | 3971 | on_rq = p->se.on_rq; |
3962 | if (on_rq) { | 3972 | if (on_rq) { |
3963 | dequeue_task(rq, p, 0, now); | 3973 | dequeue_task(rq, p, 0, now); |
3964 | dec_load(rq, p, now); | 3974 | dec_load(rq, p, now); |
3965 | } | 3975 | } |
3966 | 3976 | ||
3967 | p->static_prio = NICE_TO_PRIO(nice); | 3977 | p->static_prio = NICE_TO_PRIO(nice); |
3968 | set_load_weight(p); | 3978 | set_load_weight(p); |
3969 | old_prio = p->prio; | 3979 | old_prio = p->prio; |
3970 | p->prio = effective_prio(p); | 3980 | p->prio = effective_prio(p); |
3971 | delta = p->prio - old_prio; | 3981 | delta = p->prio - old_prio; |
3972 | 3982 | ||
3973 | if (on_rq) { | 3983 | if (on_rq) { |
3974 | enqueue_task(rq, p, 0, now); | 3984 | enqueue_task(rq, p, 0, now); |
3975 | inc_load(rq, p, now); | 3985 | inc_load(rq, p, now); |
3976 | /* | 3986 | /* |
3977 | * If the task increased its priority or is running and | 3987 | * If the task increased its priority or is running and |
3978 | * lowered its priority, then reschedule its CPU: | 3988 | * lowered its priority, then reschedule its CPU: |
3979 | */ | 3989 | */ |
3980 | if (delta < 0 || (delta > 0 && task_running(rq, p))) | 3990 | if (delta < 0 || (delta > 0 && task_running(rq, p))) |
3981 | resched_task(rq->curr); | 3991 | resched_task(rq->curr); |
3982 | } | 3992 | } |
3983 | out_unlock: | 3993 | out_unlock: |
3984 | task_rq_unlock(rq, &flags); | 3994 | task_rq_unlock(rq, &flags); |
3985 | } | 3995 | } |
3986 | EXPORT_SYMBOL(set_user_nice); | 3996 | EXPORT_SYMBOL(set_user_nice); |
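[editor's note] For reference, the NICE_TO_PRIO()/PRIO_TO_NICE() conversions used by set_user_nice() map the user-visible nice range onto static_prio roughly as follows (macro shape as in the scheduler headers of this era, quoted from memory):

	NICE_TO_PRIO(nice) == MAX_RT_PRIO + nice + 20	/* MAX_RT_PRIO == 100 */

	nice -20  ->  static_prio 100	(highest weight)
	nice   0  ->  static_prio 120	(default)
	nice  19  ->  static_prio 139	(lowest weight)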
3987 | 3997 | ||
3988 | /* | 3998 | /* |
3989 | * can_nice - check if a task can reduce its nice value | 3999 | * can_nice - check if a task can reduce its nice value |
3990 | * @p: task | 4000 | * @p: task |
3991 | * @nice: nice value | 4001 | * @nice: nice value |
3992 | */ | 4002 | */ |
3993 | int can_nice(const struct task_struct *p, const int nice) | 4003 | int can_nice(const struct task_struct *p, const int nice) |
3994 | { | 4004 | { |
3995 | /* convert nice value [19,-20] to rlimit style value [1,40] */ | 4005 | /* convert nice value [19,-20] to rlimit style value [1,40] */ |
3996 | int nice_rlim = 20 - nice; | 4006 | int nice_rlim = 20 - nice; |
3997 | 4007 | ||
3998 | return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || | 4008 | return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || |
3999 | capable(CAP_SYS_NICE)); | 4009 | capable(CAP_SYS_NICE)); |
4000 | } | 4010 | } |
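[editor's note] Since can_nice() converts the requested nice value to the rlimit scale with nice_rlim = 20 - nice, a couple of worked cases: asking for nice -10 needs RLIMIT_NICE >= 30, nice 0 needs >= 20, and nice 19 needs only >= 1; CAP_SYS_NICE bypasses the limit in every case.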
4001 | 4011 | ||
4002 | #ifdef __ARCH_WANT_SYS_NICE | 4012 | #ifdef __ARCH_WANT_SYS_NICE |
4003 | 4013 | ||
4004 | /* | 4014 | /* |
4005 | * sys_nice - change the priority of the current process. | 4015 | * sys_nice - change the priority of the current process. |
4006 | * @increment: priority increment | 4016 | * @increment: priority increment |
4007 | * | 4017 | * |
4008 | * sys_setpriority is a more generic, but much slower function that | 4018 | * sys_setpriority is a more generic, but much slower function that |
4009 | * does similar things. | 4019 | * does similar things. |
4010 | */ | 4020 | */ |
4011 | asmlinkage long sys_nice(int increment) | 4021 | asmlinkage long sys_nice(int increment) |
4012 | { | 4022 | { |
4013 | long nice, retval; | 4023 | long nice, retval; |
4014 | 4024 | ||
4015 | /* | 4025 | /* |
4016 | * Setpriority might change our priority at the same moment. | 4026 | * Setpriority might change our priority at the same moment. |
4017 | * We don't have to worry. Conceptually one call occurs first | 4027 | * We don't have to worry. Conceptually one call occurs first |
4018 | * and we have a single winner. | 4028 | * and we have a single winner. |
4019 | */ | 4029 | */ |
4020 | if (increment < -40) | 4030 | if (increment < -40) |
4021 | increment = -40; | 4031 | increment = -40; |
4022 | if (increment > 40) | 4032 | if (increment > 40) |
4023 | increment = 40; | 4033 | increment = 40; |
4024 | 4034 | ||
4025 | nice = PRIO_TO_NICE(current->static_prio) + increment; | 4035 | nice = PRIO_TO_NICE(current->static_prio) + increment; |
4026 | if (nice < -20) | 4036 | if (nice < -20) |
4027 | nice = -20; | 4037 | nice = -20; |
4028 | if (nice > 19) | 4038 | if (nice > 19) |
4029 | nice = 19; | 4039 | nice = 19; |
4030 | 4040 | ||
4031 | if (increment < 0 && !can_nice(current, nice)) | 4041 | if (increment < 0 && !can_nice(current, nice)) |
4032 | return -EPERM; | 4042 | return -EPERM; |
4033 | 4043 | ||
4034 | retval = security_task_setnice(current, nice); | 4044 | retval = security_task_setnice(current, nice); |
4035 | if (retval) | 4045 | if (retval) |
4036 | return retval; | 4046 | return retval; |
4037 | 4047 | ||
4038 | set_user_nice(current, nice); | 4048 | set_user_nice(current, nice); |
4039 | return 0; | 4049 | return 0; |
4040 | } | 4050 | } |
4041 | 4051 | ||
4042 | #endif | 4052 | #endif |
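[editor's note] From userspace this path is typically reached through the glibc nice() wrapper; a minimal caller sketch (glibc returns the new nice value, so errno is needed to tell a legitimate -1 from an error):

	#include <unistd.h>
	#include <stdio.h>
	#include <errno.h>

	int main(void)
	{
		int newval;

		errno = 0;
		newval = nice(5);		/* lower our priority by 5 */
		if (newval == -1 && errno != 0)
			perror("nice");
		else
			printf("new nice value: %d\n", newval);
		return 0;
	}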
4043 | 4053 | ||
4044 | /** | 4054 | /** |
4045 | * task_prio - return the priority value of a given task. | 4055 | * task_prio - return the priority value of a given task. |
4046 | * @p: the task in question. | 4056 | * @p: the task in question. |
4047 | * | 4057 | * |
4048 | * This is the priority value as seen by users in /proc. | 4058 | * This is the priority value as seen by users in /proc. |
4049 | * RT tasks are offset by -200. Normal tasks are centered | 4059 | * RT tasks are offset by -200. Normal tasks are centered |
4050 | * around 0, value goes from -16 to +15. | 4060 | * around 0, value goes from -16 to +15. |
4051 | */ | 4061 | */ |
4052 | int task_prio(const struct task_struct *p) | 4062 | int task_prio(const struct task_struct *p) |
4053 | { | 4063 | { |
4054 | return p->prio - MAX_RT_PRIO; | 4064 | return p->prio - MAX_RT_PRIO; |
4055 | } | 4065 | } |
4056 | 4066 | ||
4057 | /** | 4067 | /** |
4058 | * task_nice - return the nice value of a given task. | 4068 | * task_nice - return the nice value of a given task. |
4059 | * @p: the task in question. | 4069 | * @p: the task in question. |
4060 | */ | 4070 | */ |
4061 | int task_nice(const struct task_struct *p) | 4071 | int task_nice(const struct task_struct *p) |
4062 | { | 4072 | { |
4063 | return TASK_NICE(p); | 4073 | return TASK_NICE(p); |
4064 | } | 4074 | } |
4065 | EXPORT_SYMBOL_GPL(task_nice); | 4075 | EXPORT_SYMBOL_GPL(task_nice); |
4066 | 4076 | ||
4067 | /** | 4077 | /** |
4068 | * idle_cpu - is a given cpu idle currently? | 4078 | * idle_cpu - is a given cpu idle currently? |
4069 | * @cpu: the processor in question. | 4079 | * @cpu: the processor in question. |
4070 | */ | 4080 | */ |
4071 | int idle_cpu(int cpu) | 4081 | int idle_cpu(int cpu) |
4072 | { | 4082 | { |
4073 | return cpu_curr(cpu) == cpu_rq(cpu)->idle; | 4083 | return cpu_curr(cpu) == cpu_rq(cpu)->idle; |
4074 | } | 4084 | } |
4075 | 4085 | ||
4076 | /** | 4086 | /** |
4077 | * idle_task - return the idle task for a given cpu. | 4087 | * idle_task - return the idle task for a given cpu. |
4078 | * @cpu: the processor in question. | 4088 | * @cpu: the processor in question. |
4079 | */ | 4089 | */ |
4080 | struct task_struct *idle_task(int cpu) | 4090 | struct task_struct *idle_task(int cpu) |
4081 | { | 4091 | { |
4082 | return cpu_rq(cpu)->idle; | 4092 | return cpu_rq(cpu)->idle; |
4083 | } | 4093 | } |
4084 | 4094 | ||
4085 | /** | 4095 | /** |
4086 | * find_process_by_pid - find a process with a matching PID value. | 4096 | * find_process_by_pid - find a process with a matching PID value. |
4087 | * @pid: the pid in question. | 4097 | * @pid: the pid in question. |
4088 | */ | 4098 | */ |
4089 | static inline struct task_struct *find_process_by_pid(pid_t pid) | 4099 | static inline struct task_struct *find_process_by_pid(pid_t pid) |
4090 | { | 4100 | { |
4091 | return pid ? find_task_by_pid(pid) : current; | 4101 | return pid ? find_task_by_pid(pid) : current; |
4092 | } | 4102 | } |
4093 | 4103 | ||
4094 | /* Actually do priority change: must hold rq lock. */ | 4104 | /* Actually do priority change: must hold rq lock. */ |
4095 | static void | 4105 | static void |
4096 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) | 4106 | __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) |
4097 | { | 4107 | { |
4098 | BUG_ON(p->se.on_rq); | 4108 | BUG_ON(p->se.on_rq); |
4099 | 4109 | ||
4100 | p->policy = policy; | 4110 | p->policy = policy; |
4101 | switch (p->policy) { | 4111 | switch (p->policy) { |
4102 | case SCHED_NORMAL: | 4112 | case SCHED_NORMAL: |
4103 | case SCHED_BATCH: | 4113 | case SCHED_BATCH: |
4104 | case SCHED_IDLE: | 4114 | case SCHED_IDLE: |
4105 | p->sched_class = &fair_sched_class; | 4115 | p->sched_class = &fair_sched_class; |
4106 | break; | 4116 | break; |
4107 | case SCHED_FIFO: | 4117 | case SCHED_FIFO: |
4108 | case SCHED_RR: | 4118 | case SCHED_RR: |
4109 | p->sched_class = &rt_sched_class; | 4119 | p->sched_class = &rt_sched_class; |
4110 | break; | 4120 | break; |
4111 | } | 4121 | } |
4112 | 4122 | ||
4113 | p->rt_priority = prio; | 4123 | p->rt_priority = prio; |
4114 | p->normal_prio = normal_prio(p); | 4124 | p->normal_prio = normal_prio(p); |
4115 | /* we are holding p->pi_lock already */ | 4125 | /* we are holding p->pi_lock already */ |
4116 | p->prio = rt_mutex_getprio(p); | 4126 | p->prio = rt_mutex_getprio(p); |
4117 | set_load_weight(p); | 4127 | set_load_weight(p); |
4118 | } | 4128 | } |
4119 | 4129 | ||
4120 | /** | 4130 | /** |
4121 | * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. | 4131 | * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. |
4122 | * @p: the task in question. | 4132 | * @p: the task in question. |
4123 | * @policy: new policy. | 4133 | * @policy: new policy. |
4124 | * @param: structure containing the new RT priority. | 4134 | * @param: structure containing the new RT priority. |
4125 | * | 4135 | * |
4126 | * NOTE that the task may be already dead. | 4136 | * NOTE that the task may be already dead. |
4127 | */ | 4137 | */ |
4128 | int sched_setscheduler(struct task_struct *p, int policy, | 4138 | int sched_setscheduler(struct task_struct *p, int policy, |
4129 | struct sched_param *param) | 4139 | struct sched_param *param) |
4130 | { | 4140 | { |
4131 | int retval, oldprio, oldpolicy = -1, on_rq; | 4141 | int retval, oldprio, oldpolicy = -1, on_rq; |
4132 | unsigned long flags; | 4142 | unsigned long flags; |
4133 | struct rq *rq; | 4143 | struct rq *rq; |
4134 | 4144 | ||
4135 | /* may grab non-irq protected spin_locks */ | 4145 | /* may grab non-irq protected spin_locks */ |
4136 | BUG_ON(in_interrupt()); | 4146 | BUG_ON(in_interrupt()); |
4137 | recheck: | 4147 | recheck: |
4138 | /* double check policy once rq lock held */ | 4148 | /* double check policy once rq lock held */ |
4139 | if (policy < 0) | 4149 | if (policy < 0) |
4140 | policy = oldpolicy = p->policy; | 4150 | policy = oldpolicy = p->policy; |
4141 | else if (policy != SCHED_FIFO && policy != SCHED_RR && | 4151 | else if (policy != SCHED_FIFO && policy != SCHED_RR && |
4142 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | 4152 | policy != SCHED_NORMAL && policy != SCHED_BATCH && |
4143 | policy != SCHED_IDLE) | 4153 | policy != SCHED_IDLE) |
4144 | return -EINVAL; | 4154 | return -EINVAL; |
4145 | /* | 4155 | /* |
4146 | * Valid priorities for SCHED_FIFO and SCHED_RR are | 4156 | * Valid priorities for SCHED_FIFO and SCHED_RR are |
4147 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, | 4157 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, |
4148 | * SCHED_BATCH and SCHED_IDLE is 0. | 4158 | * SCHED_BATCH and SCHED_IDLE is 0. |
4149 | */ | 4159 | */ |
4150 | if (param->sched_priority < 0 || | 4160 | if (param->sched_priority < 0 || |
4151 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || | 4161 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || |
4152 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) | 4162 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) |
4153 | return -EINVAL; | 4163 | return -EINVAL; |
4154 | if (rt_policy(policy) != (param->sched_priority != 0)) | 4164 | if (rt_policy(policy) != (param->sched_priority != 0)) |
4155 | return -EINVAL; | 4165 | return -EINVAL; |
4156 | 4166 | ||
4157 | /* | 4167 | /* |
4158 | * Allow unprivileged RT tasks to decrease priority: | 4168 | * Allow unprivileged RT tasks to decrease priority: |
4159 | */ | 4169 | */ |
4160 | if (!capable(CAP_SYS_NICE)) { | 4170 | if (!capable(CAP_SYS_NICE)) { |
4161 | if (rt_policy(policy)) { | 4171 | if (rt_policy(policy)) { |
4162 | unsigned long rlim_rtprio; | 4172 | unsigned long rlim_rtprio; |
4163 | 4173 | ||
4164 | if (!lock_task_sighand(p, &flags)) | 4174 | if (!lock_task_sighand(p, &flags)) |
4165 | return -ESRCH; | 4175 | return -ESRCH; |
4166 | rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; | 4176 | rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; |
4167 | unlock_task_sighand(p, &flags); | 4177 | unlock_task_sighand(p, &flags); |
4168 | 4178 | ||
4169 | /* can't set/change the rt policy */ | 4179 | /* can't set/change the rt policy */ |
4170 | if (policy != p->policy && !rlim_rtprio) | 4180 | if (policy != p->policy && !rlim_rtprio) |
4171 | return -EPERM; | 4181 | return -EPERM; |
4172 | 4182 | ||
4173 | /* can't increase priority */ | 4183 | /* can't increase priority */ |
4174 | if (param->sched_priority > p->rt_priority && | 4184 | if (param->sched_priority > p->rt_priority && |
4175 | param->sched_priority > rlim_rtprio) | 4185 | param->sched_priority > rlim_rtprio) |
4176 | return -EPERM; | 4186 | return -EPERM; |
4177 | } | 4187 | } |
4178 | /* | 4188 | /* |
4179 | * Like positive nice levels, dont allow tasks to | 4189 | * Like positive nice levels, dont allow tasks to |
4180 | * move out of SCHED_IDLE either: | 4190 | * move out of SCHED_IDLE either: |
4181 | */ | 4191 | */ |
4182 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) | 4192 | if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) |
4183 | return -EPERM; | 4193 | return -EPERM; |
4184 | 4194 | ||
4185 | /* can't change other user's priorities */ | 4195 | /* can't change other user's priorities */ |
4186 | if ((current->euid != p->euid) && | 4196 | if ((current->euid != p->euid) && |
4187 | (current->euid != p->uid)) | 4197 | (current->euid != p->uid)) |
4188 | return -EPERM; | 4198 | return -EPERM; |
4189 | } | 4199 | } |
4190 | 4200 | ||
4191 | retval = security_task_setscheduler(p, policy, param); | 4201 | retval = security_task_setscheduler(p, policy, param); |
4192 | if (retval) | 4202 | if (retval) |
4193 | return retval; | 4203 | return retval; |
4194 | /* | 4204 | /* |
4195 | * make sure no PI-waiters arrive (or leave) while we are | 4205 | * make sure no PI-waiters arrive (or leave) while we are |
4196 | * changing the priority of the task: | 4206 | * changing the priority of the task: |
4197 | */ | 4207 | */ |
4198 | spin_lock_irqsave(&p->pi_lock, flags); | 4208 | spin_lock_irqsave(&p->pi_lock, flags); |
4199 | /* | 4209 | /* |
4200 | * To be able to change p->policy safely, the apropriate | 4210 | * To be able to change p->policy safely, the apropriate |
4201 | * runqueue lock must be held. | 4211 | * runqueue lock must be held. |
4202 | */ | 4212 | */ |
4203 | rq = __task_rq_lock(p); | 4213 | rq = __task_rq_lock(p); |
4204 | /* recheck policy now with rq lock held */ | 4214 | /* recheck policy now with rq lock held */ |
4205 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 4215 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
4206 | policy = oldpolicy = -1; | 4216 | policy = oldpolicy = -1; |
4207 | __task_rq_unlock(rq); | 4217 | __task_rq_unlock(rq); |
4208 | spin_unlock_irqrestore(&p->pi_lock, flags); | 4218 | spin_unlock_irqrestore(&p->pi_lock, flags); |
4209 | goto recheck; | 4219 | goto recheck; |
4210 | } | 4220 | } |
4211 | on_rq = p->se.on_rq; | 4221 | on_rq = p->se.on_rq; |
4212 | if (on_rq) | 4222 | if (on_rq) |
4213 | deactivate_task(rq, p, 0); | 4223 | deactivate_task(rq, p, 0); |
4214 | oldprio = p->prio; | 4224 | oldprio = p->prio; |
4215 | __setscheduler(rq, p, policy, param->sched_priority); | 4225 | __setscheduler(rq, p, policy, param->sched_priority); |
4216 | if (on_rq) { | 4226 | if (on_rq) { |
4217 | activate_task(rq, p, 0); | 4227 | activate_task(rq, p, 0); |
4218 | /* | 4228 | /* |
4219 | * Reschedule if we are currently running on this runqueue and | 4229 | * Reschedule if we are currently running on this runqueue and |
4220 | * our priority decreased, or if we are not currently running on | 4230 | * our priority decreased, or if we are not currently running on |
4221 | * this runqueue and our priority is higher than the current's | 4231 | * this runqueue and our priority is higher than the current's |
4222 | */ | 4232 | */ |
4223 | if (task_running(rq, p)) { | 4233 | if (task_running(rq, p)) { |
4224 | if (p->prio > oldprio) | 4234 | if (p->prio > oldprio) |
4225 | resched_task(rq->curr); | 4235 | resched_task(rq->curr); |
4226 | } else { | 4236 | } else { |
4227 | check_preempt_curr(rq, p); | 4237 | check_preempt_curr(rq, p); |
4228 | } | 4238 | } |
4229 | } | 4239 | } |
4230 | __task_rq_unlock(rq); | 4240 | __task_rq_unlock(rq); |
4231 | spin_unlock_irqrestore(&p->pi_lock, flags); | 4241 | spin_unlock_irqrestore(&p->pi_lock, flags); |
4232 | 4242 | ||
4233 | rt_mutex_adjust_pi(p); | 4243 | rt_mutex_adjust_pi(p); |
4234 | 4244 | ||
4235 | return 0; | 4245 | return 0; |
4236 | } | 4246 | } |
4237 | EXPORT_SYMBOL_GPL(sched_setscheduler); | 4247 | EXPORT_SYMBOL_GPL(sched_setscheduler); |
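[editor's note] In-kernel callers (kthreads such as the migration and watchdog threads) use sched_setscheduler() directly; a minimal sketch with hypothetical names:

	#include <linux/sched.h>
	#include <linux/kernel.h>

	static void make_fifo(struct task_struct *p)
	{
		struct sched_param sp = { .sched_priority = 50 };

		if (sched_setscheduler(p, SCHED_FIFO, &sp) < 0)
			printk(KERN_WARNING "%s: could not switch to SCHED_FIFO\n",
			       p->comm);
	}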
4238 | 4248 | ||
4239 | static int | 4249 | static int |
4240 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | 4250 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) |
4241 | { | 4251 | { |
4242 | struct sched_param lparam; | 4252 | struct sched_param lparam; |
4243 | struct task_struct *p; | 4253 | struct task_struct *p; |
4244 | int retval; | 4254 | int retval; |
4245 | 4255 | ||
4246 | if (!param || pid < 0) | 4256 | if (!param || pid < 0) |
4247 | return -EINVAL; | 4257 | return -EINVAL; |
4248 | if (copy_from_user(&lparam, param, sizeof(struct sched_param))) | 4258 | if (copy_from_user(&lparam, param, sizeof(struct sched_param))) |
4249 | return -EFAULT; | 4259 | return -EFAULT; |
4250 | 4260 | ||
4251 | rcu_read_lock(); | 4261 | rcu_read_lock(); |
4252 | retval = -ESRCH; | 4262 | retval = -ESRCH; |
4253 | p = find_process_by_pid(pid); | 4263 | p = find_process_by_pid(pid); |
4254 | if (p != NULL) | 4264 | if (p != NULL) |
4255 | retval = sched_setscheduler(p, policy, &lparam); | 4265 | retval = sched_setscheduler(p, policy, &lparam); |
4256 | rcu_read_unlock(); | 4266 | rcu_read_unlock(); |
4257 | 4267 | ||
4258 | return retval; | 4268 | return retval; |
4259 | } | 4269 | } |
4260 | 4270 | ||
4261 | /** | 4271 | /** |
4262 | * sys_sched_setscheduler - set/change the scheduler policy and RT priority | 4272 | * sys_sched_setscheduler - set/change the scheduler policy and RT priority |
4263 | * @pid: the pid in question. | 4273 | * @pid: the pid in question. |
4264 | * @policy: new policy. | 4274 | * @policy: new policy. |
4265 | * @param: structure containing the new RT priority. | 4275 | * @param: structure containing the new RT priority. |
4266 | */ | 4276 | */ |
4267 | asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, | 4277 | asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, |
4268 | struct sched_param __user *param) | 4278 | struct sched_param __user *param) |
4269 | { | 4279 | { |
4270 | /* negative values for policy are not valid */ | 4280 | /* negative values for policy are not valid */ |
4271 | if (policy < 0) | 4281 | if (policy < 0) |
4272 | return -EINVAL; | 4282 | return -EINVAL; |
4273 | 4283 | ||
4274 | return do_sched_setscheduler(pid, policy, param); | 4284 | return do_sched_setscheduler(pid, policy, param); |
4275 | } | 4285 | } |
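[editor's note] The userspace counterpart goes through the POSIX wrapper of the same name (pid 0 means the calling thread; RT policies normally require CAP_SYS_NICE or a suitable RLIMIT_RTPRIO, as enforced above):

	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		struct sched_param sp = { .sched_priority = 10 };

		if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
			perror("sched_setscheduler");
			return 1;
		}
		return 0;
	}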
4276 | 4286 | ||
4277 | /** | 4287 | /** |
4278 | * sys_sched_setparam - set/change the RT priority of a thread | 4288 | * sys_sched_setparam - set/change the RT priority of a thread |
4279 | * @pid: the pid in question. | 4289 | * @pid: the pid in question. |
4280 | * @param: structure containing the new RT priority. | 4290 | * @param: structure containing the new RT priority. |
4281 | */ | 4291 | */ |
4282 | asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) | 4292 | asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) |
4283 | { | 4293 | { |
4284 | return do_sched_setscheduler(pid, -1, param); | 4294 | return do_sched_setscheduler(pid, -1, param); |
4285 | } | 4295 | } |
4286 | 4296 | ||
4287 | /** | 4297 | /** |
4288 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread | 4298 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread |
4289 | * @pid: the pid in question. | 4299 | * @pid: the pid in question. |
4290 | */ | 4300 | */ |
4291 | asmlinkage long sys_sched_getscheduler(pid_t pid) | 4301 | asmlinkage long sys_sched_getscheduler(pid_t pid) |
4292 | { | 4302 | { |
4293 | struct task_struct *p; | 4303 | struct task_struct *p; |
4294 | int retval = -EINVAL; | 4304 | int retval = -EINVAL; |
4295 | 4305 | ||
4296 | if (pid < 0) | 4306 | if (pid < 0) |
4297 | goto out_nounlock; | 4307 | goto out_nounlock; |
4298 | 4308 | ||
4299 | retval = -ESRCH; | 4309 | retval = -ESRCH; |
4300 | read_lock(&tasklist_lock); | 4310 | read_lock(&tasklist_lock); |
4301 | p = find_process_by_pid(pid); | 4311 | p = find_process_by_pid(pid); |
4302 | if (p) { | 4312 | if (p) { |
4303 | retval = security_task_getscheduler(p); | 4313 | retval = security_task_getscheduler(p); |
4304 | if (!retval) | 4314 | if (!retval) |
4305 | retval = p->policy; | 4315 | retval = p->policy; |
4306 | } | 4316 | } |
4307 | read_unlock(&tasklist_lock); | 4317 | read_unlock(&tasklist_lock); |
4308 | 4318 | ||
4309 | out_nounlock: | 4319 | out_nounlock: |
4310 | return retval; | 4320 | return retval; |
4311 | } | 4321 | } |
4312 | 4322 | ||
4313 | /** | 4323 | /** |
4314 | * sys_sched_getscheduler - get the RT priority of a thread | 4324 | * sys_sched_getscheduler - get the RT priority of a thread |
4315 | * @pid: the pid in question. | 4325 | * @pid: the pid in question. |
4316 | * @param: structure containing the RT priority. | 4326 | * @param: structure containing the RT priority. |
4317 | */ | 4327 | */ |
4318 | asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) | 4328 | asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) |
4319 | { | 4329 | { |
4320 | struct sched_param lp; | 4330 | struct sched_param lp; |
4321 | struct task_struct *p; | 4331 | struct task_struct *p; |
4322 | int retval = -EINVAL; | 4332 | int retval = -EINVAL; |
4323 | 4333 | ||
4324 | if (!param || pid < 0) | 4334 | if (!param || pid < 0) |
4325 | goto out_nounlock; | 4335 | goto out_nounlock; |
4326 | 4336 | ||
4327 | read_lock(&tasklist_lock); | 4337 | read_lock(&tasklist_lock); |
4328 | p = find_process_by_pid(pid); | 4338 | p = find_process_by_pid(pid); |
4329 | retval = -ESRCH; | 4339 | retval = -ESRCH; |
4330 | if (!p) | 4340 | if (!p) |
4331 | goto out_unlock; | 4341 | goto out_unlock; |
4332 | 4342 | ||
4333 | retval = security_task_getscheduler(p); | 4343 | retval = security_task_getscheduler(p); |
4334 | if (retval) | 4344 | if (retval) |
4335 | goto out_unlock; | 4345 | goto out_unlock; |
4336 | 4346 | ||
4337 | lp.sched_priority = p->rt_priority; | 4347 | lp.sched_priority = p->rt_priority; |
4338 | read_unlock(&tasklist_lock); | 4348 | read_unlock(&tasklist_lock); |
4339 | 4349 | ||
4340 | /* | 4350 | /* |
4341 | * This one might sleep, we cannot do it with a spinlock held ... | 4351 | * This one might sleep, we cannot do it with a spinlock held ... |
4342 | */ | 4352 | */ |
4343 | retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; | 4353 | retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; |
4344 | 4354 | ||
4345 | out_nounlock: | 4355 | out_nounlock: |
4346 | return retval; | 4356 | return retval; |
4347 | 4357 | ||
4348 | out_unlock: | 4358 | out_unlock: |
4349 | read_unlock(&tasklist_lock); | 4359 | read_unlock(&tasklist_lock); |
4350 | return retval; | 4360 | return retval; |
4351 | } | 4361 | } |
4352 | 4362 | ||
4353 | long sched_setaffinity(pid_t pid, cpumask_t new_mask) | 4363 | long sched_setaffinity(pid_t pid, cpumask_t new_mask) |
4354 | { | 4364 | { |
4355 | cpumask_t cpus_allowed; | 4365 | cpumask_t cpus_allowed; |
4356 | struct task_struct *p; | 4366 | struct task_struct *p; |
4357 | int retval; | 4367 | int retval; |
4358 | 4368 | ||
4359 | mutex_lock(&sched_hotcpu_mutex); | 4369 | mutex_lock(&sched_hotcpu_mutex); |
4360 | read_lock(&tasklist_lock); | 4370 | read_lock(&tasklist_lock); |
4361 | 4371 | ||
4362 | p = find_process_by_pid(pid); | 4372 | p = find_process_by_pid(pid); |
4363 | if (!p) { | 4373 | if (!p) { |
4364 | read_unlock(&tasklist_lock); | 4374 | read_unlock(&tasklist_lock); |
4365 | mutex_unlock(&sched_hotcpu_mutex); | 4375 | mutex_unlock(&sched_hotcpu_mutex); |
4366 | return -ESRCH; | 4376 | return -ESRCH; |
4367 | } | 4377 | } |
4368 | 4378 | ||
4369 | /* | 4379 | /* |
4370 | * It is not safe to call set_cpus_allowed with the | 4380 | * It is not safe to call set_cpus_allowed with the |
4371 | * tasklist_lock held. We will bump the task_struct's | 4381 | * tasklist_lock held. We will bump the task_struct's |
4372 | * usage count and then drop tasklist_lock. | 4382 | * usage count and then drop tasklist_lock. |
4373 | */ | 4383 | */ |
4374 | get_task_struct(p); | 4384 | get_task_struct(p); |
4375 | read_unlock(&tasklist_lock); | 4385 | read_unlock(&tasklist_lock); |
4376 | 4386 | ||
4377 | retval = -EPERM; | 4387 | retval = -EPERM; |
4378 | if ((current->euid != p->euid) && (current->euid != p->uid) && | 4388 | if ((current->euid != p->euid) && (current->euid != p->uid) && |
4379 | !capable(CAP_SYS_NICE)) | 4389 | !capable(CAP_SYS_NICE)) |
4380 | goto out_unlock; | 4390 | goto out_unlock; |
4381 | 4391 | ||
4382 | retval = security_task_setscheduler(p, 0, NULL); | 4392 | retval = security_task_setscheduler(p, 0, NULL); |
4383 | if (retval) | 4393 | if (retval) |
4384 | goto out_unlock; | 4394 | goto out_unlock; |
4385 | 4395 | ||
4386 | cpus_allowed = cpuset_cpus_allowed(p); | 4396 | cpus_allowed = cpuset_cpus_allowed(p); |
4387 | cpus_and(new_mask, new_mask, cpus_allowed); | 4397 | cpus_and(new_mask, new_mask, cpus_allowed); |
4388 | retval = set_cpus_allowed(p, new_mask); | 4398 | retval = set_cpus_allowed(p, new_mask); |
4389 | 4399 | ||
4390 | out_unlock: | 4400 | out_unlock: |
4391 | put_task_struct(p); | 4401 | put_task_struct(p); |
4392 | mutex_unlock(&sched_hotcpu_mutex); | 4402 | mutex_unlock(&sched_hotcpu_mutex); |
4393 | return retval; | 4403 | return retval; |
4394 | } | 4404 | } |
4395 | 4405 | ||
4396 | static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, | 4406 | static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, |
4397 | cpumask_t *new_mask) | 4407 | cpumask_t *new_mask) |
4398 | { | 4408 | { |
4399 | if (len < sizeof(cpumask_t)) { | 4409 | if (len < sizeof(cpumask_t)) { |
4400 | memset(new_mask, 0, sizeof(cpumask_t)); | 4410 | memset(new_mask, 0, sizeof(cpumask_t)); |
4401 | } else if (len > sizeof(cpumask_t)) { | 4411 | } else if (len > sizeof(cpumask_t)) { |
4402 | len = sizeof(cpumask_t); | 4412 | len = sizeof(cpumask_t); |
4403 | } | 4413 | } |
4404 | return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; | 4414 | return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; |
4405 | } | 4415 | } |
4406 | 4416 | ||
4407 | /** | 4417 | /** |
4408 | * sys_sched_setaffinity - set the cpu affinity of a process | 4418 | * sys_sched_setaffinity - set the cpu affinity of a process |
4409 | * @pid: pid of the process | 4419 | * @pid: pid of the process |
4410 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 4420 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
4411 | * @user_mask_ptr: user-space pointer to the new cpu mask | 4421 | * @user_mask_ptr: user-space pointer to the new cpu mask |
4412 | */ | 4422 | */ |
4413 | asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, | 4423 | asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, |
4414 | unsigned long __user *user_mask_ptr) | 4424 | unsigned long __user *user_mask_ptr) |
4415 | { | 4425 | { |
4416 | cpumask_t new_mask; | 4426 | cpumask_t new_mask; |
4417 | int retval; | 4427 | int retval; |
4418 | 4428 | ||
4419 | retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); | 4429 | retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); |
4420 | if (retval) | 4430 | if (retval) |
4421 | return retval; | 4431 | return retval; |
4422 | 4432 | ||
4423 | return sched_setaffinity(pid, new_mask); | 4433 | return sched_setaffinity(pid, new_mask); |
4424 | } | 4434 | } |
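[editor's note] A userspace sketch of the corresponding call, pinning the caller to CPU 0 (the glibc wrapper needs _GNU_SOURCE for the CPU_* macros):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		cpu_set_t set;

		CPU_ZERO(&set);
		CPU_SET(0, &set);
		if (sched_setaffinity(0, sizeof(set), &set) == -1) {
			perror("sched_setaffinity");
			return 1;
		}
		return 0;
	}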
4425 | 4435 | ||
4426 | /* | 4436 | /* |
4427 | * Represents all cpu's present in the system | 4437 | * Represents all cpu's present in the system |
4428 | * In systems capable of hotplug, this map could dynamically grow | 4438 | * In systems capable of hotplug, this map could dynamically grow |
4429 | * as new cpu's are detected in the system via any platform specific | 4439 | * as new cpu's are detected in the system via any platform specific |
4430 | * method, such as ACPI for e.g. | 4440 | * method, such as ACPI for e.g. |
4431 | */ | 4441 | */ |
4432 | 4442 | ||
4433 | cpumask_t cpu_present_map __read_mostly; | 4443 | cpumask_t cpu_present_map __read_mostly; |
4434 | EXPORT_SYMBOL(cpu_present_map); | 4444 | EXPORT_SYMBOL(cpu_present_map); |
4435 | 4445 | ||
4436 | #ifndef CONFIG_SMP | 4446 | #ifndef CONFIG_SMP |
4437 | cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; | 4447 | cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; |
4438 | EXPORT_SYMBOL(cpu_online_map); | 4448 | EXPORT_SYMBOL(cpu_online_map); |
4439 | 4449 | ||
4440 | cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; | 4450 | cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; |
4441 | EXPORT_SYMBOL(cpu_possible_map); | 4451 | EXPORT_SYMBOL(cpu_possible_map); |
4442 | #endif | 4452 | #endif |
4443 | 4453 | ||
4444 | long sched_getaffinity(pid_t pid, cpumask_t *mask) | 4454 | long sched_getaffinity(pid_t pid, cpumask_t *mask) |
4445 | { | 4455 | { |
4446 | struct task_struct *p; | 4456 | struct task_struct *p; |
4447 | int retval; | 4457 | int retval; |
4448 | 4458 | ||
4449 | mutex_lock(&sched_hotcpu_mutex); | 4459 | mutex_lock(&sched_hotcpu_mutex); |
4450 | read_lock(&tasklist_lock); | 4460 | read_lock(&tasklist_lock); |
4451 | 4461 | ||
4452 | retval = -ESRCH; | 4462 | retval = -ESRCH; |
4453 | p = find_process_by_pid(pid); | 4463 | p = find_process_by_pid(pid); |
4454 | if (!p) | 4464 | if (!p) |
4455 | goto out_unlock; | 4465 | goto out_unlock; |
4456 | 4466 | ||
4457 | retval = security_task_getscheduler(p); | 4467 | retval = security_task_getscheduler(p); |
4458 | if (retval) | 4468 | if (retval) |
4459 | goto out_unlock; | 4469 | goto out_unlock; |
4460 | 4470 | ||
4461 | cpus_and(*mask, p->cpus_allowed, cpu_online_map); | 4471 | cpus_and(*mask, p->cpus_allowed, cpu_online_map); |
4462 | 4472 | ||
4463 | out_unlock: | 4473 | out_unlock: |
4464 | read_unlock(&tasklist_lock); | 4474 | read_unlock(&tasklist_lock); |
4465 | mutex_unlock(&sched_hotcpu_mutex); | 4475 | mutex_unlock(&sched_hotcpu_mutex); |
4466 | if (retval) | 4476 | if (retval) |
4467 | return retval; | 4477 | return retval; |
4468 | 4478 | ||
4469 | return 0; | 4479 | return 0; |
4470 | } | 4480 | } |
4471 | 4481 | ||
4472 | /** | 4482 | /** |
4473 | * sys_sched_getaffinity - get the cpu affinity of a process | 4483 | * sys_sched_getaffinity - get the cpu affinity of a process |
4474 | * @pid: pid of the process | 4484 | * @pid: pid of the process |
4475 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 4485 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
4476 | * @user_mask_ptr: user-space pointer to hold the current cpu mask | 4486 | * @user_mask_ptr: user-space pointer to hold the current cpu mask |
4477 | */ | 4487 | */ |
4478 | asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, | 4488 | asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, |
4479 | unsigned long __user *user_mask_ptr) | 4489 | unsigned long __user *user_mask_ptr) |
4480 | { | 4490 | { |
4481 | int ret; | 4491 | int ret; |
4482 | cpumask_t mask; | 4492 | cpumask_t mask; |
4483 | 4493 | ||
4484 | if (len < sizeof(cpumask_t)) | 4494 | if (len < sizeof(cpumask_t)) |
4485 | return -EINVAL; | 4495 | return -EINVAL; |
4486 | 4496 | ||
4487 | ret = sched_getaffinity(pid, &mask); | 4497 | ret = sched_getaffinity(pid, &mask); |
4488 | if (ret < 0) | 4498 | if (ret < 0) |
4489 | return ret; | 4499 | return ret; |
4490 | 4500 | ||
4491 | if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) | 4501 | if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) |
4492 | return -EFAULT; | 4502 | return -EFAULT; |
4493 | 4503 | ||
4494 | return sizeof(cpumask_t); | 4504 | return sizeof(cpumask_t); |
4495 | } | 4505 | } |
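[editor's note] And the read side from userspace; note that the kernel above returns sizeof(cpumask_t) on success, which the glibc wrapper hides by returning 0:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		cpu_set_t set;
		int cpu;

		if (sched_getaffinity(0, sizeof(set), &set) == -1) {
			perror("sched_getaffinity");
			return 1;
		}
		for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
			if (CPU_ISSET(cpu, &set))
				printf("allowed on cpu %d\n", cpu);
		return 0;
	}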
4496 | 4506 | ||
4497 | /** | 4507 | /** |
4498 | * sys_sched_yield - yield the current processor to other threads. | 4508 | * sys_sched_yield - yield the current processor to other threads. |
4499 | * | 4509 | * |
4500 | * This function yields the current CPU to other tasks. If there are no | 4510 | * This function yields the current CPU to other tasks. If there are no |
4501 | * other threads running on this CPU then this function will return. | 4511 | * other threads running on this CPU then this function will return. |
4502 | */ | 4512 | */ |
4503 | asmlinkage long sys_sched_yield(void) | 4513 | asmlinkage long sys_sched_yield(void) |
4504 | { | 4514 | { |
4505 | struct rq *rq = this_rq_lock(); | 4515 | struct rq *rq = this_rq_lock(); |
4506 | 4516 | ||
4507 | schedstat_inc(rq, yld_cnt); | 4517 | schedstat_inc(rq, yld_cnt); |
4508 | if (unlikely(rq->nr_running == 1)) | 4518 | if (unlikely(rq->nr_running == 1)) |
4509 | schedstat_inc(rq, yld_act_empty); | 4519 | schedstat_inc(rq, yld_act_empty); |
4510 | else | 4520 | else |
4511 | current->sched_class->yield_task(rq, current); | 4521 | current->sched_class->yield_task(rq, current); |
4512 | 4522 | ||
4513 | /* | 4523 | /* |
4514 | * Since we are going to call schedule() anyway, there's | 4524 | * Since we are going to call schedule() anyway, there's |
4515 | * no need to preempt or enable interrupts: | 4525 | * no need to preempt or enable interrupts: |
4516 | */ | 4526 | */ |
4517 | __release(rq->lock); | 4527 | __release(rq->lock); |
4518 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | 4528 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); |
4519 | _raw_spin_unlock(&rq->lock); | 4529 | _raw_spin_unlock(&rq->lock); |
4520 | preempt_enable_no_resched(); | 4530 | preempt_enable_no_resched(); |
4521 | 4531 | ||
4522 | schedule(); | 4532 | schedule(); |
4523 | 4533 | ||
4524 | return 0; | 4534 | return 0; |
4525 | } | 4535 | } |
4526 | 4536 | ||
4527 | static void __cond_resched(void) | 4537 | static void __cond_resched(void) |
4528 | { | 4538 | { |
4529 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 4539 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
4530 | __might_sleep(__FILE__, __LINE__); | 4540 | __might_sleep(__FILE__, __LINE__); |
4531 | #endif | 4541 | #endif |
4532 | /* | 4542 | /* |
4533 | * The BKS might be reacquired before we have dropped | 4543 | * The BKS might be reacquired before we have dropped |
4534 | * PREEMPT_ACTIVE, which could trigger a second | 4544 | * PREEMPT_ACTIVE, which could trigger a second |
4535 | * cond_resched() call. | 4545 | * cond_resched() call. |
4536 | */ | 4546 | */ |
4537 | do { | 4547 | do { |
4538 | add_preempt_count(PREEMPT_ACTIVE); | 4548 | add_preempt_count(PREEMPT_ACTIVE); |
4539 | schedule(); | 4549 | schedule(); |
4540 | sub_preempt_count(PREEMPT_ACTIVE); | 4550 | sub_preempt_count(PREEMPT_ACTIVE); |
4541 | } while (need_resched()); | 4551 | } while (need_resched()); |
4542 | } | 4552 | } |
4543 | 4553 | ||
4544 | int __sched cond_resched(void) | 4554 | int __sched cond_resched(void) |
4545 | { | 4555 | { |
4546 | if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && | 4556 | if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && |
4547 | system_state == SYSTEM_RUNNING) { | 4557 | system_state == SYSTEM_RUNNING) { |
4548 | __cond_resched(); | 4558 | __cond_resched(); |
4549 | return 1; | 4559 | return 1; |
4550 | } | 4560 | } |
4551 | return 0; | 4561 | return 0; |
4552 | } | 4562 | } |
4553 | EXPORT_SYMBOL(cond_resched); | 4563 | EXPORT_SYMBOL(cond_resched); |
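[editor's note] The usual in-kernel pattern for cond_resched() is a long loop in process context that offers the CPU back whenever a reschedule is pending; a sketch with hypothetical item/handler names:

	#include <linux/sched.h>

	struct my_item { int payload; };			/* hypothetical */
	static void handle_item(struct my_item *it) { it->payload++; }

	static void process_table(struct my_item *tbl, int n)
	{
		int i;

		for (i = 0; i < n; i++) {
			handle_item(&tbl[i]);
			cond_resched();		/* may schedule() if a resched is pending */
		}
	}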
4554 | 4564 | ||
4555 | /* | 4565 | /* |
4556 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, | 4566 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, |
4557 | * call schedule, and on return reacquire the lock. | 4567 | * call schedule, and on return reacquire the lock. |
4558 | * | 4568 | * |
4559 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level | 4569 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level |
4560 | * operations here to prevent schedule() from being called twice (once via | 4570 | * operations here to prevent schedule() from being called twice (once via |
4561 | * spin_unlock(), once by hand). | 4571 | * spin_unlock(), once by hand). |
4562 | */ | 4572 | */ |
4563 | int cond_resched_lock(spinlock_t *lock) | 4573 | int cond_resched_lock(spinlock_t *lock) |
4564 | { | 4574 | { |
4565 | int ret = 0; | 4575 | int ret = 0; |
4566 | 4576 | ||
4567 | if (need_lockbreak(lock)) { | 4577 | if (need_lockbreak(lock)) { |
4568 | spin_unlock(lock); | 4578 | spin_unlock(lock); |
4569 | cpu_relax(); | 4579 | cpu_relax(); |
4570 | ret = 1; | 4580 | ret = 1; |
4571 | spin_lock(lock); | 4581 | spin_lock(lock); |
4572 | } | 4582 | } |
4573 | if (need_resched() && system_state == SYSTEM_RUNNING) { | 4583 | if (need_resched() && system_state == SYSTEM_RUNNING) { |
4574 | spin_release(&lock->dep_map, 1, _THIS_IP_); | 4584 | spin_release(&lock->dep_map, 1, _THIS_IP_); |
4575 | _raw_spin_unlock(lock); | 4585 | _raw_spin_unlock(lock); |
4576 | preempt_enable_no_resched(); | 4586 | preempt_enable_no_resched(); |
4577 | __cond_resched(); | 4587 | __cond_resched(); |
4578 | ret = 1; | 4588 | ret = 1; |
4579 | spin_lock(lock); | 4589 | spin_lock(lock); |
4580 | } | 4590 | } |
4581 | return ret; | 4591 | return ret; |
4582 | } | 4592 | } |
4583 | EXPORT_SYMBOL(cond_resched_lock); | 4593 | EXPORT_SYMBOL(cond_resched_lock); |
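[editor's note] cond_resched_lock() extends the same idea to loops that run under a spinlock: the lock is dropped around the voluntary schedule (or briefly on lock contention) and retaken before the loop continues. Sketch, with a hypothetical lock and per-entry work:

	#include <linux/spinlock.h>
	#include <linux/sched.h>

	struct my_entry { int refs; };				/* hypothetical */
	static DEFINE_SPINLOCK(my_lock);

	static void scan_table(struct my_entry *tbl, int n)
	{
		int i;

		spin_lock(&my_lock);
		for (i = 0; i < n; i++) {
			tbl[i].refs++;			/* per-entry work under the lock */
			/* may drop and retake my_lock; returns 1 if it did */
			cond_resched_lock(&my_lock);
		}
		spin_unlock(&my_lock);
	}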
4584 | 4594 | ||
4585 | int __sched cond_resched_softirq(void) | 4595 | int __sched cond_resched_softirq(void) |
4586 | { | 4596 | { |
4587 | BUG_ON(!in_softirq()); | 4597 | BUG_ON(!in_softirq()); |
4588 | 4598 | ||
4589 | if (need_resched() && system_state == SYSTEM_RUNNING) { | 4599 | if (need_resched() && system_state == SYSTEM_RUNNING) { |
4590 | local_bh_enable(); | 4600 | local_bh_enable(); |
4591 | __cond_resched(); | 4601 | __cond_resched(); |
4592 | local_bh_disable(); | 4602 | local_bh_disable(); |
4593 | return 1; | 4603 | return 1; |
4594 | } | 4604 | } |
4595 | return 0; | 4605 | return 0; |
4596 | } | 4606 | } |
4597 | EXPORT_SYMBOL(cond_resched_softirq); | 4607 | EXPORT_SYMBOL(cond_resched_softirq); |
4598 | 4608 | ||
4599 | /** | 4609 | /** |
4600 | * yield - yield the current processor to other threads. | 4610 | * yield - yield the current processor to other threads. |
4601 | * | 4611 | * |
4602 | * This is a shortcut for kernel-space yielding - it marks the | 4612 | * This is a shortcut for kernel-space yielding - it marks the |
4603 | * thread runnable and calls sys_sched_yield(). | 4613 | * thread runnable and calls sys_sched_yield(). |
4604 | */ | 4614 | */ |
4605 | void __sched yield(void) | 4615 | void __sched yield(void) |
4606 | { | 4616 | { |
4607 | set_current_state(TASK_RUNNING); | 4617 | set_current_state(TASK_RUNNING); |
4608 | sys_sched_yield(); | 4618 | sys_sched_yield(); |
4609 | } | 4619 | } |
4610 | EXPORT_SYMBOL(yield); | 4620 | EXPORT_SYMBOL(yield); |
4611 | 4621 | ||
4612 | /* | 4622 | /* |
4613 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so | 4623 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so |
4614 | * that process accounting knows that this is a task in IO wait state. | 4624 | * that process accounting knows that this is a task in IO wait state. |
4615 | * | 4625 | * |
4616 | * But don't do that if it is a deliberate, throttling IO wait (this task | 4626 | * But don't do that if it is a deliberate, throttling IO wait (this task |
4617 | * has set its backing_dev_info: the queue against which it should throttle) | 4627 | * has set its backing_dev_info: the queue against which it should throttle) |
4618 | */ | 4628 | */ |
4619 | void __sched io_schedule(void) | 4629 | void __sched io_schedule(void) |
4620 | { | 4630 | { |
4621 | struct rq *rq = &__raw_get_cpu_var(runqueues); | 4631 | struct rq *rq = &__raw_get_cpu_var(runqueues); |
4622 | 4632 | ||
4623 | delayacct_blkio_start(); | 4633 | delayacct_blkio_start(); |
4624 | atomic_inc(&rq->nr_iowait); | 4634 | atomic_inc(&rq->nr_iowait); |
4625 | schedule(); | 4635 | schedule(); |
4626 | atomic_dec(&rq->nr_iowait); | 4636 | atomic_dec(&rq->nr_iowait); |
4627 | delayacct_blkio_end(); | 4637 | delayacct_blkio_end(); |
4628 | } | 4638 | } |
4629 | EXPORT_SYMBOL(io_schedule); | 4639 | EXPORT_SYMBOL(io_schedule); |
4630 | 4640 | ||
4631 | long __sched io_schedule_timeout(long timeout) | 4641 | long __sched io_schedule_timeout(long timeout) |
4632 | { | 4642 | { |
4633 | struct rq *rq = &__raw_get_cpu_var(runqueues); | 4643 | struct rq *rq = &__raw_get_cpu_var(runqueues); |
4634 | long ret; | 4644 | long ret; |
4635 | 4645 | ||
4636 | delayacct_blkio_start(); | 4646 | delayacct_blkio_start(); |
4637 | atomic_inc(&rq->nr_iowait); | 4647 | atomic_inc(&rq->nr_iowait); |
4638 | ret = schedule_timeout(timeout); | 4648 | ret = schedule_timeout(timeout); |
4639 | atomic_dec(&rq->nr_iowait); | 4649 | atomic_dec(&rq->nr_iowait); |
4640 | delayacct_blkio_end(); | 4650 | delayacct_blkio_end(); |
4641 | return ret; | 4651 | return ret; |
4642 | } | 4652 | } |
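[editor's note] io_schedule()/io_schedule_timeout() are drop-in replacements for schedule()/schedule_timeout() on I/O waits, so the sleep is accounted in rq->nr_iowait and delay accounting. A typical wait loop, sketched with a hypothetical request structure carrying its own wait queue and done flag:

	#include <linux/wait.h>
	#include <linux/sched.h>

	struct my_request {				/* hypothetical */
		wait_queue_head_t	waitq;
		int			done;
	};

	static void wait_for_request(struct my_request *req)
	{
		DEFINE_WAIT(wait);

		while (!req->done) {
			prepare_to_wait(&req->waitq, &wait, TASK_UNINTERRUPTIBLE);
			if (!req->done)
				io_schedule();	/* counted as iowait, not a plain sleep */
		}
		finish_wait(&req->waitq, &wait);
	}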
4643 | 4653 | ||
4644 | /** | 4654 | /** |
4645 | * sys_sched_get_priority_max - return maximum RT priority. | 4655 | * sys_sched_get_priority_max - return maximum RT priority. |
4646 | * @policy: scheduling class. | 4656 | * @policy: scheduling class. |
4647 | * | 4657 | * |
4648 | * this syscall returns the maximum rt_priority that can be used | 4658 | * this syscall returns the maximum rt_priority that can be used |
4649 | * by a given scheduling class. | 4659 | * by a given scheduling class. |
4650 | */ | 4660 | */ |
4651 | asmlinkage long sys_sched_get_priority_max(int policy) | 4661 | asmlinkage long sys_sched_get_priority_max(int policy) |
4652 | { | 4662 | { |
4653 | int ret = -EINVAL; | 4663 | int ret = -EINVAL; |
4654 | 4664 | ||
4655 | switch (policy) { | 4665 | switch (policy) { |
4656 | case SCHED_FIFO: | 4666 | case SCHED_FIFO: |
4657 | case SCHED_RR: | 4667 | case SCHED_RR: |
4658 | ret = MAX_USER_RT_PRIO-1; | 4668 | ret = MAX_USER_RT_PRIO-1; |
4659 | break; | 4669 | break; |
4660 | case SCHED_NORMAL: | 4670 | case SCHED_NORMAL: |
4661 | case SCHED_BATCH: | 4671 | case SCHED_BATCH: |
4662 | case SCHED_IDLE: | 4672 | case SCHED_IDLE: |
4663 | ret = 0; | 4673 | ret = 0; |
4664 | break; | 4674 | break; |
4665 | } | 4675 | } |
4666 | return ret; | 4676 | return ret; |
4667 | } | 4677 | } |
4668 | 4678 | ||
4669 | /** | 4679 | /** |
4670 | * sys_sched_get_priority_min - return minimum RT priority. | 4680 | * sys_sched_get_priority_min - return minimum RT priority. |
4671 | * @policy: scheduling class. | 4681 | * @policy: scheduling class. |
4672 | * | 4682 | * |
4673 | * this syscall returns the minimum rt_priority that can be used | 4683 | * this syscall returns the minimum rt_priority that can be used |
4674 | * by a given scheduling class. | 4684 | * by a given scheduling class. |
4675 | */ | 4685 | */ |
4676 | asmlinkage long sys_sched_get_priority_min(int policy) | 4686 | asmlinkage long sys_sched_get_priority_min(int policy) |
4677 | { | 4687 | { |
4678 | int ret = -EINVAL; | 4688 | int ret = -EINVAL; |
4679 | 4689 | ||
4680 | switch (policy) { | 4690 | switch (policy) { |
4681 | case SCHED_FIFO: | 4691 | case SCHED_FIFO: |
4682 | case SCHED_RR: | 4692 | case SCHED_RR: |
4683 | ret = 1; | 4693 | ret = 1; |
4684 | break; | 4694 | break; |
4685 | case SCHED_NORMAL: | 4695 | case SCHED_NORMAL: |
4686 | case SCHED_BATCH: | 4696 | case SCHED_BATCH: |
4687 | case SCHED_IDLE: | 4697 | case SCHED_IDLE: |
4688 | ret = 0; | 4698 | ret = 0; |
4689 | } | 4699 | } |
4690 | return ret; | 4700 | return ret; |
4691 | } | 4701 | } |
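[editor's note] From userspace these two syscalls are how portable code discovers the RT priority range instead of hard-coding it:

	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		printf("SCHED_FIFO priorities: %d..%d\n",
		       sched_get_priority_min(SCHED_FIFO),
		       sched_get_priority_max(SCHED_FIFO));	/* 1..99 on Linux */
		return 0;
	}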
4692 | 4702 | ||
4693 | /** | 4703 | /** |
4694 | * sys_sched_rr_get_interval - return the default timeslice of a process. | 4704 | * sys_sched_rr_get_interval - return the default timeslice of a process. |
4695 | * @pid: pid of the process. | 4705 | * @pid: pid of the process. |
4696 | * @interval: userspace pointer to the timeslice value. | 4706 | * @interval: userspace pointer to the timeslice value. |
4697 | * | 4707 | * |
4698 | * this syscall writes the default timeslice value of a given process | 4708 | * this syscall writes the default timeslice value of a given process |
4699 | * into the user-space timespec buffer. A value of '0' means infinity. | 4709 | * into the user-space timespec buffer. A value of '0' means infinity. |
4700 | */ | 4710 | */ |
4701 | asmlinkage | 4711 | asmlinkage |
4702 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | 4712 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) |
4703 | { | 4713 | { |
4704 | struct task_struct *p; | 4714 | struct task_struct *p; |
4705 | int retval = -EINVAL; | 4715 | int retval = -EINVAL; |
4706 | struct timespec t; | 4716 | struct timespec t; |
4707 | 4717 | ||
4708 | if (pid < 0) | 4718 | if (pid < 0) |
4709 | goto out_nounlock; | 4719 | goto out_nounlock; |
4710 | 4720 | ||
4711 | retval = -ESRCH; | 4721 | retval = -ESRCH; |
4712 | read_lock(&tasklist_lock); | 4722 | read_lock(&tasklist_lock); |
4713 | p = find_process_by_pid(pid); | 4723 | p = find_process_by_pid(pid); |
4714 | if (!p) | 4724 | if (!p) |
4715 | goto out_unlock; | 4725 | goto out_unlock; |
4716 | 4726 | ||
4717 | retval = security_task_getscheduler(p); | 4727 | retval = security_task_getscheduler(p); |
4718 | if (retval) | 4728 | if (retval) |
4719 | goto out_unlock; | 4729 | goto out_unlock; |
4720 | 4730 | ||
4721 | jiffies_to_timespec(p->policy == SCHED_FIFO ? | 4731 | jiffies_to_timespec(p->policy == SCHED_FIFO ? |
4722 | 0 : static_prio_timeslice(p->static_prio), &t); | 4732 | 0 : static_prio_timeslice(p->static_prio), &t); |
4723 | read_unlock(&tasklist_lock); | 4733 | read_unlock(&tasklist_lock); |
4724 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | 4734 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
4725 | out_nounlock: | 4735 | out_nounlock: |
4726 | return retval; | 4736 | return retval; |
4727 | out_unlock: | 4737 | out_unlock: |
4728 | read_unlock(&tasklist_lock); | 4738 | read_unlock(&tasklist_lock); |
4729 | return retval; | 4739 | return retval; |
4730 | } | 4740 | } |
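
The syscall above is likewise exposed through a glibc wrapper. A minimal sketch (not part of this diff), assuming sched_rr_get_interval() from <sched.h>; a zero timespec means an infinite timeslice, as documented above:

    #include <sched.h>
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            struct timespec ts;

            /* pid 0 queries the calling process's own timeslice. */
            if (sched_rr_get_interval(0, &ts) == -1) {
                    perror("sched_rr_get_interval");
                    return 1;
            }
            printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
            return 0;
    }
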
4731 | 4741 | ||
4732 | static const char stat_nam[] = "RSDTtZX"; | 4742 | static const char stat_nam[] = "RSDTtZX"; |
4733 | 4743 | ||
4734 | static void show_task(struct task_struct *p) | 4744 | static void show_task(struct task_struct *p) |
4735 | { | 4745 | { |
4736 | unsigned long free = 0; | 4746 | unsigned long free = 0; |
4737 | unsigned state; | 4747 | unsigned state; |
4738 | 4748 | ||
4739 | state = p->state ? __ffs(p->state) + 1 : 0; | 4749 | state = p->state ? __ffs(p->state) + 1 : 0; |
4740 | printk("%-13.13s %c", p->comm, | 4750 | printk("%-13.13s %c", p->comm, |
4741 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); | 4751 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); |
4742 | #if BITS_PER_LONG == 32 | 4752 | #if BITS_PER_LONG == 32 |
4743 | if (state == TASK_RUNNING) | 4753 | if (state == TASK_RUNNING) |
4744 | printk(" running "); | 4754 | printk(" running "); |
4745 | else | 4755 | else |
4746 | printk(" %08lx ", thread_saved_pc(p)); | 4756 | printk(" %08lx ", thread_saved_pc(p)); |
4747 | #else | 4757 | #else |
4748 | if (state == TASK_RUNNING) | 4758 | if (state == TASK_RUNNING) |
4749 | printk(" running task "); | 4759 | printk(" running task "); |
4750 | else | 4760 | else |
4751 | printk(" %016lx ", thread_saved_pc(p)); | 4761 | printk(" %016lx ", thread_saved_pc(p)); |
4752 | #endif | 4762 | #endif |
4753 | #ifdef CONFIG_DEBUG_STACK_USAGE | 4763 | #ifdef CONFIG_DEBUG_STACK_USAGE |
4754 | { | 4764 | { |
4755 | unsigned long *n = end_of_stack(p); | 4765 | unsigned long *n = end_of_stack(p); |
4756 | while (!*n) | 4766 | while (!*n) |
4757 | n++; | 4767 | n++; |
4758 | free = (unsigned long)n - (unsigned long)end_of_stack(p); | 4768 | free = (unsigned long)n - (unsigned long)end_of_stack(p); |
4759 | } | 4769 | } |
4760 | #endif | 4770 | #endif |
4761 | printk("%5lu %5d %6d\n", free, p->pid, p->parent->pid); | 4771 | printk("%5lu %5d %6d\n", free, p->pid, p->parent->pid); |
4762 | 4772 | ||
4763 | if (state != TASK_RUNNING) | 4773 | if (state != TASK_RUNNING) |
4764 | show_stack(p, NULL); | 4774 | show_stack(p, NULL); |
4765 | } | 4775 | } |
4766 | 4776 | ||
4767 | void show_state_filter(unsigned long state_filter) | 4777 | void show_state_filter(unsigned long state_filter) |
4768 | { | 4778 | { |
4769 | struct task_struct *g, *p; | 4779 | struct task_struct *g, *p; |
4770 | 4780 | ||
4771 | #if BITS_PER_LONG == 32 | 4781 | #if BITS_PER_LONG == 32 |
4772 | printk(KERN_INFO | 4782 | printk(KERN_INFO |
4773 | " task PC stack pid father\n"); | 4783 | " task PC stack pid father\n"); |
4774 | #else | 4784 | #else |
4775 | printk(KERN_INFO | 4785 | printk(KERN_INFO |
4776 | " task PC stack pid father\n"); | 4786 | " task PC stack pid father\n"); |
4777 | #endif | 4787 | #endif |
4778 | read_lock(&tasklist_lock); | 4788 | read_lock(&tasklist_lock); |
4779 | do_each_thread(g, p) { | 4789 | do_each_thread(g, p) { |
4780 | /* | 4790 | /* |
4781 | * reset the NMI-timeout, listing all files on a slow | 4791 | * reset the NMI-timeout, listing all files on a slow |
4782 | * console might take a lot of time: | 4792 | * console might take a lot of time: |
4783 | */ | 4793 | */ |
4784 | touch_nmi_watchdog(); | 4794 | touch_nmi_watchdog(); |
4785 | if (!state_filter || (p->state & state_filter)) | 4795 | if (!state_filter || (p->state & state_filter)) |
4786 | show_task(p); | 4796 | show_task(p); |
4787 | } while_each_thread(g, p); | 4797 | } while_each_thread(g, p); |
4788 | 4798 | ||
4789 | touch_all_softlockup_watchdogs(); | 4799 | touch_all_softlockup_watchdogs(); |
4790 | 4800 | ||
4791 | #ifdef CONFIG_SCHED_DEBUG | 4801 | #ifdef CONFIG_SCHED_DEBUG |
4792 | sysrq_sched_debug_show(); | 4802 | sysrq_sched_debug_show(); |
4793 | #endif | 4803 | #endif |
4794 | read_unlock(&tasklist_lock); | 4804 | read_unlock(&tasklist_lock); |
4795 | /* | 4805 | /* |
4796 | * Only show locks if all tasks are dumped: | 4806 | * Only show locks if all tasks are dumped: |
4797 | */ | 4807 | */ |
4798 | if (state_filter == -1) | 4808 | if (state_filter == -1) |
4799 | debug_show_all_locks(); | 4809 | debug_show_all_locks(); |
4800 | } | 4810 | } |
4801 | 4811 | ||
4802 | void __cpuinit init_idle_bootup_task(struct task_struct *idle) | 4812 | void __cpuinit init_idle_bootup_task(struct task_struct *idle) |
4803 | { | 4813 | { |
4804 | idle->sched_class = &idle_sched_class; | 4814 | idle->sched_class = &idle_sched_class; |
4805 | } | 4815 | } |
4806 | 4816 | ||
4807 | /** | 4817 | /** |
4808 | * init_idle - set up an idle thread for a given CPU | 4818 | * init_idle - set up an idle thread for a given CPU |
4809 | * @idle: task in question | 4819 | * @idle: task in question |
4810 | * @cpu: cpu the idle task belongs to | 4820 | * @cpu: cpu the idle task belongs to |
4811 | * | 4821 | * |
4812 | * NOTE: this function does not set the idle thread's NEED_RESCHED | 4822 | * NOTE: this function does not set the idle thread's NEED_RESCHED |
4813 | * flag, to make booting more robust. | 4823 | * flag, to make booting more robust. |
4814 | */ | 4824 | */ |
4815 | void __cpuinit init_idle(struct task_struct *idle, int cpu) | 4825 | void __cpuinit init_idle(struct task_struct *idle, int cpu) |
4816 | { | 4826 | { |
4817 | struct rq *rq = cpu_rq(cpu); | 4827 | struct rq *rq = cpu_rq(cpu); |
4818 | unsigned long flags; | 4828 | unsigned long flags; |
4819 | 4829 | ||
4820 | __sched_fork(idle); | 4830 | __sched_fork(idle); |
4821 | idle->se.exec_start = sched_clock(); | 4831 | idle->se.exec_start = sched_clock(); |
4822 | 4832 | ||
4823 | idle->prio = idle->normal_prio = MAX_PRIO; | 4833 | idle->prio = idle->normal_prio = MAX_PRIO; |
4824 | idle->cpus_allowed = cpumask_of_cpu(cpu); | 4834 | idle->cpus_allowed = cpumask_of_cpu(cpu); |
4825 | __set_task_cpu(idle, cpu); | 4835 | __set_task_cpu(idle, cpu); |
4826 | 4836 | ||
4827 | spin_lock_irqsave(&rq->lock, flags); | 4837 | spin_lock_irqsave(&rq->lock, flags); |
4828 | rq->curr = rq->idle = idle; | 4838 | rq->curr = rq->idle = idle; |
4829 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 4839 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
4830 | idle->oncpu = 1; | 4840 | idle->oncpu = 1; |
4831 | #endif | 4841 | #endif |
4832 | spin_unlock_irqrestore(&rq->lock, flags); | 4842 | spin_unlock_irqrestore(&rq->lock, flags); |
4833 | 4843 | ||
4834 | /* Set the preempt count _outside_ the spinlocks! */ | 4844 | /* Set the preempt count _outside_ the spinlocks! */ |
4835 | #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) | 4845 | #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) |
4836 | task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); | 4846 | task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); |
4837 | #else | 4847 | #else |
4838 | task_thread_info(idle)->preempt_count = 0; | 4848 | task_thread_info(idle)->preempt_count = 0; |
4839 | #endif | 4849 | #endif |
4840 | /* | 4850 | /* |
4841 | * The idle tasks have their own, simple scheduling class: | 4851 | * The idle tasks have their own, simple scheduling class: |
4842 | */ | 4852 | */ |
4843 | idle->sched_class = &idle_sched_class; | 4853 | idle->sched_class = &idle_sched_class; |
4844 | } | 4854 | } |
4845 | 4855 | ||
4846 | /* | 4856 | /* |
4847 | * In a system that switches off the HZ timer nohz_cpu_mask | 4857 | * In a system that switches off the HZ timer nohz_cpu_mask |
4848 | * indicates which cpus entered this state. This is used | 4858 | * indicates which cpus entered this state. This is used |
4849 | * in the rcu update to wait only for active cpus. For systems | 4859 | * in the rcu update to wait only for active cpus. For systems |
4850 | * which do not switch off the HZ timer nohz_cpu_mask should | 4860 | * which do not switch off the HZ timer nohz_cpu_mask should |
4851 | * always be CPU_MASK_NONE. | 4861 | * always be CPU_MASK_NONE. |
4852 | */ | 4862 | */ |
4853 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | 4863 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; |
4854 | 4864 | ||
4855 | /* | 4865 | /* |
4856 | * Increase the granularity value when there are more CPUs, | 4866 | * Increase the granularity value when there are more CPUs, |
4857 | * because with more CPUs the 'effective latency' as visible | 4867 | * because with more CPUs the 'effective latency' as visible |
4858 | * to users decreases. But the relationship is not linear, | 4868 | * to users decreases. But the relationship is not linear, |
4859 | * so pick a second-best guess by going with the log2 of the | 4869 | * so pick a second-best guess by going with the log2 of the |
4860 | * number of CPUs. | 4870 | * number of CPUs. |
4861 | * | 4871 | * |
4862 | * This idea comes from the SD scheduler of Con Kolivas: | 4872 | * This idea comes from the SD scheduler of Con Kolivas: |
4863 | */ | 4873 | */ |
4864 | static inline void sched_init_granularity(void) | 4874 | static inline void sched_init_granularity(void) |
4865 | { | 4875 | { |
4866 | unsigned int factor = 1 + ilog2(num_online_cpus()); | 4876 | unsigned int factor = 1 + ilog2(num_online_cpus()); |
4867 | const unsigned long gran_limit = 100000000; | 4877 | const unsigned long gran_limit = 100000000; |
4868 | 4878 | ||
4869 | sysctl_sched_granularity *= factor; | 4879 | sysctl_sched_granularity *= factor; |
4870 | if (sysctl_sched_granularity > gran_limit) | 4880 | if (sysctl_sched_granularity > gran_limit) |
4871 | sysctl_sched_granularity = gran_limit; | 4881 | sysctl_sched_granularity = gran_limit; |
4872 | 4882 | ||
4873 | sysctl_sched_runtime_limit = sysctl_sched_granularity * 4; | 4883 | sysctl_sched_runtime_limit = sysctl_sched_granularity * 4; |
4874 | sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; | 4884 | sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; |
4875 | } | 4885 | } |
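
To make the log2 scaling above concrete: with 8 online CPUs, ilog2(8) = 3, so factor = 4; the base granularity is multiplied by 4, clamped at 100 ms, and the runtime limit and wakeup granularity are derived from it. A standalone sketch of the same arithmetic (the 10 ms base value is an assumed example, not necessarily the kernel's default):

    #include <stdio.h>

    /* floor(log2(n)), mirroring the kernel's ilog2() for this example. */
    static unsigned int ilog2_example(unsigned int n)
    {
            unsigned int r = 0;
            while (n >>= 1)
                    r++;
            return r;
    }

    int main(void)
    {
            unsigned long granularity = 10000000UL;  /* assumed 10 ms base, in ns */
            const unsigned long gran_limit = 100000000UL;
            unsigned int cpus = 8;
            unsigned int factor = 1 + ilog2_example(cpus);  /* 1 + 3 = 4 */

            granularity *= factor;
            if (granularity > gran_limit)
                    granularity = gran_limit;

            printf("granularity=%lu runtime_limit=%lu wakeup=%lu\n",
                   granularity, granularity * 4, granularity / 2);
            return 0;
    }
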
4876 | 4886 | ||
4877 | #ifdef CONFIG_SMP | 4887 | #ifdef CONFIG_SMP |
4878 | /* | 4888 | /* |
4879 | * This is how migration works: | 4889 | * This is how migration works: |
4880 | * | 4890 | * |
4881 | * 1) we queue a struct migration_req structure in the source CPU's | 4891 | * 1) we queue a struct migration_req structure in the source CPU's |
4882 | * runqueue and wake up that CPU's migration thread. | 4892 | * runqueue and wake up that CPU's migration thread. |
4883 | * 2) we down() the locked semaphore => thread blocks. | 4893 | * 2) we down() the locked semaphore => thread blocks. |
4884 | * 3) migration thread wakes up (implicitly it forces the migrated | 4894 | * 3) migration thread wakes up (implicitly it forces the migrated |
4885 | * thread off the CPU) | 4895 | * thread off the CPU) |
4886 | * 4) it gets the migration request and checks whether the migrated | 4896 | * 4) it gets the migration request and checks whether the migrated |
4887 | * task is still in the wrong runqueue. | 4897 | * task is still in the wrong runqueue. |
4888 | * 5) if it's in the wrong runqueue then the migration thread removes | 4898 | * 5) if it's in the wrong runqueue then the migration thread removes |
4889 | * it and puts it into the right queue. | 4899 | * it and puts it into the right queue. |
4890 | * 6) migration thread up()s the semaphore. | 4900 | * 6) migration thread up()s the semaphore. |
4891 | * 7) we wake up and the migration is done. | 4901 | * 7) we wake up and the migration is done. |
4892 | */ | 4902 | */ |
4893 | 4903 | ||
4894 | /* | 4904 | /* |
4895 | * Change a given task's CPU affinity. Migrate the thread to a | 4905 | * Change a given task's CPU affinity. Migrate the thread to a |
4896 | * proper CPU and schedule it away if the CPU it's executing on | 4906 | * proper CPU and schedule it away if the CPU it's executing on |
4897 | * is removed from the allowed bitmask. | 4907 | * is removed from the allowed bitmask. |
4898 | * | 4908 | * |
4899 | * NOTE: the caller must have a valid reference to the task, the | 4909 | * NOTE: the caller must have a valid reference to the task, the |
4900 | * task must not exit() & deallocate itself prematurely. The | 4910 | * task must not exit() & deallocate itself prematurely. The |
4901 | * call is not atomic; no spinlocks may be held. | 4911 | * call is not atomic; no spinlocks may be held. |
4902 | */ | 4912 | */ |
4903 | int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | 4913 | int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) |
4904 | { | 4914 | { |
4905 | struct migration_req req; | 4915 | struct migration_req req; |
4906 | unsigned long flags; | 4916 | unsigned long flags; |
4907 | struct rq *rq; | 4917 | struct rq *rq; |
4908 | int ret = 0; | 4918 | int ret = 0; |
4909 | 4919 | ||
4910 | rq = task_rq_lock(p, &flags); | 4920 | rq = task_rq_lock(p, &flags); |
4911 | if (!cpus_intersects(new_mask, cpu_online_map)) { | 4921 | if (!cpus_intersects(new_mask, cpu_online_map)) { |
4912 | ret = -EINVAL; | 4922 | ret = -EINVAL; |
4913 | goto out; | 4923 | goto out; |
4914 | } | 4924 | } |
4915 | 4925 | ||
4916 | p->cpus_allowed = new_mask; | 4926 | p->cpus_allowed = new_mask; |
4917 | /* Can the task run on the task's current CPU? If so, we're done */ | 4927 | /* Can the task run on the task's current CPU? If so, we're done */ |
4918 | if (cpu_isset(task_cpu(p), new_mask)) | 4928 | if (cpu_isset(task_cpu(p), new_mask)) |
4919 | goto out; | 4929 | goto out; |
4920 | 4930 | ||
4921 | if (migrate_task(p, any_online_cpu(new_mask), &req)) { | 4931 | if (migrate_task(p, any_online_cpu(new_mask), &req)) { |
4922 | /* Need help from migration thread: drop lock and wait. */ | 4932 | /* Need help from migration thread: drop lock and wait. */ |
4923 | task_rq_unlock(rq, &flags); | 4933 | task_rq_unlock(rq, &flags); |
4924 | wake_up_process(rq->migration_thread); | 4934 | wake_up_process(rq->migration_thread); |
4925 | wait_for_completion(&req.done); | 4935 | wait_for_completion(&req.done); |
4926 | tlb_migrate_finish(p->mm); | 4936 | tlb_migrate_finish(p->mm); |
4927 | return 0; | 4937 | return 0; |
4928 | } | 4938 | } |
4929 | out: | 4939 | out: |
4930 | task_rq_unlock(rq, &flags); | 4940 | task_rq_unlock(rq, &flags); |
4931 | 4941 | ||
4932 | return ret; | 4942 | return ret; |
4933 | } | 4943 | } |
4934 | EXPORT_SYMBOL_GPL(set_cpus_allowed); | 4944 | EXPORT_SYMBOL_GPL(set_cpus_allowed); |
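
Since set_cpus_allowed() is exported (GPL-only) for modules, a hypothetical kernel-side caller of this era could restrict a task's affinity as in the sketch below; pin_task_to_cpu() is an illustrative helper, not part of this diff, and uses only the cpumask API visible in the surrounding code:

    #include <linux/sched.h>
    #include <linux/cpumask.h>

    /* Illustrative helper: restrict @p to a single CPU (no locks held). */
    static int pin_task_to_cpu(struct task_struct *p, int cpu)
    {
            cpumask_t mask = cpumask_of_cpu(cpu);

            /* Returns -EINVAL if the mask contains no online CPU. */
            return set_cpus_allowed(p, mask);
    }
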
4935 | 4945 | ||
4936 | /* | 4946 | /* |
4937 | * Move (not current) task off this cpu, onto dest cpu. We're doing | 4947 | * Move (not current) task off this cpu, onto dest cpu. We're doing |
4938 | * this because either it can't run here any more (set_cpus_allowed() | 4948 | * this because either it can't run here any more (set_cpus_allowed() |
4939 | * away from this CPU, or CPU going down), or because we're | 4949 | * away from this CPU, or CPU going down), or because we're |
4940 | * attempting to rebalance this task on exec (sched_exec). | 4950 | * attempting to rebalance this task on exec (sched_exec). |
4941 | * | 4951 | * |
4942 | * So we race with normal scheduler movements, but that's OK, as long | 4952 | * So we race with normal scheduler movements, but that's OK, as long |
4943 | * as the task is no longer on this CPU. | 4953 | * as the task is no longer on this CPU. |
4944 | * | 4954 | * |
4945 | * Returns non-zero if task was successfully migrated. | 4955 | * Returns non-zero if task was successfully migrated. |
4946 | */ | 4956 | */ |
4947 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | 4957 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) |
4948 | { | 4958 | { |
4949 | struct rq *rq_dest, *rq_src; | 4959 | struct rq *rq_dest, *rq_src; |
4950 | int ret = 0, on_rq; | 4960 | int ret = 0, on_rq; |
4951 | 4961 | ||
4952 | if (unlikely(cpu_is_offline(dest_cpu))) | 4962 | if (unlikely(cpu_is_offline(dest_cpu))) |
4953 | return ret; | 4963 | return ret; |
4954 | 4964 | ||
4955 | rq_src = cpu_rq(src_cpu); | 4965 | rq_src = cpu_rq(src_cpu); |
4956 | rq_dest = cpu_rq(dest_cpu); | 4966 | rq_dest = cpu_rq(dest_cpu); |
4957 | 4967 | ||
4958 | double_rq_lock(rq_src, rq_dest); | 4968 | double_rq_lock(rq_src, rq_dest); |
4959 | /* Already moved. */ | 4969 | /* Already moved. */ |
4960 | if (task_cpu(p) != src_cpu) | 4970 | if (task_cpu(p) != src_cpu) |
4961 | goto out; | 4971 | goto out; |
4962 | /* Affinity changed (again). */ | 4972 | /* Affinity changed (again). */ |
4963 | if (!cpu_isset(dest_cpu, p->cpus_allowed)) | 4973 | if (!cpu_isset(dest_cpu, p->cpus_allowed)) |
4964 | goto out; | 4974 | goto out; |
4965 | 4975 | ||
4966 | on_rq = p->se.on_rq; | 4976 | on_rq = p->se.on_rq; |
4967 | if (on_rq) | 4977 | if (on_rq) |
4968 | deactivate_task(rq_src, p, 0); | 4978 | deactivate_task(rq_src, p, 0); |
4969 | set_task_cpu(p, dest_cpu); | 4979 | set_task_cpu(p, dest_cpu); |
4970 | if (on_rq) { | 4980 | if (on_rq) { |
4971 | activate_task(rq_dest, p, 0); | 4981 | activate_task(rq_dest, p, 0); |
4972 | check_preempt_curr(rq_dest, p); | 4982 | check_preempt_curr(rq_dest, p); |
4973 | } | 4983 | } |
4974 | ret = 1; | 4984 | ret = 1; |
4975 | out: | 4985 | out: |
4976 | double_rq_unlock(rq_src, rq_dest); | 4986 | double_rq_unlock(rq_src, rq_dest); |
4977 | return ret; | 4987 | return ret; |
4978 | } | 4988 | } |
4979 | 4989 | ||
4980 | /* | 4990 | /* |
4981 | * migration_thread - this is a highprio system thread that performs | 4991 | * migration_thread - this is a highprio system thread that performs |
4982 | * thread migration by bumping thread off CPU then 'pushing' onto | 4992 | * thread migration by bumping thread off CPU then 'pushing' onto |
4983 | * another runqueue. | 4993 | * another runqueue. |
4984 | */ | 4994 | */ |
4985 | static int migration_thread(void *data) | 4995 | static int migration_thread(void *data) |
4986 | { | 4996 | { |
4987 | int cpu = (long)data; | 4997 | int cpu = (long)data; |
4988 | struct rq *rq; | 4998 | struct rq *rq; |
4989 | 4999 | ||
4990 | rq = cpu_rq(cpu); | 5000 | rq = cpu_rq(cpu); |
4991 | BUG_ON(rq->migration_thread != current); | 5001 | BUG_ON(rq->migration_thread != current); |
4992 | 5002 | ||
4993 | set_current_state(TASK_INTERRUPTIBLE); | 5003 | set_current_state(TASK_INTERRUPTIBLE); |
4994 | while (!kthread_should_stop()) { | 5004 | while (!kthread_should_stop()) { |
4995 | struct migration_req *req; | 5005 | struct migration_req *req; |
4996 | struct list_head *head; | 5006 | struct list_head *head; |
4997 | 5007 | ||
4998 | spin_lock_irq(&rq->lock); | 5008 | spin_lock_irq(&rq->lock); |
4999 | 5009 | ||
5000 | if (cpu_is_offline(cpu)) { | 5010 | if (cpu_is_offline(cpu)) { |
5001 | spin_unlock_irq(&rq->lock); | 5011 | spin_unlock_irq(&rq->lock); |
5002 | goto wait_to_die; | 5012 | goto wait_to_die; |
5003 | } | 5013 | } |
5004 | 5014 | ||
5005 | if (rq->active_balance) { | 5015 | if (rq->active_balance) { |
5006 | active_load_balance(rq, cpu); | 5016 | active_load_balance(rq, cpu); |
5007 | rq->active_balance = 0; | 5017 | rq->active_balance = 0; |
5008 | } | 5018 | } |
5009 | 5019 | ||
5010 | head = &rq->migration_queue; | 5020 | head = &rq->migration_queue; |
5011 | 5021 | ||
5012 | if (list_empty(head)) { | 5022 | if (list_empty(head)) { |
5013 | spin_unlock_irq(&rq->lock); | 5023 | spin_unlock_irq(&rq->lock); |
5014 | schedule(); | 5024 | schedule(); |
5015 | set_current_state(TASK_INTERRUPTIBLE); | 5025 | set_current_state(TASK_INTERRUPTIBLE); |
5016 | continue; | 5026 | continue; |
5017 | } | 5027 | } |
5018 | req = list_entry(head->next, struct migration_req, list); | 5028 | req = list_entry(head->next, struct migration_req, list); |
5019 | list_del_init(head->next); | 5029 | list_del_init(head->next); |
5020 | 5030 | ||
5021 | spin_unlock(&rq->lock); | 5031 | spin_unlock(&rq->lock); |
5022 | __migrate_task(req->task, cpu, req->dest_cpu); | 5032 | __migrate_task(req->task, cpu, req->dest_cpu); |
5023 | local_irq_enable(); | 5033 | local_irq_enable(); |
5024 | 5034 | ||
5025 | complete(&req->done); | 5035 | complete(&req->done); |
5026 | } | 5036 | } |
5027 | __set_current_state(TASK_RUNNING); | 5037 | __set_current_state(TASK_RUNNING); |
5028 | return 0; | 5038 | return 0; |
5029 | 5039 | ||
5030 | wait_to_die: | 5040 | wait_to_die: |
5031 | /* Wait for kthread_stop */ | 5041 | /* Wait for kthread_stop */ |
5032 | set_current_state(TASK_INTERRUPTIBLE); | 5042 | set_current_state(TASK_INTERRUPTIBLE); |
5033 | while (!kthread_should_stop()) { | 5043 | while (!kthread_should_stop()) { |
5034 | schedule(); | 5044 | schedule(); |
5035 | set_current_state(TASK_INTERRUPTIBLE); | 5045 | set_current_state(TASK_INTERRUPTIBLE); |
5036 | } | 5046 | } |
5037 | __set_current_state(TASK_RUNNING); | 5047 | __set_current_state(TASK_RUNNING); |
5038 | return 0; | 5048 | return 0; |
5039 | } | 5049 | } |
5040 | 5050 | ||
5041 | #ifdef CONFIG_HOTPLUG_CPU | 5051 | #ifdef CONFIG_HOTPLUG_CPU |
5042 | /* | 5052 | /* |
5043 | * Figure out where task on dead CPU should go, use force if necessary. | 5053 | * Figure out where task on dead CPU should go, use force if necessary. |
5044 | * NOTE: interrupts should be disabled by the caller | 5054 | * NOTE: interrupts should be disabled by the caller |
5045 | */ | 5055 | */ |
5046 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | 5056 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) |
5047 | { | 5057 | { |
5048 | unsigned long flags; | 5058 | unsigned long flags; |
5049 | cpumask_t mask; | 5059 | cpumask_t mask; |
5050 | struct rq *rq; | 5060 | struct rq *rq; |
5051 | int dest_cpu; | 5061 | int dest_cpu; |
5052 | 5062 | ||
5053 | restart: | 5063 | restart: |
5054 | /* On same node? */ | 5064 | /* On same node? */ |
5055 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); | 5065 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); |
5056 | cpus_and(mask, mask, p->cpus_allowed); | 5066 | cpus_and(mask, mask, p->cpus_allowed); |
5057 | dest_cpu = any_online_cpu(mask); | 5067 | dest_cpu = any_online_cpu(mask); |
5058 | 5068 | ||
5059 | /* On any allowed CPU? */ | 5069 | /* On any allowed CPU? */ |
5060 | if (dest_cpu == NR_CPUS) | 5070 | if (dest_cpu == NR_CPUS) |
5061 | dest_cpu = any_online_cpu(p->cpus_allowed); | 5071 | dest_cpu = any_online_cpu(p->cpus_allowed); |
5062 | 5072 | ||
5063 | /* No more Mr. Nice Guy. */ | 5073 | /* No more Mr. Nice Guy. */ |
5064 | if (dest_cpu == NR_CPUS) { | 5074 | if (dest_cpu == NR_CPUS) { |
5065 | rq = task_rq_lock(p, &flags); | 5075 | rq = task_rq_lock(p, &flags); |
5066 | cpus_setall(p->cpus_allowed); | 5076 | cpus_setall(p->cpus_allowed); |
5067 | dest_cpu = any_online_cpu(p->cpus_allowed); | 5077 | dest_cpu = any_online_cpu(p->cpus_allowed); |
5068 | task_rq_unlock(rq, &flags); | 5078 | task_rq_unlock(rq, &flags); |
5069 | 5079 | ||
5070 | /* | 5080 | /* |
5071 | * Don't tell them about moving exiting tasks or | 5081 | * Don't tell them about moving exiting tasks or |
5072 | * kernel threads (both mm NULL), since they never | 5082 | * kernel threads (both mm NULL), since they never |
5073 | * leave the kernel. | 5083 | * leave the kernel. |
5074 | */ | 5084 | */ |
5075 | if (p->mm && printk_ratelimit()) | 5085 | if (p->mm && printk_ratelimit()) |
5076 | printk(KERN_INFO "process %d (%s) no " | 5086 | printk(KERN_INFO "process %d (%s) no " |
5077 | "longer affine to cpu%d\n", | 5087 | "longer affine to cpu%d\n", |
5078 | p->pid, p->comm, dead_cpu); | 5088 | p->pid, p->comm, dead_cpu); |
5079 | } | 5089 | } |
5080 | if (!__migrate_task(p, dead_cpu, dest_cpu)) | 5090 | if (!__migrate_task(p, dead_cpu, dest_cpu)) |
5081 | goto restart; | 5091 | goto restart; |
5082 | } | 5092 | } |
5083 | 5093 | ||
5084 | /* | 5094 | /* |
5085 | * While a dead CPU has no uninterruptible tasks queued at this point, | 5095 | * While a dead CPU has no uninterruptible tasks queued at this point, |
5086 | * it might still have a nonzero ->nr_uninterruptible counter, because | 5096 | * it might still have a nonzero ->nr_uninterruptible counter, because |
5087 | * for performance reasons the counter is not strictly tracking tasks to | 5097 | * for performance reasons the counter is not strictly tracking tasks to |
5088 | * their home CPUs. So we just add the counter to another CPU's counter, | 5098 | * their home CPUs. So we just add the counter to another CPU's counter, |
5089 | * to keep the global sum constant after CPU-down: | 5099 | * to keep the global sum constant after CPU-down: |
5090 | */ | 5100 | */ |
5091 | static void migrate_nr_uninterruptible(struct rq *rq_src) | 5101 | static void migrate_nr_uninterruptible(struct rq *rq_src) |
5092 | { | 5102 | { |
5093 | struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); | 5103 | struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); |
5094 | unsigned long flags; | 5104 | unsigned long flags; |
5095 | 5105 | ||
5096 | local_irq_save(flags); | 5106 | local_irq_save(flags); |
5097 | double_rq_lock(rq_src, rq_dest); | 5107 | double_rq_lock(rq_src, rq_dest); |
5098 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; | 5108 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; |
5099 | rq_src->nr_uninterruptible = 0; | 5109 | rq_src->nr_uninterruptible = 0; |
5100 | double_rq_unlock(rq_src, rq_dest); | 5110 | double_rq_unlock(rq_src, rq_dest); |
5101 | local_irq_restore(flags); | 5111 | local_irq_restore(flags); |
5102 | } | 5112 | } |
5103 | 5113 | ||
5104 | /* Run through task list and migrate tasks from the dead cpu. */ | 5114 | /* Run through task list and migrate tasks from the dead cpu. */ |
5105 | static void migrate_live_tasks(int src_cpu) | 5115 | static void migrate_live_tasks(int src_cpu) |
5106 | { | 5116 | { |
5107 | struct task_struct *p, *t; | 5117 | struct task_struct *p, *t; |
5108 | 5118 | ||
5109 | write_lock_irq(&tasklist_lock); | 5119 | write_lock_irq(&tasklist_lock); |
5110 | 5120 | ||
5111 | do_each_thread(t, p) { | 5121 | do_each_thread(t, p) { |
5112 | if (p == current) | 5122 | if (p == current) |
5113 | continue; | 5123 | continue; |
5114 | 5124 | ||
5115 | if (task_cpu(p) == src_cpu) | 5125 | if (task_cpu(p) == src_cpu) |
5116 | move_task_off_dead_cpu(src_cpu, p); | 5126 | move_task_off_dead_cpu(src_cpu, p); |
5117 | } while_each_thread(t, p); | 5127 | } while_each_thread(t, p); |
5118 | 5128 | ||
5119 | write_unlock_irq(&tasklist_lock); | 5129 | write_unlock_irq(&tasklist_lock); |
5120 | } | 5130 | } |
5121 | 5131 | ||
5122 | /* | 5132 | /* |
5123 | * Schedules idle task to be the next runnable task on current CPU. | 5133 | * Schedules idle task to be the next runnable task on current CPU. |
5124 | * It does so by boosting its priority to highest possible and adding it to | 5134 | * It does so by boosting its priority to highest possible and adding it to |
5125 | * the _front_ of the runqueue. Used by CPU offline code. | 5135 | * the _front_ of the runqueue. Used by CPU offline code. |
5126 | */ | 5136 | */ |
5127 | void sched_idle_next(void) | 5137 | void sched_idle_next(void) |
5128 | { | 5138 | { |
5129 | int this_cpu = smp_processor_id(); | 5139 | int this_cpu = smp_processor_id(); |
5130 | struct rq *rq = cpu_rq(this_cpu); | 5140 | struct rq *rq = cpu_rq(this_cpu); |
5131 | struct task_struct *p = rq->idle; | 5141 | struct task_struct *p = rq->idle; |
5132 | unsigned long flags; | 5142 | unsigned long flags; |
5133 | 5143 | ||
5134 | /* cpu has to be offline */ | 5144 | /* cpu has to be offline */ |
5135 | BUG_ON(cpu_online(this_cpu)); | 5145 | BUG_ON(cpu_online(this_cpu)); |
5136 | 5146 | ||
5137 | /* | 5147 | /* |
5138 | * Strictly not necessary since the rest of the CPUs are stopped by now | 5148 | * Strictly not necessary since the rest of the CPUs are stopped by now |
5139 | * and interrupts disabled on the current cpu. | 5149 | * and interrupts disabled on the current cpu. |
5140 | */ | 5150 | */ |
5141 | spin_lock_irqsave(&rq->lock, flags); | 5151 | spin_lock_irqsave(&rq->lock, flags); |
5142 | 5152 | ||
5143 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); | 5153 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); |
5144 | 5154 | ||
5145 | /* Add idle task to the _front_ of its priority queue: */ | 5155 | /* Add idle task to the _front_ of its priority queue: */ |
5146 | activate_idle_task(p, rq); | 5156 | activate_idle_task(p, rq); |
5147 | 5157 | ||
5148 | spin_unlock_irqrestore(&rq->lock, flags); | 5158 | spin_unlock_irqrestore(&rq->lock, flags); |
5149 | } | 5159 | } |
5150 | 5160 | ||
5151 | /* | 5161 | /* |
5152 | * Ensures that the idle task is using init_mm right before its cpu goes | 5162 | * Ensures that the idle task is using init_mm right before its cpu goes |
5153 | * offline. | 5163 | * offline. |
5154 | */ | 5164 | */ |
5155 | void idle_task_exit(void) | 5165 | void idle_task_exit(void) |
5156 | { | 5166 | { |
5157 | struct mm_struct *mm = current->active_mm; | 5167 | struct mm_struct *mm = current->active_mm; |
5158 | 5168 | ||
5159 | BUG_ON(cpu_online(smp_processor_id())); | 5169 | BUG_ON(cpu_online(smp_processor_id())); |
5160 | 5170 | ||
5161 | if (mm != &init_mm) | 5171 | if (mm != &init_mm) |
5162 | switch_mm(mm, &init_mm, current); | 5172 | switch_mm(mm, &init_mm, current); |
5163 | mmdrop(mm); | 5173 | mmdrop(mm); |
5164 | } | 5174 | } |
5165 | 5175 | ||
5166 | /* called under rq->lock with disabled interrupts */ | 5176 | /* called under rq->lock with disabled interrupts */ |
5167 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) | 5177 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) |
5168 | { | 5178 | { |
5169 | struct rq *rq = cpu_rq(dead_cpu); | 5179 | struct rq *rq = cpu_rq(dead_cpu); |
5170 | 5180 | ||
5171 | /* Must be exiting, otherwise would be on tasklist. */ | 5181 | /* Must be exiting, otherwise would be on tasklist. */ |
5172 | BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); | 5182 | BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); |
5173 | 5183 | ||
5174 | /* Cannot have done final schedule yet: would have vanished. */ | 5184 | /* Cannot have done final schedule yet: would have vanished. */ |
5175 | BUG_ON(p->state == TASK_DEAD); | 5185 | BUG_ON(p->state == TASK_DEAD); |
5176 | 5186 | ||
5177 | get_task_struct(p); | 5187 | get_task_struct(p); |
5178 | 5188 | ||
5179 | /* | 5189 | /* |
5180 | * Drop lock around migration; if someone else moves it, | 5190 | * Drop lock around migration; if someone else moves it, |
5181 | * that's OK. No task can be added to this CPU, so iteration is | 5191 | * that's OK. No task can be added to this CPU, so iteration is |
5182 | * fine. | 5192 | * fine. |
5183 | * NOTE: interrupts should be left disabled --dev@ | 5193 | * NOTE: interrupts should be left disabled --dev@ |
5184 | */ | 5194 | */ |
5185 | spin_unlock(&rq->lock); | 5195 | spin_unlock(&rq->lock); |
5186 | move_task_off_dead_cpu(dead_cpu, p); | 5196 | move_task_off_dead_cpu(dead_cpu, p); |
5187 | spin_lock(&rq->lock); | 5197 | spin_lock(&rq->lock); |
5188 | 5198 | ||
5189 | put_task_struct(p); | 5199 | put_task_struct(p); |
5190 | } | 5200 | } |
5191 | 5201 | ||
5192 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ | 5202 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ |
5193 | static void migrate_dead_tasks(unsigned int dead_cpu) | 5203 | static void migrate_dead_tasks(unsigned int dead_cpu) |
5194 | { | 5204 | { |
5195 | struct rq *rq = cpu_rq(dead_cpu); | 5205 | struct rq *rq = cpu_rq(dead_cpu); |
5196 | struct task_struct *next; | 5206 | struct task_struct *next; |
5197 | 5207 | ||
5198 | for ( ; ; ) { | 5208 | for ( ; ; ) { |
5199 | if (!rq->nr_running) | 5209 | if (!rq->nr_running) |
5200 | break; | 5210 | break; |
5201 | next = pick_next_task(rq, rq->curr, rq_clock(rq)); | 5211 | next = pick_next_task(rq, rq->curr, rq_clock(rq)); |
5202 | if (!next) | 5212 | if (!next) |
5203 | break; | 5213 | break; |
5204 | migrate_dead(dead_cpu, next); | 5214 | migrate_dead(dead_cpu, next); |
5205 | 5215 | ||
5206 | } | 5216 | } |
5207 | } | 5217 | } |
5208 | #endif /* CONFIG_HOTPLUG_CPU */ | 5218 | #endif /* CONFIG_HOTPLUG_CPU */ |
5209 | 5219 | ||
5210 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) | 5220 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) |
5211 | 5221 | ||
5212 | static struct ctl_table sd_ctl_dir[] = { | 5222 | static struct ctl_table sd_ctl_dir[] = { |
5213 | {CTL_UNNUMBERED, "sched_domain", NULL, 0, 0755, NULL, }, | 5223 | {CTL_UNNUMBERED, "sched_domain", NULL, 0, 0755, NULL, }, |
5214 | {0,}, | 5224 | {0,}, |
5215 | }; | 5225 | }; |
5216 | 5226 | ||
5217 | static struct ctl_table sd_ctl_root[] = { | 5227 | static struct ctl_table sd_ctl_root[] = { |
5218 | {CTL_UNNUMBERED, "kernel", NULL, 0, 0755, sd_ctl_dir, }, | 5228 | {CTL_UNNUMBERED, "kernel", NULL, 0, 0755, sd_ctl_dir, }, |
5219 | {0,}, | 5229 | {0,}, |
5220 | }; | 5230 | }; |
5221 | 5231 | ||
5222 | static struct ctl_table *sd_alloc_ctl_entry(int n) | 5232 | static struct ctl_table *sd_alloc_ctl_entry(int n) |
5223 | { | 5233 | { |
5224 | struct ctl_table *entry = | 5234 | struct ctl_table *entry = |
5225 | kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL); | 5235 | kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL); |
5226 | 5236 | ||
5227 | BUG_ON(!entry); | 5237 | BUG_ON(!entry); |
5228 | memset(entry, 0, n * sizeof(struct ctl_table)); | 5238 | memset(entry, 0, n * sizeof(struct ctl_table)); |
5229 | 5239 | ||
5230 | return entry; | 5240 | return entry; |
5231 | } | 5241 | } |
5232 | 5242 | ||
5233 | static void | 5243 | static void |
5234 | set_table_entry(struct ctl_table *entry, int ctl_name, | 5244 | set_table_entry(struct ctl_table *entry, int ctl_name, |
5235 | const char *procname, void *data, int maxlen, | 5245 | const char *procname, void *data, int maxlen, |
5236 | mode_t mode, proc_handler *proc_handler) | 5246 | mode_t mode, proc_handler *proc_handler) |
5237 | { | 5247 | { |
5238 | entry->ctl_name = ctl_name; | 5248 | entry->ctl_name = ctl_name; |
5239 | entry->procname = procname; | 5249 | entry->procname = procname; |
5240 | entry->data = data; | 5250 | entry->data = data; |
5241 | entry->maxlen = maxlen; | 5251 | entry->maxlen = maxlen; |
5242 | entry->mode = mode; | 5252 | entry->mode = mode; |
5243 | entry->proc_handler = proc_handler; | 5253 | entry->proc_handler = proc_handler; |
5244 | } | 5254 | } |
5245 | 5255 | ||
5246 | static struct ctl_table * | 5256 | static struct ctl_table * |
5247 | sd_alloc_ctl_domain_table(struct sched_domain *sd) | 5257 | sd_alloc_ctl_domain_table(struct sched_domain *sd) |
5248 | { | 5258 | { |
5249 | struct ctl_table *table = sd_alloc_ctl_entry(14); | 5259 | struct ctl_table *table = sd_alloc_ctl_entry(14); |
5250 | 5260 | ||
5251 | set_table_entry(&table[0], 1, "min_interval", &sd->min_interval, | 5261 | set_table_entry(&table[0], 1, "min_interval", &sd->min_interval, |
5252 | sizeof(long), 0644, proc_doulongvec_minmax); | 5262 | sizeof(long), 0644, proc_doulongvec_minmax); |
5253 | set_table_entry(&table[1], 2, "max_interval", &sd->max_interval, | 5263 | set_table_entry(&table[1], 2, "max_interval", &sd->max_interval, |
5254 | sizeof(long), 0644, proc_doulongvec_minmax); | 5264 | sizeof(long), 0644, proc_doulongvec_minmax); |
5255 | set_table_entry(&table[2], 3, "busy_idx", &sd->busy_idx, | 5265 | set_table_entry(&table[2], 3, "busy_idx", &sd->busy_idx, |
5256 | sizeof(int), 0644, proc_dointvec_minmax); | 5266 | sizeof(int), 0644, proc_dointvec_minmax); |
5257 | set_table_entry(&table[3], 4, "idle_idx", &sd->idle_idx, | 5267 | set_table_entry(&table[3], 4, "idle_idx", &sd->idle_idx, |
5258 | sizeof(int), 0644, proc_dointvec_minmax); | 5268 | sizeof(int), 0644, proc_dointvec_minmax); |
5259 | set_table_entry(&table[4], 5, "newidle_idx", &sd->newidle_idx, | 5269 | set_table_entry(&table[4], 5, "newidle_idx", &sd->newidle_idx, |
5260 | sizeof(int), 0644, proc_dointvec_minmax); | 5270 | sizeof(int), 0644, proc_dointvec_minmax); |
5261 | set_table_entry(&table[5], 6, "wake_idx", &sd->wake_idx, | 5271 | set_table_entry(&table[5], 6, "wake_idx", &sd->wake_idx, |
5262 | sizeof(int), 0644, proc_dointvec_minmax); | 5272 | sizeof(int), 0644, proc_dointvec_minmax); |
5263 | set_table_entry(&table[6], 7, "forkexec_idx", &sd->forkexec_idx, | 5273 | set_table_entry(&table[6], 7, "forkexec_idx", &sd->forkexec_idx, |
5264 | sizeof(int), 0644, proc_dointvec_minmax); | 5274 | sizeof(int), 0644, proc_dointvec_minmax); |
5265 | set_table_entry(&table[7], 8, "busy_factor", &sd->busy_factor, | 5275 | set_table_entry(&table[7], 8, "busy_factor", &sd->busy_factor, |
5266 | sizeof(int), 0644, proc_dointvec_minmax); | 5276 | sizeof(int), 0644, proc_dointvec_minmax); |
5267 | set_table_entry(&table[8], 9, "imbalance_pct", &sd->imbalance_pct, | 5277 | set_table_entry(&table[8], 9, "imbalance_pct", &sd->imbalance_pct, |
5268 | sizeof(int), 0644, proc_dointvec_minmax); | 5278 | sizeof(int), 0644, proc_dointvec_minmax); |
5269 | set_table_entry(&table[10], 11, "cache_nice_tries", | 5279 | set_table_entry(&table[10], 11, "cache_nice_tries", |
5270 | &sd->cache_nice_tries, | 5280 | &sd->cache_nice_tries, |
5271 | sizeof(int), 0644, proc_dointvec_minmax); | 5281 | sizeof(int), 0644, proc_dointvec_minmax); |
5272 | set_table_entry(&table[12], 13, "flags", &sd->flags, | 5282 | set_table_entry(&table[12], 13, "flags", &sd->flags, |
5273 | sizeof(int), 0644, proc_dointvec_minmax); | 5283 | sizeof(int), 0644, proc_dointvec_minmax); |
5274 | 5284 | ||
5275 | return table; | 5285 | return table; |
5276 | } | 5286 | } |
5277 | 5287 | ||
5278 | static ctl_table *sd_alloc_ctl_cpu_table(int cpu) | 5288 | static ctl_table *sd_alloc_ctl_cpu_table(int cpu) |
5279 | { | 5289 | { |
5280 | struct ctl_table *entry, *table; | 5290 | struct ctl_table *entry, *table; |
5281 | struct sched_domain *sd; | 5291 | struct sched_domain *sd; |
5282 | int domain_num = 0, i; | 5292 | int domain_num = 0, i; |
5283 | char buf[32]; | 5293 | char buf[32]; |
5284 | 5294 | ||
5285 | for_each_domain(cpu, sd) | 5295 | for_each_domain(cpu, sd) |
5286 | domain_num++; | 5296 | domain_num++; |
5287 | entry = table = sd_alloc_ctl_entry(domain_num + 1); | 5297 | entry = table = sd_alloc_ctl_entry(domain_num + 1); |
5288 | 5298 | ||
5289 | i = 0; | 5299 | i = 0; |
5290 | for_each_domain(cpu, sd) { | 5300 | for_each_domain(cpu, sd) { |
5291 | snprintf(buf, 32, "domain%d", i); | 5301 | snprintf(buf, 32, "domain%d", i); |
5292 | entry->ctl_name = i + 1; | 5302 | entry->ctl_name = i + 1; |
5293 | entry->procname = kstrdup(buf, GFP_KERNEL); | 5303 | entry->procname = kstrdup(buf, GFP_KERNEL); |
5294 | entry->mode = 0755; | 5304 | entry->mode = 0755; |
5295 | entry->child = sd_alloc_ctl_domain_table(sd); | 5305 | entry->child = sd_alloc_ctl_domain_table(sd); |
5296 | entry++; | 5306 | entry++; |
5297 | i++; | 5307 | i++; |
5298 | } | 5308 | } |
5299 | return table; | 5309 | return table; |
5300 | } | 5310 | } |
5301 | 5311 | ||
5302 | static struct ctl_table_header *sd_sysctl_header; | 5312 | static struct ctl_table_header *sd_sysctl_header; |
5303 | static void init_sched_domain_sysctl(void) | 5313 | static void init_sched_domain_sysctl(void) |
5304 | { | 5314 | { |
5305 | int i, cpu_num = num_online_cpus(); | 5315 | int i, cpu_num = num_online_cpus(); |
5306 | struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); | 5316 | struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); |
5307 | char buf[32]; | 5317 | char buf[32]; |
5308 | 5318 | ||
5309 | sd_ctl_dir[0].child = entry; | 5319 | sd_ctl_dir[0].child = entry; |
5310 | 5320 | ||
5311 | for (i = 0; i < cpu_num; i++, entry++) { | 5321 | for (i = 0; i < cpu_num; i++, entry++) { |
5312 | snprintf(buf, 32, "cpu%d", i); | 5322 | snprintf(buf, 32, "cpu%d", i); |
5313 | entry->ctl_name = i + 1; | 5323 | entry->ctl_name = i + 1; |
5314 | entry->procname = kstrdup(buf, GFP_KERNEL); | 5324 | entry->procname = kstrdup(buf, GFP_KERNEL); |
5315 | entry->mode = 0755; | 5325 | entry->mode = 0755; |
5316 | entry->child = sd_alloc_ctl_cpu_table(i); | 5326 | entry->child = sd_alloc_ctl_cpu_table(i); |
5317 | } | 5327 | } |
5318 | sd_sysctl_header = register_sysctl_table(sd_ctl_root); | 5328 | sd_sysctl_header = register_sysctl_table(sd_ctl_root); |
5319 | } | 5329 | } |
5320 | #else | 5330 | #else |
5321 | static void init_sched_domain_sysctl(void) | 5331 | static void init_sched_domain_sysctl(void) |
5322 | { | 5332 | { |
5323 | } | 5333 | } |
5324 | #endif | 5334 | #endif |
5325 | 5335 | ||
5326 | /* | 5336 | /* |
5327 | * migration_call - callback that gets triggered when a CPU is added. | 5337 | * migration_call - callback that gets triggered when a CPU is added. |
5328 | * Here we can start up the necessary migration thread for the new CPU. | 5338 | * Here we can start up the necessary migration thread for the new CPU. |
5329 | */ | 5339 | */ |
5330 | static int __cpuinit | 5340 | static int __cpuinit |
5331 | migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | 5341 | migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) |
5332 | { | 5342 | { |
5333 | struct task_struct *p; | 5343 | struct task_struct *p; |
5334 | int cpu = (long)hcpu; | 5344 | int cpu = (long)hcpu; |
5335 | unsigned long flags; | 5345 | unsigned long flags; |
5336 | struct rq *rq; | 5346 | struct rq *rq; |
5337 | 5347 | ||
5338 | switch (action) { | 5348 | switch (action) { |
5339 | case CPU_LOCK_ACQUIRE: | 5349 | case CPU_LOCK_ACQUIRE: |
5340 | mutex_lock(&sched_hotcpu_mutex); | 5350 | mutex_lock(&sched_hotcpu_mutex); |
5341 | break; | 5351 | break; |
5342 | 5352 | ||
5343 | case CPU_UP_PREPARE: | 5353 | case CPU_UP_PREPARE: |
5344 | case CPU_UP_PREPARE_FROZEN: | 5354 | case CPU_UP_PREPARE_FROZEN: |
5345 | p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); | 5355 | p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); |
5346 | if (IS_ERR(p)) | 5356 | if (IS_ERR(p)) |
5347 | return NOTIFY_BAD; | 5357 | return NOTIFY_BAD; |
5348 | kthread_bind(p, cpu); | 5358 | kthread_bind(p, cpu); |
5349 | /* Must be high prio: stop_machine expects to yield to it. */ | 5359 | /* Must be high prio: stop_machine expects to yield to it. */ |
5350 | rq = task_rq_lock(p, &flags); | 5360 | rq = task_rq_lock(p, &flags); |
5351 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); | 5361 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); |
5352 | task_rq_unlock(rq, &flags); | 5362 | task_rq_unlock(rq, &flags); |
5353 | cpu_rq(cpu)->migration_thread = p; | 5363 | cpu_rq(cpu)->migration_thread = p; |
5354 | break; | 5364 | break; |
5355 | 5365 | ||
5356 | case CPU_ONLINE: | 5366 | case CPU_ONLINE: |
5357 | case CPU_ONLINE_FROZEN: | 5367 | case CPU_ONLINE_FROZEN: |
5358 | /* Strictly unnecessary, as first user will wake it. */ | 5368 | /* Strictly unnecessary, as first user will wake it. */ |
5359 | wake_up_process(cpu_rq(cpu)->migration_thread); | 5369 | wake_up_process(cpu_rq(cpu)->migration_thread); |
5360 | break; | 5370 | break; |
5361 | 5371 | ||
5362 | #ifdef CONFIG_HOTPLUG_CPU | 5372 | #ifdef CONFIG_HOTPLUG_CPU |
5363 | case CPU_UP_CANCELED: | 5373 | case CPU_UP_CANCELED: |
5364 | case CPU_UP_CANCELED_FROZEN: | 5374 | case CPU_UP_CANCELED_FROZEN: |
5365 | if (!cpu_rq(cpu)->migration_thread) | 5375 | if (!cpu_rq(cpu)->migration_thread) |
5366 | break; | 5376 | break; |
5367 | /* Unbind it from offline cpu so it can run. Fall thru. */ | 5377 | /* Unbind it from offline cpu so it can run. Fall thru. */ |
5368 | kthread_bind(cpu_rq(cpu)->migration_thread, | 5378 | kthread_bind(cpu_rq(cpu)->migration_thread, |
5369 | any_online_cpu(cpu_online_map)); | 5379 | any_online_cpu(cpu_online_map)); |
5370 | kthread_stop(cpu_rq(cpu)->migration_thread); | 5380 | kthread_stop(cpu_rq(cpu)->migration_thread); |
5371 | cpu_rq(cpu)->migration_thread = NULL; | 5381 | cpu_rq(cpu)->migration_thread = NULL; |
5372 | break; | 5382 | break; |
5373 | 5383 | ||
5374 | case CPU_DEAD: | 5384 | case CPU_DEAD: |
5375 | case CPU_DEAD_FROZEN: | 5385 | case CPU_DEAD_FROZEN: |
5376 | migrate_live_tasks(cpu); | 5386 | migrate_live_tasks(cpu); |
5377 | rq = cpu_rq(cpu); | 5387 | rq = cpu_rq(cpu); |
5378 | kthread_stop(rq->migration_thread); | 5388 | kthread_stop(rq->migration_thread); |
5379 | rq->migration_thread = NULL; | 5389 | rq->migration_thread = NULL; |
5380 | /* Idle task back to normal (off runqueue, low prio) */ | 5390 | /* Idle task back to normal (off runqueue, low prio) */ |
5381 | rq = task_rq_lock(rq->idle, &flags); | 5391 | rq = task_rq_lock(rq->idle, &flags); |
5382 | deactivate_task(rq, rq->idle, 0); | 5392 | deactivate_task(rq, rq->idle, 0); |
5383 | rq->idle->static_prio = MAX_PRIO; | 5393 | rq->idle->static_prio = MAX_PRIO; |
5384 | __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); | 5394 | __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); |
5385 | rq->idle->sched_class = &idle_sched_class; | 5395 | rq->idle->sched_class = &idle_sched_class; |
5386 | migrate_dead_tasks(cpu); | 5396 | migrate_dead_tasks(cpu); |
5387 | task_rq_unlock(rq, &flags); | 5397 | task_rq_unlock(rq, &flags); |
5388 | migrate_nr_uninterruptible(rq); | 5398 | migrate_nr_uninterruptible(rq); |
5389 | BUG_ON(rq->nr_running != 0); | 5399 | BUG_ON(rq->nr_running != 0); |
5390 | 5400 | ||
5391 | /* No need to migrate the tasks: it was best-effort if | 5401 | /* No need to migrate the tasks: it was best-effort if |
5392 | * they didn't take sched_hotcpu_mutex. Just wake up | 5402 | * they didn't take sched_hotcpu_mutex. Just wake up |
5393 | * the requestors. */ | 5403 | * the requestors. */ |
5394 | spin_lock_irq(&rq->lock); | 5404 | spin_lock_irq(&rq->lock); |
5395 | while (!list_empty(&rq->migration_queue)) { | 5405 | while (!list_empty(&rq->migration_queue)) { |
5396 | struct migration_req *req; | 5406 | struct migration_req *req; |
5397 | 5407 | ||
5398 | req = list_entry(rq->migration_queue.next, | 5408 | req = list_entry(rq->migration_queue.next, |
5399 | struct migration_req, list); | 5409 | struct migration_req, list); |
5400 | list_del_init(&req->list); | 5410 | list_del_init(&req->list); |
5401 | complete(&req->done); | 5411 | complete(&req->done); |
5402 | } | 5412 | } |
5403 | spin_unlock_irq(&rq->lock); | 5413 | spin_unlock_irq(&rq->lock); |
5404 | break; | 5414 | break; |
5405 | #endif | 5415 | #endif |
5406 | case CPU_LOCK_RELEASE: | 5416 | case CPU_LOCK_RELEASE: |
5407 | mutex_unlock(&sched_hotcpu_mutex); | 5417 | mutex_unlock(&sched_hotcpu_mutex); |
5408 | break; | 5418 | break; |
5409 | } | 5419 | } |
5410 | return NOTIFY_OK; | 5420 | return NOTIFY_OK; |
5411 | } | 5421 | } |
5412 | 5422 | ||
5413 | /* Register at highest priority so that task migration (migrate_all_tasks) | 5423 | /* Register at highest priority so that task migration (migrate_all_tasks) |
5414 | * happens before everything else. | 5424 | * happens before everything else. |
5415 | */ | 5425 | */ |
5416 | static struct notifier_block __cpuinitdata migration_notifier = { | 5426 | static struct notifier_block __cpuinitdata migration_notifier = { |
5417 | .notifier_call = migration_call, | 5427 | .notifier_call = migration_call, |
5418 | .priority = 10 | 5428 | .priority = 10 |
5419 | }; | 5429 | }; |
5420 | 5430 | ||
5421 | int __init migration_init(void) | 5431 | int __init migration_init(void) |
5422 | { | 5432 | { |
5423 | void *cpu = (void *)(long)smp_processor_id(); | 5433 | void *cpu = (void *)(long)smp_processor_id(); |
5424 | int err; | 5434 | int err; |
5425 | 5435 | ||
5426 | /* Start one for the boot CPU: */ | 5436 | /* Start one for the boot CPU: */ |
5427 | err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); | 5437 | err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); |
5428 | BUG_ON(err == NOTIFY_BAD); | 5438 | BUG_ON(err == NOTIFY_BAD); |
5429 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | 5439 | migration_call(&migration_notifier, CPU_ONLINE, cpu); |
5430 | register_cpu_notifier(&migration_notifier); | 5440 | register_cpu_notifier(&migration_notifier); |
5431 | 5441 | ||
5432 | return 0; | 5442 | return 0; |
5433 | } | 5443 | } |
5434 | #endif | 5444 | #endif |
5435 | 5445 | ||
5436 | #ifdef CONFIG_SMP | 5446 | #ifdef CONFIG_SMP |
5437 | 5447 | ||
5438 | /* Number of possible processor ids */ | 5448 | /* Number of possible processor ids */ |
5439 | int nr_cpu_ids __read_mostly = NR_CPUS; | 5449 | int nr_cpu_ids __read_mostly = NR_CPUS; |
5440 | EXPORT_SYMBOL(nr_cpu_ids); | 5450 | EXPORT_SYMBOL(nr_cpu_ids); |
5441 | 5451 | ||
5442 | #undef SCHED_DOMAIN_DEBUG | 5452 | #undef SCHED_DOMAIN_DEBUG |
5443 | #ifdef SCHED_DOMAIN_DEBUG | 5453 | #ifdef SCHED_DOMAIN_DEBUG |
5444 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 5454 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
5445 | { | 5455 | { |
5446 | int level = 0; | 5456 | int level = 0; |
5447 | 5457 | ||
5448 | if (!sd) { | 5458 | if (!sd) { |
5449 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); | 5459 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); |
5450 | return; | 5460 | return; |
5451 | } | 5461 | } |
5452 | 5462 | ||
5453 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | 5463 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); |
5454 | 5464 | ||
5455 | do { | 5465 | do { |
5456 | int i; | 5466 | int i; |
5457 | char str[NR_CPUS]; | 5467 | char str[NR_CPUS]; |
5458 | struct sched_group *group = sd->groups; | 5468 | struct sched_group *group = sd->groups; |
5459 | cpumask_t groupmask; | 5469 | cpumask_t groupmask; |
5460 | 5470 | ||
5461 | cpumask_scnprintf(str, NR_CPUS, sd->span); | 5471 | cpumask_scnprintf(str, NR_CPUS, sd->span); |
5462 | cpus_clear(groupmask); | 5472 | cpus_clear(groupmask); |
5463 | 5473 | ||
5464 | printk(KERN_DEBUG); | 5474 | printk(KERN_DEBUG); |
5465 | for (i = 0; i < level + 1; i++) | 5475 | for (i = 0; i < level + 1; i++) |
5466 | printk(" "); | 5476 | printk(" "); |
5467 | printk("domain %d: ", level); | 5477 | printk("domain %d: ", level); |
5468 | 5478 | ||
5469 | if (!(sd->flags & SD_LOAD_BALANCE)) { | 5479 | if (!(sd->flags & SD_LOAD_BALANCE)) { |
5470 | printk("does not load-balance\n"); | 5480 | printk("does not load-balance\n"); |
5471 | if (sd->parent) | 5481 | if (sd->parent) |
5472 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" | 5482 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" |
5473 | " has parent"); | 5483 | " has parent"); |
5474 | break; | 5484 | break; |
5475 | } | 5485 | } |
5476 | 5486 | ||
5477 | printk("span %s\n", str); | 5487 | printk("span %s\n", str); |
5478 | 5488 | ||
5479 | if (!cpu_isset(cpu, sd->span)) | 5489 | if (!cpu_isset(cpu, sd->span)) |
5480 | printk(KERN_ERR "ERROR: domain->span does not contain " | 5490 | printk(KERN_ERR "ERROR: domain->span does not contain " |
5481 | "CPU%d\n", cpu); | 5491 | "CPU%d\n", cpu); |
5482 | if (!cpu_isset(cpu, group->cpumask)) | 5492 | if (!cpu_isset(cpu, group->cpumask)) |
5483 | printk(KERN_ERR "ERROR: domain->groups does not contain" | 5493 | printk(KERN_ERR "ERROR: domain->groups does not contain" |
5484 | " CPU%d\n", cpu); | 5494 | " CPU%d\n", cpu); |
5485 | 5495 | ||
5486 | printk(KERN_DEBUG); | 5496 | printk(KERN_DEBUG); |
5487 | for (i = 0; i < level + 2; i++) | 5497 | for (i = 0; i < level + 2; i++) |
5488 | printk(" "); | 5498 | printk(" "); |
5489 | printk("groups:"); | 5499 | printk("groups:"); |
5490 | do { | 5500 | do { |
5491 | if (!group) { | 5501 | if (!group) { |
5492 | printk("\n"); | 5502 | printk("\n"); |
5493 | printk(KERN_ERR "ERROR: group is NULL\n"); | 5503 | printk(KERN_ERR "ERROR: group is NULL\n"); |
5494 | break; | 5504 | break; |
5495 | } | 5505 | } |
5496 | 5506 | ||
5497 | if (!group->__cpu_power) { | 5507 | if (!group->__cpu_power) { |
5498 | printk("\n"); | 5508 | printk("\n"); |
5499 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 5509 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
5500 | "set\n"); | 5510 | "set\n"); |
5501 | } | 5511 | } |
5502 | 5512 | ||
5503 | if (!cpus_weight(group->cpumask)) { | 5513 | if (!cpus_weight(group->cpumask)) { |
5504 | printk("\n"); | 5514 | printk("\n"); |
5505 | printk(KERN_ERR "ERROR: empty group\n"); | 5515 | printk(KERN_ERR "ERROR: empty group\n"); |
5506 | } | 5516 | } |
5507 | 5517 | ||
5508 | if (cpus_intersects(groupmask, group->cpumask)) { | 5518 | if (cpus_intersects(groupmask, group->cpumask)) { |
5509 | printk("\n"); | 5519 | printk("\n"); |
5510 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | 5520 | printk(KERN_ERR "ERROR: repeated CPUs\n"); |
5511 | } | 5521 | } |
5512 | 5522 | ||
5513 | cpus_or(groupmask, groupmask, group->cpumask); | 5523 | cpus_or(groupmask, groupmask, group->cpumask); |
5514 | 5524 | ||
5515 | cpumask_scnprintf(str, NR_CPUS, group->cpumask); | 5525 | cpumask_scnprintf(str, NR_CPUS, group->cpumask); |
5516 | printk(" %s", str); | 5526 | printk(" %s", str); |
5517 | 5527 | ||
5518 | group = group->next; | 5528 | group = group->next; |
5519 | } while (group != sd->groups); | 5529 | } while (group != sd->groups); |
5520 | printk("\n"); | 5530 | printk("\n"); |
5521 | 5531 | ||
5522 | if (!cpus_equal(sd->span, groupmask)) | 5532 | if (!cpus_equal(sd->span, groupmask)) |
5523 | printk(KERN_ERR "ERROR: groups don't span " | 5533 | printk(KERN_ERR "ERROR: groups don't span " |
5524 | "domain->span\n"); | 5534 | "domain->span\n"); |
5525 | 5535 | ||
5526 | level++; | 5536 | level++; |
5527 | sd = sd->parent; | 5537 | sd = sd->parent; |
5528 | if (!sd) | 5538 | if (!sd) |
5529 | continue; | 5539 | continue; |
5530 | 5540 | ||
5531 | if (!cpus_subset(groupmask, sd->span)) | 5541 | if (!cpus_subset(groupmask, sd->span)) |
5532 | printk(KERN_ERR "ERROR: parent span is not a superset " | 5542 | printk(KERN_ERR "ERROR: parent span is not a superset " |
5533 | "of domain->span\n"); | 5543 | "of domain->span\n"); |
5534 | 5544 | ||
5535 | } while (sd); | 5545 | } while (sd); |
5536 | } | 5546 | } |
5537 | #else | 5547 | #else |
5538 | # define sched_domain_debug(sd, cpu) do { } while (0) | 5548 | # define sched_domain_debug(sd, cpu) do { } while (0) |
5539 | #endif | 5549 | #endif |
5540 | 5550 | ||
5541 | static int sd_degenerate(struct sched_domain *sd) | 5551 | static int sd_degenerate(struct sched_domain *sd) |
5542 | { | 5552 | { |
5543 | if (cpus_weight(sd->span) == 1) | 5553 | if (cpus_weight(sd->span) == 1) |
5544 | return 1; | 5554 | return 1; |
5545 | 5555 | ||
5546 | /* Following flags need at least 2 groups */ | 5556 | /* Following flags need at least 2 groups */ |
5547 | if (sd->flags & (SD_LOAD_BALANCE | | 5557 | if (sd->flags & (SD_LOAD_BALANCE | |
5548 | SD_BALANCE_NEWIDLE | | 5558 | SD_BALANCE_NEWIDLE | |
5549 | SD_BALANCE_FORK | | 5559 | SD_BALANCE_FORK | |
5550 | SD_BALANCE_EXEC | | 5560 | SD_BALANCE_EXEC | |
5551 | SD_SHARE_CPUPOWER | | 5561 | SD_SHARE_CPUPOWER | |
5552 | SD_SHARE_PKG_RESOURCES)) { | 5562 | SD_SHARE_PKG_RESOURCES)) { |
5553 | if (sd->groups != sd->groups->next) | 5563 | if (sd->groups != sd->groups->next) |
5554 | return 0; | 5564 | return 0; |
5555 | } | 5565 | } |
5556 | 5566 | ||
5557 | /* Following flags don't use groups */ | 5567 | /* Following flags don't use groups */ |
5558 | if (sd->flags & (SD_WAKE_IDLE | | 5568 | if (sd->flags & (SD_WAKE_IDLE | |
5559 | SD_WAKE_AFFINE | | 5569 | SD_WAKE_AFFINE | |
5560 | SD_WAKE_BALANCE)) | 5570 | SD_WAKE_BALANCE)) |
5561 | return 0; | 5571 | return 0; |
5562 | 5572 | ||
5563 | return 1; | 5573 | return 1; |
5564 | } | 5574 | } |
5565 | 5575 | ||
5566 | static int | 5576 | static int |
5567 | sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | 5577 | sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) |
5568 | { | 5578 | { |
5569 | unsigned long cflags = sd->flags, pflags = parent->flags; | 5579 | unsigned long cflags = sd->flags, pflags = parent->flags; |
5570 | 5580 | ||
5571 | if (sd_degenerate(parent)) | 5581 | if (sd_degenerate(parent)) |
5572 | return 1; | 5582 | return 1; |
5573 | 5583 | ||
5574 | if (!cpus_equal(sd->span, parent->span)) | 5584 | if (!cpus_equal(sd->span, parent->span)) |
5575 | return 0; | 5585 | return 0; |
5576 | 5586 | ||
5577 | /* Does parent contain flags not in child? */ | 5587 | /* Does parent contain flags not in child? */ |
5578 | /* WAKE_BALANCE is a subset of WAKE_AFFINE */ | 5588 | /* WAKE_BALANCE is a subset of WAKE_AFFINE */ |
5579 | if (cflags & SD_WAKE_AFFINE) | 5589 | if (cflags & SD_WAKE_AFFINE) |
5580 | pflags &= ~SD_WAKE_BALANCE; | 5590 | pflags &= ~SD_WAKE_BALANCE; |
5581 | /* Flags needing groups don't count if only 1 group in parent */ | 5591 | /* Flags needing groups don't count if only 1 group in parent */ |
5582 | if (parent->groups == parent->groups->next) { | 5592 | if (parent->groups == parent->groups->next) { |
5583 | pflags &= ~(SD_LOAD_BALANCE | | 5593 | pflags &= ~(SD_LOAD_BALANCE | |
5584 | SD_BALANCE_NEWIDLE | | 5594 | SD_BALANCE_NEWIDLE | |
5585 | SD_BALANCE_FORK | | 5595 | SD_BALANCE_FORK | |
5586 | SD_BALANCE_EXEC | | 5596 | SD_BALANCE_EXEC | |
5587 | SD_SHARE_CPUPOWER | | 5597 | SD_SHARE_CPUPOWER | |
5588 | SD_SHARE_PKG_RESOURCES); | 5598 | SD_SHARE_PKG_RESOURCES); |
5589 | } | 5599 | } |
5590 | if (~cflags & pflags) | 5600 | if (~cflags & pflags) |
5591 | return 0; | 5601 | return 0; |
5592 | 5602 | ||
5593 | return 1; | 5603 | return 1; |
5594 | } | 5604 | } |
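The redundancy test above is pure flag arithmetic: strip the flags that are meaningless for the parent (WAKE_BALANCE when the child already has WAKE_AFFINE, and the group-based flags when the parent has only one group), and the parent survives only if it still carries some flag the child lacks, i.e. if ~cflags & pflags is non-zero. Below is a minimal userspace sketch of that final test; the F_* bit values are invented for illustration and are not the kernel's SD_* flags.

#include <stdio.h>

/* Hypothetical stand-ins for a few SD_* flag bits. */
#define F_LOAD_BALANCE  0x01
#define F_WAKE_AFFINE   0x02
#define F_WAKE_BALANCE  0x04

/* Returns 1 when the parent domain adds no flags beyond the child's. */
static int parent_is_redundant(unsigned long cflags, unsigned long pflags)
{
    /* WAKE_BALANCE in the parent is subsumed by WAKE_AFFINE in the child. */
    if (cflags & F_WAKE_AFFINE)
        pflags &= ~F_WAKE_BALANCE;
    /* Any flag set in pflags but clear in cflags keeps the parent alive. */
    return (~cflags & pflags) == 0;
}

int main(void)
{
    /* Parent only restates what the child already does: redundant. */
    printf("%d\n", parent_is_redundant(F_LOAD_BALANCE | F_WAKE_AFFINE,
                                       F_LOAD_BALANCE | F_WAKE_BALANCE)); /* 1 */
    /* Parent load-balances while the child does not: keep it. */
    printf("%d\n", parent_is_redundant(F_WAKE_AFFINE, F_LOAD_BALANCE));   /* 0 */
    return 0;
}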
5595 | 5605 | ||
5596 | /* | 5606 | /* |
5597 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 5607 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
5598 | * hold the hotplug lock. | 5608 | * hold the hotplug lock. |
5599 | */ | 5609 | */ |
5600 | static void cpu_attach_domain(struct sched_domain *sd, int cpu) | 5610 | static void cpu_attach_domain(struct sched_domain *sd, int cpu) |
5601 | { | 5611 | { |
5602 | struct rq *rq = cpu_rq(cpu); | 5612 | struct rq *rq = cpu_rq(cpu); |
5603 | struct sched_domain *tmp; | 5613 | struct sched_domain *tmp; |
5604 | 5614 | ||
5605 | /* Remove the sched domains which do not contribute to scheduling. */ | 5615 | /* Remove the sched domains which do not contribute to scheduling. */ |
5606 | for (tmp = sd; tmp; tmp = tmp->parent) { | 5616 | for (tmp = sd; tmp; tmp = tmp->parent) { |
5607 | struct sched_domain *parent = tmp->parent; | 5617 | struct sched_domain *parent = tmp->parent; |
5608 | if (!parent) | 5618 | if (!parent) |
5609 | break; | 5619 | break; |
5610 | if (sd_parent_degenerate(tmp, parent)) { | 5620 | if (sd_parent_degenerate(tmp, parent)) { |
5611 | tmp->parent = parent->parent; | 5621 | tmp->parent = parent->parent; |
5612 | if (parent->parent) | 5622 | if (parent->parent) |
5613 | parent->parent->child = tmp; | 5623 | parent->parent->child = tmp; |
5614 | } | 5624 | } |
5615 | } | 5625 | } |
5616 | 5626 | ||
5617 | if (sd && sd_degenerate(sd)) { | 5627 | if (sd && sd_degenerate(sd)) { |
5618 | sd = sd->parent; | 5628 | sd = sd->parent; |
5619 | if (sd) | 5629 | if (sd) |
5620 | sd->child = NULL; | 5630 | sd->child = NULL; |
5621 | } | 5631 | } |
5622 | 5632 | ||
5623 | sched_domain_debug(sd, cpu); | 5633 | sched_domain_debug(sd, cpu); |
5624 | 5634 | ||
5625 | rcu_assign_pointer(rq->sd, sd); | 5635 | rcu_assign_pointer(rq->sd, sd); |
5626 | } | 5636 | } |
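cpu_attach_domain() walks the chain bottom-up, splices out every parent that sd_parent_degenerate() declares redundant, possibly drops a degenerate base level, and only then publishes the result through rcu_assign_pointer() so concurrent readers of rq->sd always see a consistent chain. The splice itself is plain singly-linked-list surgery; here is a hedged userspace sketch of just that step, with a toy struct and a precomputed 'redundant' flag standing in for the kernel types and for sd_parent_degenerate().

#include <stdio.h>
#include <stddef.h>

struct dom {
    const char *name;
    struct dom *parent;
    struct dom *child;
    int redundant;              /* stand-in for sd_parent_degenerate() */
};

static void collapse_chain(struct dom *sd)
{
    for (struct dom *tmp = sd; tmp; tmp = tmp->parent) {
        struct dom *parent = tmp->parent;
        if (!parent)
            break;
        if (parent->redundant) {        /* splice the parent out */
            tmp->parent = parent->parent;
            if (parent->parent)
                parent->parent->child = tmp;
        }
    }
}

int main(void)
{
    struct dom numa = { "numa", NULL, NULL, 0 };
    struct dom phys = { "phys", &numa, NULL, 1 };   /* redundant level */
    struct dom smt  = { "smt",  &phys, NULL, 0 };
    numa.child = &phys;
    phys.child = &smt;

    collapse_chain(&smt);
    for (struct dom *d = &smt; d; d = d->parent)
        printf("%s\n", d->name);        /* prints: smt, numa */
    return 0;
}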
5627 | 5637 | ||
5628 | /* cpus with isolated domains */ | 5638 | /* cpus with isolated domains */ |
5629 | static cpumask_t cpu_isolated_map = CPU_MASK_NONE; | 5639 | static cpumask_t cpu_isolated_map = CPU_MASK_NONE; |
5630 | 5640 | ||
5631 | /* Setup the mask of cpus configured for isolated domains */ | 5641 | /* Setup the mask of cpus configured for isolated domains */ |
5632 | static int __init isolated_cpu_setup(char *str) | 5642 | static int __init isolated_cpu_setup(char *str) |
5633 | { | 5643 | { |
5634 | int ints[NR_CPUS], i; | 5644 | int ints[NR_CPUS], i; |
5635 | 5645 | ||
5636 | str = get_options(str, ARRAY_SIZE(ints), ints); | 5646 | str = get_options(str, ARRAY_SIZE(ints), ints); |
5637 | cpus_clear(cpu_isolated_map); | 5647 | cpus_clear(cpu_isolated_map); |
5638 | for (i = 1; i <= ints[0]; i++) | 5648 | for (i = 1; i <= ints[0]; i++) |
5639 | if (ints[i] < NR_CPUS) | 5649 | if (ints[i] < NR_CPUS) |
5640 | cpu_set(ints[i], cpu_isolated_map); | 5650 | cpu_set(ints[i], cpu_isolated_map); |
5641 | return 1; | 5651 | return 1; |
5642 | } | 5652 | } |
5643 | 5653 | ||
5644 | __setup ("isolcpus=", isolated_cpu_setup); | 5654 | __setup ("isolcpus=", isolated_cpu_setup); |
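Booting with, say, isolcpus=2,3 therefore keeps CPUs 2 and 3 out of the default domain map built later (see cpu_default_map in arch_init_sched_domains() below), so the balancer never pulls tasks onto them. The parse itself is just "split on commas, range-check, set bits"; a rough userspace sketch of the same idea follows, using strtol() in place of the kernel's get_options() — the function name and MAX_CPUS limit are illustrative only.

#include <stdio.h>
#include <stdlib.h>

#define MAX_CPUS 64

/* Parse a list like "1,2,5" into a bitmask of isolated CPUs. */
static unsigned long long parse_isolcpus(const char *arg)
{
    unsigned long long mask = 0;
    char *end;

    while (*arg) {
        long cpu = strtol(arg, &end, 10);
        if (end == arg)
            break;                      /* no digits: stop parsing */
        if (cpu >= 0 && cpu < MAX_CPUS)
            mask |= 1ULL << cpu;
        arg = (*end == ',') ? end + 1 : end;
        if (*end != ',')
            break;
    }
    return mask;
}

int main(void)
{
    printf("0x%llx\n", parse_isolcpus("1,2,5"));   /* 0x26 */
    return 0;
}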
5645 | 5655 | ||
5646 | /* | 5656 | /* |
5647 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer | 5657 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer |
5648 | * to a function which identifies what group (along with sched group) a CPU | 5658 | * to a function which identifies what group (along with sched group) a CPU |
5649 | * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS | 5659 | * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS |
5650 | * (due to the fact that we keep track of groups covered with a cpumask_t). | 5660 | * (due to the fact that we keep track of groups covered with a cpumask_t). |
5651 | * | 5661 | * |
5652 | * init_sched_build_groups will build a circular linked list of the groups | 5662 | * init_sched_build_groups will build a circular linked list of the groups |
5653 | * covered by the given span, and will set each group's ->cpumask correctly, | 5663 | * covered by the given span, and will set each group's ->cpumask correctly, |
5654 | * and ->cpu_power to 0. | 5664 | * and ->cpu_power to 0. |
5655 | */ | 5665 | */ |
5656 | static void | 5666 | static void |
5657 | init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, | 5667 | init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, |
5658 | int (*group_fn)(int cpu, const cpumask_t *cpu_map, | 5668 | int (*group_fn)(int cpu, const cpumask_t *cpu_map, |
5659 | struct sched_group **sg)) | 5669 | struct sched_group **sg)) |
5660 | { | 5670 | { |
5661 | struct sched_group *first = NULL, *last = NULL; | 5671 | struct sched_group *first = NULL, *last = NULL; |
5662 | cpumask_t covered = CPU_MASK_NONE; | 5672 | cpumask_t covered = CPU_MASK_NONE; |
5663 | int i; | 5673 | int i; |
5664 | 5674 | ||
5665 | for_each_cpu_mask(i, span) { | 5675 | for_each_cpu_mask(i, span) { |
5666 | struct sched_group *sg; | 5676 | struct sched_group *sg; |
5667 | int group = group_fn(i, cpu_map, &sg); | 5677 | int group = group_fn(i, cpu_map, &sg); |
5668 | int j; | 5678 | int j; |
5669 | 5679 | ||
5670 | if (cpu_isset(i, covered)) | 5680 | if (cpu_isset(i, covered)) |
5671 | continue; | 5681 | continue; |
5672 | 5682 | ||
5673 | sg->cpumask = CPU_MASK_NONE; | 5683 | sg->cpumask = CPU_MASK_NONE; |
5674 | sg->__cpu_power = 0; | 5684 | sg->__cpu_power = 0; |
5675 | 5685 | ||
5676 | for_each_cpu_mask(j, span) { | 5686 | for_each_cpu_mask(j, span) { |
5677 | if (group_fn(j, cpu_map, NULL) != group) | 5687 | if (group_fn(j, cpu_map, NULL) != group) |
5678 | continue; | 5688 | continue; |
5679 | 5689 | ||
5680 | cpu_set(j, covered); | 5690 | cpu_set(j, covered); |
5681 | cpu_set(j, sg->cpumask); | 5691 | cpu_set(j, sg->cpumask); |
5682 | } | 5692 | } |
5683 | if (!first) | 5693 | if (!first) |
5684 | first = sg; | 5694 | first = sg; |
5685 | if (last) | 5695 | if (last) |
5686 | last->next = sg; | 5696 | last->next = sg; |
5687 | last = sg; | 5697 | last = sg; |
5688 | } | 5698 | } |
5689 | last->next = first; | 5699 | last->next = first; |
5690 | } | 5700 | } |
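Concretely, the builder visits each CPU of the span once, skips CPUs whose group has already been emitted (the covered mask), fills in that group's cpumask from every span CPU that maps to the same group id, and threads the groups into a circular list by closing last->next back to first. The sketch below reproduces that bookkeeping in userspace, with plain bitmasks standing in for cpumask_t and a lookup array standing in for the group_fn callback; it illustrates the algorithm, not the kernel interface.

#include <stdio.h>

#define NCPUS 8

struct group {
    unsigned int cpumask;
    struct group *next;
};

/*
 * Build a circular list of groups over 'span'. group_of[cpu] names the
 * group a CPU belongs to (stand-in for the kernel's group_fn callback).
 */
static struct group *build_groups(unsigned int span, const int *group_of,
                                  struct group *groups)
{
    struct group *first = NULL, *last = NULL;
    unsigned int covered = 0;

    for (int i = 0; i < NCPUS; i++) {
        if (!(span & (1u << i)) || (covered & (1u << i)))
            continue;

        struct group *sg = &groups[group_of[i]];
        sg->cpumask = 0;
        for (int j = 0; j < NCPUS; j++) {
            if (!(span & (1u << j)) || group_of[j] != group_of[i])
                continue;
            covered |= 1u << j;
            sg->cpumask |= 1u << j;
        }
        if (!first)
            first = sg;
        if (last)
            last->next = sg;
        last = sg;
    }
    if (last)
        last->next = first;             /* close the circle */
    return first;
}

int main(void)
{
    int group_of[NCPUS] = { 0, 0, 1, 1, 2, 2, 3, 3 };   /* e.g. SMT pairs */
    static struct group groups[4];
    struct group *g = build_groups(0x3f, group_of, groups); /* CPUs 0-5 */

    struct group *it = g;
    do {
        printf("group mask 0x%x\n", it->cpumask);   /* 0x3, 0xc, 0x30 */
        it = it->next;
    } while (it != g);
    return 0;
}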
5691 | 5701 | ||
5692 | #define SD_NODES_PER_DOMAIN 16 | 5702 | #define SD_NODES_PER_DOMAIN 16 |
5693 | 5703 | ||
5694 | #ifdef CONFIG_NUMA | 5704 | #ifdef CONFIG_NUMA |
5695 | 5705 | ||
5696 | /** | 5706 | /** |
5697 | * find_next_best_node - find the next node to include in a sched_domain | 5707 | * find_next_best_node - find the next node to include in a sched_domain |
5698 | * @node: node whose sched_domain we're building | 5708 | * @node: node whose sched_domain we're building |
5699 | * @used_nodes: nodes already in the sched_domain | 5709 | * @used_nodes: nodes already in the sched_domain |
5700 | * | 5710 | * |
5701 | * Find the next node to include in a given scheduling domain. Simply | 5711 | * Find the next node to include in a given scheduling domain. Simply |
5702 | * finds the closest node not already in the @used_nodes map. | 5712 | * finds the closest node not already in the @used_nodes map. |
5703 | * | 5713 | * |
5704 | * Should use nodemask_t. | 5714 | * Should use nodemask_t. |
5705 | */ | 5715 | */ |
5706 | static int find_next_best_node(int node, unsigned long *used_nodes) | 5716 | static int find_next_best_node(int node, unsigned long *used_nodes) |
5707 | { | 5717 | { |
5708 | int i, n, val, min_val, best_node = 0; | 5718 | int i, n, val, min_val, best_node = 0; |
5709 | 5719 | ||
5710 | min_val = INT_MAX; | 5720 | min_val = INT_MAX; |
5711 | 5721 | ||
5712 | for (i = 0; i < MAX_NUMNODES; i++) { | 5722 | for (i = 0; i < MAX_NUMNODES; i++) { |
5713 | /* Start at @node */ | 5723 | /* Start at @node */ |
5714 | n = (node + i) % MAX_NUMNODES; | 5724 | n = (node + i) % MAX_NUMNODES; |
5715 | 5725 | ||
5716 | if (!nr_cpus_node(n)) | 5726 | if (!nr_cpus_node(n)) |
5717 | continue; | 5727 | continue; |
5718 | 5728 | ||
5719 | /* Skip already used nodes */ | 5729 | /* Skip already used nodes */ |
5720 | if (test_bit(n, used_nodes)) | 5730 | if (test_bit(n, used_nodes)) |
5721 | continue; | 5731 | continue; |
5722 | 5732 | ||
5723 | /* Simple min distance search */ | 5733 | /* Simple min distance search */ |
5724 | val = node_distance(node, n); | 5734 | val = node_distance(node, n); |
5725 | 5735 | ||
5726 | if (val < min_val) { | 5736 | if (val < min_val) { |
5727 | min_val = val; | 5737 | min_val = val; |
5728 | best_node = n; | 5738 | best_node = n; |
5729 | } | 5739 | } |
5730 | } | 5740 | } |
5731 | 5741 | ||
5732 | set_bit(best_node, used_nodes); | 5742 | set_bit(best_node, used_nodes); |
5733 | return best_node; | 5743 | return best_node; |
5734 | } | 5744 | } |
5735 | 5745 | ||
5736 | /** | 5746 | /** |
5737 | * sched_domain_node_span - get a cpumask for a node's sched_domain | 5747 | * sched_domain_node_span - get a cpumask for a node's sched_domain |
5738 | * @node: node whose cpumask we're constructing | 5748 | * @node: node whose cpumask we're constructing |
5739 | * @size: number of nodes to include in this span | 5749 | * @size: number of nodes to include in this span |
5740 | * | 5750 | * |
5741 | * Given a node, construct a good cpumask for its sched_domain to span. It | 5751 | * Given a node, construct a good cpumask for its sched_domain to span. It |
5742 | * should be one that prevents unnecessary balancing, but also spreads tasks | 5752 | * should be one that prevents unnecessary balancing, but also spreads tasks |
5743 | * out optimally. | 5753 | * out optimally. |
5744 | */ | 5754 | */ |
5745 | static cpumask_t sched_domain_node_span(int node) | 5755 | static cpumask_t sched_domain_node_span(int node) |
5746 | { | 5756 | { |
5747 | DECLARE_BITMAP(used_nodes, MAX_NUMNODES); | 5757 | DECLARE_BITMAP(used_nodes, MAX_NUMNODES); |
5748 | cpumask_t span, nodemask; | 5758 | cpumask_t span, nodemask; |
5749 | int i; | 5759 | int i; |
5750 | 5760 | ||
5751 | cpus_clear(span); | 5761 | cpus_clear(span); |
5752 | bitmap_zero(used_nodes, MAX_NUMNODES); | 5762 | bitmap_zero(used_nodes, MAX_NUMNODES); |
5753 | 5763 | ||
5754 | nodemask = node_to_cpumask(node); | 5764 | nodemask = node_to_cpumask(node); |
5755 | cpus_or(span, span, nodemask); | 5765 | cpus_or(span, span, nodemask); |
5756 | set_bit(node, used_nodes); | 5766 | set_bit(node, used_nodes); |
5757 | 5767 | ||
5758 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | 5768 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { |
5759 | int next_node = find_next_best_node(node, used_nodes); | 5769 | int next_node = find_next_best_node(node, used_nodes); |
5760 | 5770 | ||
5761 | nodemask = node_to_cpumask(next_node); | 5771 | nodemask = node_to_cpumask(next_node); |
5762 | cpus_or(span, span, nodemask); | 5772 | cpus_or(span, span, nodemask); |
5763 | } | 5773 | } |
5764 | 5774 | ||
5765 | return span; | 5775 | return span; |
5766 | } | 5776 | } |
5767 | #endif | 5777 | #endif |
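Taken together, the two helpers implement a greedy expansion: start from the home node, then repeatedly absorb the nearest node not yet in the span until SD_NODES_PER_DOMAIN nodes have been collected. A small userspace sketch of that selection over a toy node_distance() table follows; the matrix, node count and span size are invented for illustration.

#include <stdio.h>
#include <limits.h>

#define NODES 4
#define NODES_PER_DOMAIN 3

static const int dist[NODES][NODES] = {   /* toy node_distance() table */
    { 10, 20, 40, 30 },
    { 20, 10, 30, 40 },
    { 40, 30, 10, 20 },
    { 30, 40, 20, 10 },
};

static int next_best_node(int node, unsigned int *used)
{
    int best = 0, min = INT_MAX;

    for (int i = 0; i < NODES; i++) {
        int n = (node + i) % NODES;
        if (*used & (1u << n))
            continue;                     /* already in the span */
        if (dist[node][n] < min) {
            min = dist[node][n];
            best = n;
        }
    }
    *used |= 1u << best;
    return best;
}

int main(void)
{
    int node = 0;
    unsigned int used = 1u << node;       /* the home node is always in */

    printf("span: %d", node);
    for (int i = 1; i < NODES_PER_DOMAIN; i++)
        printf(" %d", next_best_node(node, &used));
    printf("\n");                         /* span: 0 1 3 */
    return 0;
}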
5768 | 5778 | ||
5769 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | 5779 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; |
5770 | 5780 | ||
5771 | /* | 5781 | /* |
5772 | * SMT sched-domains: | 5782 | * SMT sched-domains: |
5773 | */ | 5783 | */ |
5774 | #ifdef CONFIG_SCHED_SMT | 5784 | #ifdef CONFIG_SCHED_SMT |
5775 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | 5785 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); |
5776 | static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); | 5786 | static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); |
5777 | 5787 | ||
5778 | static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, | 5788 | static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, |
5779 | struct sched_group **sg) | 5789 | struct sched_group **sg) |
5780 | { | 5790 | { |
5781 | if (sg) | 5791 | if (sg) |
5782 | *sg = &per_cpu(sched_group_cpus, cpu); | 5792 | *sg = &per_cpu(sched_group_cpus, cpu); |
5783 | return cpu; | 5793 | return cpu; |
5784 | } | 5794 | } |
5785 | #endif | 5795 | #endif |
5786 | 5796 | ||
5787 | /* | 5797 | /* |
5788 | * multi-core sched-domains: | 5798 | * multi-core sched-domains: |
5789 | */ | 5799 | */ |
5790 | #ifdef CONFIG_SCHED_MC | 5800 | #ifdef CONFIG_SCHED_MC |
5791 | static DEFINE_PER_CPU(struct sched_domain, core_domains); | 5801 | static DEFINE_PER_CPU(struct sched_domain, core_domains); |
5792 | static DEFINE_PER_CPU(struct sched_group, sched_group_core); | 5802 | static DEFINE_PER_CPU(struct sched_group, sched_group_core); |
5793 | #endif | 5803 | #endif |
5794 | 5804 | ||
5795 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 5805 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) |
5796 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, | 5806 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, |
5797 | struct sched_group **sg) | 5807 | struct sched_group **sg) |
5798 | { | 5808 | { |
5799 | int group; | 5809 | int group; |
5800 | cpumask_t mask = cpu_sibling_map[cpu]; | 5810 | cpumask_t mask = cpu_sibling_map[cpu]; |
5801 | cpus_and(mask, mask, *cpu_map); | 5811 | cpus_and(mask, mask, *cpu_map); |
5802 | group = first_cpu(mask); | 5812 | group = first_cpu(mask); |
5803 | if (sg) | 5813 | if (sg) |
5804 | *sg = &per_cpu(sched_group_core, group); | 5814 | *sg = &per_cpu(sched_group_core, group); |
5805 | return group; | 5815 | return group; |
5806 | } | 5816 | } |
5807 | #elif defined(CONFIG_SCHED_MC) | 5817 | #elif defined(CONFIG_SCHED_MC) |
5808 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, | 5818 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, |
5809 | struct sched_group **sg) | 5819 | struct sched_group **sg) |
5810 | { | 5820 | { |
5811 | if (sg) | 5821 | if (sg) |
5812 | *sg = &per_cpu(sched_group_core, cpu); | 5822 | *sg = &per_cpu(sched_group_core, cpu); |
5813 | return cpu; | 5823 | return cpu; |
5814 | } | 5824 | } |
5815 | #endif | 5825 | #endif |
5816 | 5826 | ||
5817 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); | 5827 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); |
5818 | static DEFINE_PER_CPU(struct sched_group, sched_group_phys); | 5828 | static DEFINE_PER_CPU(struct sched_group, sched_group_phys); |
5819 | 5829 | ||
5820 | static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, | 5830 | static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, |
5821 | struct sched_group **sg) | 5831 | struct sched_group **sg) |
5822 | { | 5832 | { |
5823 | int group; | 5833 | int group; |
5824 | #ifdef CONFIG_SCHED_MC | 5834 | #ifdef CONFIG_SCHED_MC |
5825 | cpumask_t mask = cpu_coregroup_map(cpu); | 5835 | cpumask_t mask = cpu_coregroup_map(cpu); |
5826 | cpus_and(mask, mask, *cpu_map); | 5836 | cpus_and(mask, mask, *cpu_map); |
5827 | group = first_cpu(mask); | 5837 | group = first_cpu(mask); |
5828 | #elif defined(CONFIG_SCHED_SMT) | 5838 | #elif defined(CONFIG_SCHED_SMT) |
5829 | cpumask_t mask = cpu_sibling_map[cpu]; | 5839 | cpumask_t mask = cpu_sibling_map[cpu]; |
5830 | cpus_and(mask, mask, *cpu_map); | 5840 | cpus_and(mask, mask, *cpu_map); |
5831 | group = first_cpu(mask); | 5841 | group = first_cpu(mask); |
5832 | #else | 5842 | #else |
5833 | group = cpu; | 5843 | group = cpu; |
5834 | #endif | 5844 | #endif |
5835 | if (sg) | 5845 | if (sg) |
5836 | *sg = &per_cpu(sched_group_phys, group); | 5846 | *sg = &per_cpu(sched_group_phys, group); |
5837 | return group; | 5847 | return group; |
5838 | } | 5848 | } |
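All of the cpu_to_*_group() helpers follow one pattern: intersect the CPU's topology mask for that level (siblings, core group, or node) with cpu_map and use the first CPU of the result as the group's identity, falling back to the CPU itself when no finer level is configured. A hedged one-function sketch of that "first CPU of the masked set" selection with plain bitmasks:

#include <stdio.h>
#include <strings.h>     /* ffs() */

/* First set bit of (topology_mask & cpu_map), i.e. the group representative. */
static int group_rep(unsigned int topology_mask, unsigned int cpu_map)
{
    unsigned int m = topology_mask & cpu_map;
    return m ? ffs(m) - 1 : -1;          /* -1: no online CPU in this mask */
}

int main(void)
{
    unsigned int cpu_map = 0x0f;         /* CPUs 0-3 online */
    unsigned int siblings_of_3 = 0x0c;   /* CPUs 2 and 3 share a core */

    printf("group of cpu 3: %d\n", group_rep(siblings_of_3, cpu_map)); /* 2 */
    return 0;
}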
5839 | 5849 | ||
5840 | #ifdef CONFIG_NUMA | 5850 | #ifdef CONFIG_NUMA |
5841 | /* | 5851 | /* |
5842 | * The init_sched_build_groups can't handle what we want to do with node | 5852 | * The init_sched_build_groups can't handle what we want to do with node |
5843 | * groups, so roll our own. Now each node has its own list of groups which | 5853 | * groups, so roll our own. Now each node has its own list of groups which |
5844 | * gets dynamically allocated. | 5854 | * gets dynamically allocated. |
5845 | */ | 5855 | */ |
5846 | static DEFINE_PER_CPU(struct sched_domain, node_domains); | 5856 | static DEFINE_PER_CPU(struct sched_domain, node_domains); |
5847 | static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; | 5857 | static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; |
5848 | 5858 | ||
5849 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); | 5859 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); |
5850 | static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); | 5860 | static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); |
5851 | 5861 | ||
5852 | static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, | 5862 | static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, |
5853 | struct sched_group **sg) | 5863 | struct sched_group **sg) |
5854 | { | 5864 | { |
5855 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); | 5865 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); |
5856 | int group; | 5866 | int group; |
5857 | 5867 | ||
5858 | cpus_and(nodemask, nodemask, *cpu_map); | 5868 | cpus_and(nodemask, nodemask, *cpu_map); |
5859 | group = first_cpu(nodemask); | 5869 | group = first_cpu(nodemask); |
5860 | 5870 | ||
5861 | if (sg) | 5871 | if (sg) |
5862 | *sg = &per_cpu(sched_group_allnodes, group); | 5872 | *sg = &per_cpu(sched_group_allnodes, group); |
5863 | return group; | 5873 | return group; |
5864 | } | 5874 | } |
5865 | 5875 | ||
5866 | static void init_numa_sched_groups_power(struct sched_group *group_head) | 5876 | static void init_numa_sched_groups_power(struct sched_group *group_head) |
5867 | { | 5877 | { |
5868 | struct sched_group *sg = group_head; | 5878 | struct sched_group *sg = group_head; |
5869 | int j; | 5879 | int j; |
5870 | 5880 | ||
5871 | if (!sg) | 5881 | if (!sg) |
5872 | return; | 5882 | return; |
5873 | next_sg: | 5883 | next_sg: |
5874 | for_each_cpu_mask(j, sg->cpumask) { | 5884 | for_each_cpu_mask(j, sg->cpumask) { |
5875 | struct sched_domain *sd; | 5885 | struct sched_domain *sd; |
5876 | 5886 | ||
5877 | sd = &per_cpu(phys_domains, j); | 5887 | sd = &per_cpu(phys_domains, j); |
5878 | if (j != first_cpu(sd->groups->cpumask)) { | 5888 | if (j != first_cpu(sd->groups->cpumask)) { |
5879 | /* | 5889 | /* |
5880 | * Only add "power" once for each | 5890 | * Only add "power" once for each |
5881 | * physical package. | 5891 | * physical package. |
5882 | */ | 5892 | */ |
5883 | continue; | 5893 | continue; |
5884 | } | 5894 | } |
5885 | 5895 | ||
5886 | sg_inc_cpu_power(sg, sd->groups->__cpu_power); | 5896 | sg_inc_cpu_power(sg, sd->groups->__cpu_power); |
5887 | } | 5897 | } |
5888 | sg = sg->next; | 5898 | sg = sg->next; |
5889 | if (sg != group_head) | 5899 | if (sg != group_head) |
5890 | goto next_sg; | 5900 | goto next_sg; |
5891 | } | 5901 | } |
5892 | #endif | 5902 | #endif |
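The "only once for each physical package" guard works because every CPU in a package resolves to the same phys-level group, so only the CPU that comes first in that group's cpumask contributes the package's __cpu_power to the node-level group. A toy userspace rendition of that de-duplicated sum (package layout and power values are invented):

#include <stdio.h>
#include <strings.h>     /* ffs() */

#define NCPUS 4

int main(void)
{
    /* CPUs 0,1 form one package; CPUs 2,3 another (toy layout). */
    unsigned int package_mask[NCPUS] = { 0x3, 0x3, 0xc, 0xc };
    unsigned int pkg_power[NCPUS]    = { 128, 128, 128, 128 };
    unsigned int node_mask = 0xf;        /* all four CPUs in this node group */
    unsigned long total = 0;

    for (int j = 0; j < NCPUS; j++) {
        if (!(node_mask & (1u << j)))
            continue;
        /* Count a package only when j is its first (representative) CPU. */
        if (j != ffs(package_mask[j]) - 1)
            continue;
        total += pkg_power[j];
    }
    printf("node group power: %lu\n", total);   /* 256, not 512 */
    return 0;
}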
5893 | 5903 | ||
5894 | #ifdef CONFIG_NUMA | 5904 | #ifdef CONFIG_NUMA |
5895 | /* Free memory allocated for various sched_group structures */ | 5905 | /* Free memory allocated for various sched_group structures */ |
5896 | static void free_sched_groups(const cpumask_t *cpu_map) | 5906 | static void free_sched_groups(const cpumask_t *cpu_map) |
5897 | { | 5907 | { |
5898 | int cpu, i; | 5908 | int cpu, i; |
5899 | 5909 | ||
5900 | for_each_cpu_mask(cpu, *cpu_map) { | 5910 | for_each_cpu_mask(cpu, *cpu_map) { |
5901 | struct sched_group **sched_group_nodes | 5911 | struct sched_group **sched_group_nodes |
5902 | = sched_group_nodes_bycpu[cpu]; | 5912 | = sched_group_nodes_bycpu[cpu]; |
5903 | 5913 | ||
5904 | if (!sched_group_nodes) | 5914 | if (!sched_group_nodes) |
5905 | continue; | 5915 | continue; |
5906 | 5916 | ||
5907 | for (i = 0; i < MAX_NUMNODES; i++) { | 5917 | for (i = 0; i < MAX_NUMNODES; i++) { |
5908 | cpumask_t nodemask = node_to_cpumask(i); | 5918 | cpumask_t nodemask = node_to_cpumask(i); |
5909 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | 5919 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; |
5910 | 5920 | ||
5911 | cpus_and(nodemask, nodemask, *cpu_map); | 5921 | cpus_and(nodemask, nodemask, *cpu_map); |
5912 | if (cpus_empty(nodemask)) | 5922 | if (cpus_empty(nodemask)) |
5913 | continue; | 5923 | continue; |
5914 | 5924 | ||
5915 | if (sg == NULL) | 5925 | if (sg == NULL) |
5916 | continue; | 5926 | continue; |
5917 | sg = sg->next; | 5927 | sg = sg->next; |
5918 | next_sg: | 5928 | next_sg: |
5919 | oldsg = sg; | 5929 | oldsg = sg; |
5920 | sg = sg->next; | 5930 | sg = sg->next; |
5921 | kfree(oldsg); | 5931 | kfree(oldsg); |
5922 | if (oldsg != sched_group_nodes[i]) | 5932 | if (oldsg != sched_group_nodes[i]) |
5923 | goto next_sg; | 5933 | goto next_sg; |
5924 | } | 5934 | } |
5925 | kfree(sched_group_nodes); | 5935 | kfree(sched_group_nodes); |
5926 | sched_group_nodes_bycpu[cpu] = NULL; | 5936 | sched_group_nodes_bycpu[cpu] = NULL; |
5927 | } | 5937 | } |
5928 | } | 5938 | } |
5929 | #else | 5939 | #else |
5930 | static void free_sched_groups(const cpumask_t *cpu_map) | 5940 | static void free_sched_groups(const cpumask_t *cpu_map) |
5931 | { | 5941 | { |
5932 | } | 5942 | } |
5933 | #endif | 5943 | #endif |
5934 | 5944 | ||
5935 | /* | 5945 | /* |
5936 | * Initialize sched groups cpu_power. | 5946 | * Initialize sched groups cpu_power. |
5937 | * | 5947 | * |
5938 | * cpu_power indicates the capacity of a sched group, which is used while | 5948 | * cpu_power indicates the capacity of a sched group, which is used while |
5939 | * distributing the load between different sched groups in a sched domain. | 5949 | * distributing the load between different sched groups in a sched domain. |
5940 | * Typically cpu_power for all the groups in a sched domain will be the same unless | 5950 | * Typically cpu_power for all the groups in a sched domain will be the same unless |
5941 | * there are asymmetries in the topology. If there are asymmetries, the group | 5951 | * there are asymmetries in the topology. If there are asymmetries, the group |
5942 | * having more cpu_power will pick up more load compared to the group having | 5952 | * having more cpu_power will pick up more load compared to the group having |
5943 | * less cpu_power. | 5953 | * less cpu_power. |
5944 | * | 5954 | * |
5945 | * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents | 5955 | * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents |
5946 | * the maximum number of tasks a group can handle in the presence of other idle | 5956 | * the maximum number of tasks a group can handle in the presence of other idle |
5947 | * or lightly loaded groups in the same sched domain. | 5957 | * or lightly loaded groups in the same sched domain. |
5948 | */ | 5958 | */ |
5949 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) | 5959 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) |
5950 | { | 5960 | { |
5951 | struct sched_domain *child; | 5961 | struct sched_domain *child; |
5952 | struct sched_group *group; | 5962 | struct sched_group *group; |
5953 | 5963 | ||
5954 | WARN_ON(!sd || !sd->groups); | 5964 | WARN_ON(!sd || !sd->groups); |
5955 | 5965 | ||
5956 | if (cpu != first_cpu(sd->groups->cpumask)) | 5966 | if (cpu != first_cpu(sd->groups->cpumask)) |
5957 | return; | 5967 | return; |
5958 | 5968 | ||
5959 | child = sd->child; | 5969 | child = sd->child; |
5960 | 5970 | ||
5961 | sd->groups->__cpu_power = 0; | 5971 | sd->groups->__cpu_power = 0; |
5962 | 5972 | ||
5963 | /* | 5973 | /* |
5964 | * For perf policy, if the groups in child domain share resources | 5974 | * For perf policy, if the groups in child domain share resources |
5965 | * (for example cores sharing some portions of the cache hierarchy | 5975 | * (for example cores sharing some portions of the cache hierarchy |
5966 | * or SMT), then set this domain groups cpu_power such that each group | 5976 | * or SMT), then set this domain groups cpu_power such that each group |
5967 | * can handle only one task, when there are other idle groups in the | 5977 | * can handle only one task, when there are other idle groups in the |
5968 | * same sched domain. | 5978 | * same sched domain. |
5969 | */ | 5979 | */ |
5970 | if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && | 5980 | if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && |
5971 | (child->flags & | 5981 | (child->flags & |
5972 | (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { | 5982 | (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { |
5973 | sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); | 5983 | sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); |
5974 | return; | 5984 | return; |
5975 | } | 5985 | } |
5976 | 5986 | ||
5977 | /* | 5987 | /* |
5978 | * add cpu_power of each child group to this groups cpu_power | 5988 | * add cpu_power of each child group to this groups cpu_power |
5979 | */ | 5989 | */ |
5980 | group = child->groups; | 5990 | group = child->groups; |
5981 | do { | 5991 | do { |
5982 | sg_inc_cpu_power(sd->groups, group->__cpu_power); | 5992 | sg_inc_cpu_power(sd->groups, group->__cpu_power); |
5983 | group = group->next; | 5993 | group = group->next; |
5984 | } while (group != child->groups); | 5994 | } while (group != child->groups); |
5985 | } | 5995 | } |
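In short: at the bottom of the hierarchy, or when the child's groups share execution resources and power-savings balancing is off, a group's capacity is pinned to a single task's worth (SCHED_LOAD_SCALE); otherwise it is the sum of its child groups' capacities, so capacity accumulates level by level. A hedged arithmetic sketch of that choice, using an illustrative scale constant rather than the kernel's:

#include <stdio.h>

#define LOAD_SCALE 128               /* stand-in for SCHED_LOAD_SCALE */

int main(void)
{
    /*
     * Two SMT siblings share a core: each SMT-level group gets LOAD_SCALE.
     * Because the child level shares CPU power, the core-level group is
     * also pinned to LOAD_SCALE instead of summing to 2 * LOAD_SCALE.
     */
    int child_shares_resources = 1;
    unsigned long smt_group_power[2] = { LOAD_SCALE, LOAD_SCALE };
    unsigned long core_power;

    if (child_shares_resources) {
        core_power = LOAD_SCALE;                 /* one task's capacity */
    } else {
        core_power = 0;                          /* sum of child groups */
        for (int i = 0; i < 2; i++)
            core_power += smt_group_power[i];
    }
    printf("core group power: %lu\n", core_power);   /* 128 */
    return 0;
}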
5986 | 5996 | ||
5987 | /* | 5997 | /* |
5988 | * Build sched domains for a given set of cpus and attach the sched domains | 5998 | * Build sched domains for a given set of cpus and attach the sched domains |
5989 | * to the individual cpus | 5999 | * to the individual cpus |
5990 | */ | 6000 | */ |
5991 | static int build_sched_domains(const cpumask_t *cpu_map) | 6001 | static int build_sched_domains(const cpumask_t *cpu_map) |
5992 | { | 6002 | { |
5993 | int i; | 6003 | int i; |
5994 | #ifdef CONFIG_NUMA | 6004 | #ifdef CONFIG_NUMA |
5995 | struct sched_group **sched_group_nodes = NULL; | 6005 | struct sched_group **sched_group_nodes = NULL; |
5996 | int sd_allnodes = 0; | 6006 | int sd_allnodes = 0; |
5997 | 6007 | ||
5998 | /* | 6008 | /* |
5999 | * Allocate the per-node list of sched groups | 6009 | * Allocate the per-node list of sched groups |
6000 | */ | 6010 | */ |
6001 | sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES, | 6011 | sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES, |
6002 | GFP_KERNEL); | 6012 | GFP_KERNEL); |
6003 | if (!sched_group_nodes) { | 6013 | if (!sched_group_nodes) { |
6004 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 6014 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
6005 | return -ENOMEM; | 6015 | return -ENOMEM; |
6006 | } | 6016 | } |
6007 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | 6017 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; |
6008 | #endif | 6018 | #endif |
6009 | 6019 | ||
6010 | /* | 6020 | /* |
6011 | * Set up domains for cpus specified by the cpu_map. | 6021 | * Set up domains for cpus specified by the cpu_map. |
6012 | */ | 6022 | */ |
6013 | for_each_cpu_mask(i, *cpu_map) { | 6023 | for_each_cpu_mask(i, *cpu_map) { |
6014 | struct sched_domain *sd = NULL, *p; | 6024 | struct sched_domain *sd = NULL, *p; |
6015 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); | 6025 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); |
6016 | 6026 | ||
6017 | cpus_and(nodemask, nodemask, *cpu_map); | 6027 | cpus_and(nodemask, nodemask, *cpu_map); |
6018 | 6028 | ||
6019 | #ifdef CONFIG_NUMA | 6029 | #ifdef CONFIG_NUMA |
6020 | if (cpus_weight(*cpu_map) > | 6030 | if (cpus_weight(*cpu_map) > |
6021 | SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { | 6031 | SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { |
6022 | sd = &per_cpu(allnodes_domains, i); | 6032 | sd = &per_cpu(allnodes_domains, i); |
6023 | *sd = SD_ALLNODES_INIT; | 6033 | *sd = SD_ALLNODES_INIT; |
6024 | sd->span = *cpu_map; | 6034 | sd->span = *cpu_map; |
6025 | cpu_to_allnodes_group(i, cpu_map, &sd->groups); | 6035 | cpu_to_allnodes_group(i, cpu_map, &sd->groups); |
6026 | p = sd; | 6036 | p = sd; |
6027 | sd_allnodes = 1; | 6037 | sd_allnodes = 1; |
6028 | } else | 6038 | } else |
6029 | p = NULL; | 6039 | p = NULL; |
6030 | 6040 | ||
6031 | sd = &per_cpu(node_domains, i); | 6041 | sd = &per_cpu(node_domains, i); |
6032 | *sd = SD_NODE_INIT; | 6042 | *sd = SD_NODE_INIT; |
6033 | sd->span = sched_domain_node_span(cpu_to_node(i)); | 6043 | sd->span = sched_domain_node_span(cpu_to_node(i)); |
6034 | sd->parent = p; | 6044 | sd->parent = p; |
6035 | if (p) | 6045 | if (p) |
6036 | p->child = sd; | 6046 | p->child = sd; |
6037 | cpus_and(sd->span, sd->span, *cpu_map); | 6047 | cpus_and(sd->span, sd->span, *cpu_map); |
6038 | #endif | 6048 | #endif |
6039 | 6049 | ||
6040 | p = sd; | 6050 | p = sd; |
6041 | sd = &per_cpu(phys_domains, i); | 6051 | sd = &per_cpu(phys_domains, i); |
6042 | *sd = SD_CPU_INIT; | 6052 | *sd = SD_CPU_INIT; |
6043 | sd->span = nodemask; | 6053 | sd->span = nodemask; |
6044 | sd->parent = p; | 6054 | sd->parent = p; |
6045 | if (p) | 6055 | if (p) |
6046 | p->child = sd; | 6056 | p->child = sd; |
6047 | cpu_to_phys_group(i, cpu_map, &sd->groups); | 6057 | cpu_to_phys_group(i, cpu_map, &sd->groups); |
6048 | 6058 | ||
6049 | #ifdef CONFIG_SCHED_MC | 6059 | #ifdef CONFIG_SCHED_MC |
6050 | p = sd; | 6060 | p = sd; |
6051 | sd = &per_cpu(core_domains, i); | 6061 | sd = &per_cpu(core_domains, i); |
6052 | *sd = SD_MC_INIT; | 6062 | *sd = SD_MC_INIT; |
6053 | sd->span = cpu_coregroup_map(i); | 6063 | sd->span = cpu_coregroup_map(i); |
6054 | cpus_and(sd->span, sd->span, *cpu_map); | 6064 | cpus_and(sd->span, sd->span, *cpu_map); |
6055 | sd->parent = p; | 6065 | sd->parent = p; |
6056 | p->child = sd; | 6066 | p->child = sd; |
6057 | cpu_to_core_group(i, cpu_map, &sd->groups); | 6067 | cpu_to_core_group(i, cpu_map, &sd->groups); |
6058 | #endif | 6068 | #endif |
6059 | 6069 | ||
6060 | #ifdef CONFIG_SCHED_SMT | 6070 | #ifdef CONFIG_SCHED_SMT |
6061 | p = sd; | 6071 | p = sd; |
6062 | sd = &per_cpu(cpu_domains, i); | 6072 | sd = &per_cpu(cpu_domains, i); |
6063 | *sd = SD_SIBLING_INIT; | 6073 | *sd = SD_SIBLING_INIT; |
6064 | sd->span = cpu_sibling_map[i]; | 6074 | sd->span = cpu_sibling_map[i]; |
6065 | cpus_and(sd->span, sd->span, *cpu_map); | 6075 | cpus_and(sd->span, sd->span, *cpu_map); |
6066 | sd->parent = p; | 6076 | sd->parent = p; |
6067 | p->child = sd; | 6077 | p->child = sd; |
6068 | cpu_to_cpu_group(i, cpu_map, &sd->groups); | 6078 | cpu_to_cpu_group(i, cpu_map, &sd->groups); |
6069 | #endif | 6079 | #endif |
6070 | } | 6080 | } |
6071 | 6081 | ||
6072 | #ifdef CONFIG_SCHED_SMT | 6082 | #ifdef CONFIG_SCHED_SMT |
6073 | /* Set up CPU (sibling) groups */ | 6083 | /* Set up CPU (sibling) groups */ |
6074 | for_each_cpu_mask(i, *cpu_map) { | 6084 | for_each_cpu_mask(i, *cpu_map) { |
6075 | cpumask_t this_sibling_map = cpu_sibling_map[i]; | 6085 | cpumask_t this_sibling_map = cpu_sibling_map[i]; |
6076 | cpus_and(this_sibling_map, this_sibling_map, *cpu_map); | 6086 | cpus_and(this_sibling_map, this_sibling_map, *cpu_map); |
6077 | if (i != first_cpu(this_sibling_map)) | 6087 | if (i != first_cpu(this_sibling_map)) |
6078 | continue; | 6088 | continue; |
6079 | 6089 | ||
6080 | init_sched_build_groups(this_sibling_map, cpu_map, | 6090 | init_sched_build_groups(this_sibling_map, cpu_map, |
6081 | &cpu_to_cpu_group); | 6091 | &cpu_to_cpu_group); |
6082 | } | 6092 | } |
6083 | #endif | 6093 | #endif |
6084 | 6094 | ||
6085 | #ifdef CONFIG_SCHED_MC | 6095 | #ifdef CONFIG_SCHED_MC |
6086 | /* Set up multi-core groups */ | 6096 | /* Set up multi-core groups */ |
6087 | for_each_cpu_mask(i, *cpu_map) { | 6097 | for_each_cpu_mask(i, *cpu_map) { |
6088 | cpumask_t this_core_map = cpu_coregroup_map(i); | 6098 | cpumask_t this_core_map = cpu_coregroup_map(i); |
6089 | cpus_and(this_core_map, this_core_map, *cpu_map); | 6099 | cpus_and(this_core_map, this_core_map, *cpu_map); |
6090 | if (i != first_cpu(this_core_map)) | 6100 | if (i != first_cpu(this_core_map)) |
6091 | continue; | 6101 | continue; |
6092 | init_sched_build_groups(this_core_map, cpu_map, | 6102 | init_sched_build_groups(this_core_map, cpu_map, |
6093 | &cpu_to_core_group); | 6103 | &cpu_to_core_group); |
6094 | } | 6104 | } |
6095 | #endif | 6105 | #endif |
6096 | 6106 | ||
6097 | /* Set up physical groups */ | 6107 | /* Set up physical groups */ |
6098 | for (i = 0; i < MAX_NUMNODES; i++) { | 6108 | for (i = 0; i < MAX_NUMNODES; i++) { |
6099 | cpumask_t nodemask = node_to_cpumask(i); | 6109 | cpumask_t nodemask = node_to_cpumask(i); |
6100 | 6110 | ||
6101 | cpus_and(nodemask, nodemask, *cpu_map); | 6111 | cpus_and(nodemask, nodemask, *cpu_map); |
6102 | if (cpus_empty(nodemask)) | 6112 | if (cpus_empty(nodemask)) |
6103 | continue; | 6113 | continue; |
6104 | 6114 | ||
6105 | init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); | 6115 | init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); |
6106 | } | 6116 | } |
6107 | 6117 | ||
6108 | #ifdef CONFIG_NUMA | 6118 | #ifdef CONFIG_NUMA |
6109 | /* Set up node groups */ | 6119 | /* Set up node groups */ |
6110 | if (sd_allnodes) | 6120 | if (sd_allnodes) |
6111 | init_sched_build_groups(*cpu_map, cpu_map, | 6121 | init_sched_build_groups(*cpu_map, cpu_map, |
6112 | &cpu_to_allnodes_group); | 6122 | &cpu_to_allnodes_group); |
6113 | 6123 | ||
6114 | for (i = 0; i < MAX_NUMNODES; i++) { | 6124 | for (i = 0; i < MAX_NUMNODES; i++) { |
6115 | /* Set up node groups */ | 6125 | /* Set up node groups */ |
6116 | struct sched_group *sg, *prev; | 6126 | struct sched_group *sg, *prev; |
6117 | cpumask_t nodemask = node_to_cpumask(i); | 6127 | cpumask_t nodemask = node_to_cpumask(i); |
6118 | cpumask_t domainspan; | 6128 | cpumask_t domainspan; |
6119 | cpumask_t covered = CPU_MASK_NONE; | 6129 | cpumask_t covered = CPU_MASK_NONE; |
6120 | int j; | 6130 | int j; |
6121 | 6131 | ||
6122 | cpus_and(nodemask, nodemask, *cpu_map); | 6132 | cpus_and(nodemask, nodemask, *cpu_map); |
6123 | if (cpus_empty(nodemask)) { | 6133 | if (cpus_empty(nodemask)) { |
6124 | sched_group_nodes[i] = NULL; | 6134 | sched_group_nodes[i] = NULL; |
6125 | continue; | 6135 | continue; |
6126 | } | 6136 | } |
6127 | 6137 | ||
6128 | domainspan = sched_domain_node_span(i); | 6138 | domainspan = sched_domain_node_span(i); |
6129 | cpus_and(domainspan, domainspan, *cpu_map); | 6139 | cpus_and(domainspan, domainspan, *cpu_map); |
6130 | 6140 | ||
6131 | sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); | 6141 | sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); |
6132 | if (!sg) { | 6142 | if (!sg) { |
6133 | printk(KERN_WARNING "Can not alloc domain group for " | 6143 | printk(KERN_WARNING "Can not alloc domain group for " |
6134 | "node %d\n", i); | 6144 | "node %d\n", i); |
6135 | goto error; | 6145 | goto error; |
6136 | } | 6146 | } |
6137 | sched_group_nodes[i] = sg; | 6147 | sched_group_nodes[i] = sg; |
6138 | for_each_cpu_mask(j, nodemask) { | 6148 | for_each_cpu_mask(j, nodemask) { |
6139 | struct sched_domain *sd; | 6149 | struct sched_domain *sd; |
6140 | 6150 | ||
6141 | sd = &per_cpu(node_domains, j); | 6151 | sd = &per_cpu(node_domains, j); |
6142 | sd->groups = sg; | 6152 | sd->groups = sg; |
6143 | } | 6153 | } |
6144 | sg->__cpu_power = 0; | 6154 | sg->__cpu_power = 0; |
6145 | sg->cpumask = nodemask; | 6155 | sg->cpumask = nodemask; |
6146 | sg->next = sg; | 6156 | sg->next = sg; |
6147 | cpus_or(covered, covered, nodemask); | 6157 | cpus_or(covered, covered, nodemask); |
6148 | prev = sg; | 6158 | prev = sg; |
6149 | 6159 | ||
6150 | for (j = 0; j < MAX_NUMNODES; j++) { | 6160 | for (j = 0; j < MAX_NUMNODES; j++) { |
6151 | cpumask_t tmp, notcovered; | 6161 | cpumask_t tmp, notcovered; |
6152 | int n = (i + j) % MAX_NUMNODES; | 6162 | int n = (i + j) % MAX_NUMNODES; |
6153 | 6163 | ||
6154 | cpus_complement(notcovered, covered); | 6164 | cpus_complement(notcovered, covered); |
6155 | cpus_and(tmp, notcovered, *cpu_map); | 6165 | cpus_and(tmp, notcovered, *cpu_map); |
6156 | cpus_and(tmp, tmp, domainspan); | 6166 | cpus_and(tmp, tmp, domainspan); |
6157 | if (cpus_empty(tmp)) | 6167 | if (cpus_empty(tmp)) |
6158 | break; | 6168 | break; |
6159 | 6169 | ||
6160 | nodemask = node_to_cpumask(n); | 6170 | nodemask = node_to_cpumask(n); |
6161 | cpus_and(tmp, tmp, nodemask); | 6171 | cpus_and(tmp, tmp, nodemask); |
6162 | if (cpus_empty(tmp)) | 6172 | if (cpus_empty(tmp)) |
6163 | continue; | 6173 | continue; |
6164 | 6174 | ||
6165 | sg = kmalloc_node(sizeof(struct sched_group), | 6175 | sg = kmalloc_node(sizeof(struct sched_group), |
6166 | GFP_KERNEL, i); | 6176 | GFP_KERNEL, i); |
6167 | if (!sg) { | 6177 | if (!sg) { |
6168 | printk(KERN_WARNING | 6178 | printk(KERN_WARNING |
6169 | "Can not alloc domain group for node %d\n", j); | 6179 | "Can not alloc domain group for node %d\n", j); |
6170 | goto error; | 6180 | goto error; |
6171 | } | 6181 | } |
6172 | sg->__cpu_power = 0; | 6182 | sg->__cpu_power = 0; |
6173 | sg->cpumask = tmp; | 6183 | sg->cpumask = tmp; |
6174 | sg->next = prev->next; | 6184 | sg->next = prev->next; |
6175 | cpus_or(covered, covered, tmp); | 6185 | cpus_or(covered, covered, tmp); |
6176 | prev->next = sg; | 6186 | prev->next = sg; |
6177 | prev = sg; | 6187 | prev = sg; |
6178 | } | 6188 | } |
6179 | } | 6189 | } |
6180 | #endif | 6190 | #endif |
6181 | 6191 | ||
6182 | /* Calculate CPU power for physical packages and nodes */ | 6192 | /* Calculate CPU power for physical packages and nodes */ |
6183 | #ifdef CONFIG_SCHED_SMT | 6193 | #ifdef CONFIG_SCHED_SMT |
6184 | for_each_cpu_mask(i, *cpu_map) { | 6194 | for_each_cpu_mask(i, *cpu_map) { |
6185 | struct sched_domain *sd = &per_cpu(cpu_domains, i); | 6195 | struct sched_domain *sd = &per_cpu(cpu_domains, i); |
6186 | 6196 | ||
6187 | init_sched_groups_power(i, sd); | 6197 | init_sched_groups_power(i, sd); |
6188 | } | 6198 | } |
6189 | #endif | 6199 | #endif |
6190 | #ifdef CONFIG_SCHED_MC | 6200 | #ifdef CONFIG_SCHED_MC |
6191 | for_each_cpu_mask(i, *cpu_map) { | 6201 | for_each_cpu_mask(i, *cpu_map) { |
6192 | struct sched_domain *sd = &per_cpu(core_domains, i); | 6202 | struct sched_domain *sd = &per_cpu(core_domains, i); |
6193 | 6203 | ||
6194 | init_sched_groups_power(i, sd); | 6204 | init_sched_groups_power(i, sd); |
6195 | } | 6205 | } |
6196 | #endif | 6206 | #endif |
6197 | 6207 | ||
6198 | for_each_cpu_mask(i, *cpu_map) { | 6208 | for_each_cpu_mask(i, *cpu_map) { |
6199 | struct sched_domain *sd = &per_cpu(phys_domains, i); | 6209 | struct sched_domain *sd = &per_cpu(phys_domains, i); |
6200 | 6210 | ||
6201 | init_sched_groups_power(i, sd); | 6211 | init_sched_groups_power(i, sd); |
6202 | } | 6212 | } |
6203 | 6213 | ||
6204 | #ifdef CONFIG_NUMA | 6214 | #ifdef CONFIG_NUMA |
6205 | for (i = 0; i < MAX_NUMNODES; i++) | 6215 | for (i = 0; i < MAX_NUMNODES; i++) |
6206 | init_numa_sched_groups_power(sched_group_nodes[i]); | 6216 | init_numa_sched_groups_power(sched_group_nodes[i]); |
6207 | 6217 | ||
6208 | if (sd_allnodes) { | 6218 | if (sd_allnodes) { |
6209 | struct sched_group *sg; | 6219 | struct sched_group *sg; |
6210 | 6220 | ||
6211 | cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); | 6221 | cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); |
6212 | init_numa_sched_groups_power(sg); | 6222 | init_numa_sched_groups_power(sg); |
6213 | } | 6223 | } |
6214 | #endif | 6224 | #endif |
6215 | 6225 | ||
6216 | /* Attach the domains */ | 6226 | /* Attach the domains */ |
6217 | for_each_cpu_mask(i, *cpu_map) { | 6227 | for_each_cpu_mask(i, *cpu_map) { |
6218 | struct sched_domain *sd; | 6228 | struct sched_domain *sd; |
6219 | #ifdef CONFIG_SCHED_SMT | 6229 | #ifdef CONFIG_SCHED_SMT |
6220 | sd = &per_cpu(cpu_domains, i); | 6230 | sd = &per_cpu(cpu_domains, i); |
6221 | #elif defined(CONFIG_SCHED_MC) | 6231 | #elif defined(CONFIG_SCHED_MC) |
6222 | sd = &per_cpu(core_domains, i); | 6232 | sd = &per_cpu(core_domains, i); |
6223 | #else | 6233 | #else |
6224 | sd = &per_cpu(phys_domains, i); | 6234 | sd = &per_cpu(phys_domains, i); |
6225 | #endif | 6235 | #endif |
6226 | cpu_attach_domain(sd, i); | 6236 | cpu_attach_domain(sd, i); |
6227 | } | 6237 | } |
6228 | 6238 | ||
6229 | return 0; | 6239 | return 0; |
6230 | 6240 | ||
6231 | #ifdef CONFIG_NUMA | 6241 | #ifdef CONFIG_NUMA |
6232 | error: | 6242 | error: |
6233 | free_sched_groups(cpu_map); | 6243 | free_sched_groups(cpu_map); |
6234 | return -ENOMEM; | 6244 | return -ENOMEM; |
6235 | #endif | 6245 | #endif |
6236 | } | 6246 | } |
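Note the linking idiom used throughout the per-CPU loop above: the previously built level is kept in p, the next (finer) level is initialized into sd, and the two are tied together with sd->parent = p and p->child = sd, leaving sd pointing at the most fine-grained domain that cpu_attach_domain() later receives. A minimal sketch of that idiom in isolation (the level names are illustrative):

#include <stdio.h>

struct level {
    const char *name;
    struct level *parent, *child;
};

int main(void)
{
    struct level node = { "node", NULL, NULL };
    struct level phys = { "phys", NULL, NULL };
    struct level smt  = { "smt",  NULL, NULL };
    struct level *levels[] = { &node, &phys, &smt };
    struct level *p = NULL, *sd = NULL;

    /* Same idiom as the build loop: p is the level built just before sd. */
    for (int i = 0; i < 3; i++) {
        p = sd;
        sd = levels[i];
        sd->parent = p;
        if (p)
            p->child = sd;
    }

    /* sd now points at the base (finest) domain; walk upward. */
    for (struct level *d = sd; d; d = d->parent)
        printf("%s\n", d->name);      /* smt, phys, node */
    return 0;
}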
6237 | /* | 6247 | /* |
6238 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 6248 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
6239 | */ | 6249 | */ |
6240 | static int arch_init_sched_domains(const cpumask_t *cpu_map) | 6250 | static int arch_init_sched_domains(const cpumask_t *cpu_map) |
6241 | { | 6251 | { |
6242 | cpumask_t cpu_default_map; | 6252 | cpumask_t cpu_default_map; |
6243 | int err; | 6253 | int err; |
6244 | 6254 | ||
6245 | /* | 6255 | /* |
6246 | * Setup mask for cpus without special case scheduling requirements. | 6256 | * Setup mask for cpus without special case scheduling requirements. |
6247 | * For now this just excludes isolated cpus, but could be used to | 6257 | * For now this just excludes isolated cpus, but could be used to |
6248 | * exclude other special cases in the future. | 6258 | * exclude other special cases in the future. |
6249 | */ | 6259 | */ |
6250 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); | 6260 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); |
6251 | 6261 | ||
6252 | err = build_sched_domains(&cpu_default_map); | 6262 | err = build_sched_domains(&cpu_default_map); |
6253 | 6263 | ||
6254 | return err; | 6264 | return err; |
6255 | } | 6265 | } |
6256 | 6266 | ||
6257 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | 6267 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) |
6258 | { | 6268 | { |
6259 | free_sched_groups(cpu_map); | 6269 | free_sched_groups(cpu_map); |
6260 | } | 6270 | } |
6261 | 6271 | ||
6262 | /* | 6272 | /* |
6263 | * Detach sched domains from a group of cpus specified in cpu_map | 6273 | * Detach sched domains from a group of cpus specified in cpu_map |
6264 | * These cpus will now be attached to the NULL domain | 6274 | * These cpus will now be attached to the NULL domain |
6265 | */ | 6275 | */ |
6266 | static void detach_destroy_domains(const cpumask_t *cpu_map) | 6276 | static void detach_destroy_domains(const cpumask_t *cpu_map) |
6267 | { | 6277 | { |
6268 | int i; | 6278 | int i; |
6269 | 6279 | ||
6270 | for_each_cpu_mask(i, *cpu_map) | 6280 | for_each_cpu_mask(i, *cpu_map) |
6271 | cpu_attach_domain(NULL, i); | 6281 | cpu_attach_domain(NULL, i); |
6272 | synchronize_sched(); | 6282 | synchronize_sched(); |
6273 | arch_destroy_sched_domains(cpu_map); | 6283 | arch_destroy_sched_domains(cpu_map); |
6274 | } | 6284 | } |
6275 | 6285 | ||
6276 | /* | 6286 | /* |
6277 | * Partition sched domains as specified by the cpumasks below. | 6287 | * Partition sched domains as specified by the cpumasks below. |
6278 | * This attaches all cpus from the cpumasks to the NULL domain, | 6288 | * This attaches all cpus from the cpumasks to the NULL domain, |
6279 | * waits for a RCU quiescent period, recalculates sched | 6289 | * waits for a RCU quiescent period, recalculates sched |
6280 | * domain information and then attaches them back to the | 6290 | * domain information and then attaches them back to the |
6281 | * correct sched domains | 6291 | * correct sched domains |
6282 | * Call with hotplug lock held | 6292 | * Call with hotplug lock held |
6283 | */ | 6293 | */ |
6284 | int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | 6294 | int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) |
6285 | { | 6295 | { |
6286 | cpumask_t change_map; | 6296 | cpumask_t change_map; |
6287 | int err = 0; | 6297 | int err = 0; |
6288 | 6298 | ||
6289 | cpus_and(*partition1, *partition1, cpu_online_map); | 6299 | cpus_and(*partition1, *partition1, cpu_online_map); |
6290 | cpus_and(*partition2, *partition2, cpu_online_map); | 6300 | cpus_and(*partition2, *partition2, cpu_online_map); |
6291 | cpus_or(change_map, *partition1, *partition2); | 6301 | cpus_or(change_map, *partition1, *partition2); |
6292 | 6302 | ||
6293 | /* Detach sched domains from all of the affected cpus */ | 6303 | /* Detach sched domains from all of the affected cpus */ |
6294 | detach_destroy_domains(&change_map); | 6304 | detach_destroy_domains(&change_map); |
6295 | if (!cpus_empty(*partition1)) | 6305 | if (!cpus_empty(*partition1)) |
6296 | err = build_sched_domains(partition1); | 6306 | err = build_sched_domains(partition1); |
6297 | if (!err && !cpus_empty(*partition2)) | 6307 | if (!err && !cpus_empty(*partition2)) |
6298 | err = build_sched_domains(partition2); | 6308 | err = build_sched_domains(partition2); |
6299 | 6309 | ||
6300 | return err; | 6310 | return err; |
6301 | } | 6311 | } |
6302 | 6312 | ||
6303 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 6313 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
6304 | int arch_reinit_sched_domains(void) | 6314 | int arch_reinit_sched_domains(void) |
6305 | { | 6315 | { |
6306 | int err; | 6316 | int err; |
6307 | 6317 | ||
6308 | mutex_lock(&sched_hotcpu_mutex); | 6318 | mutex_lock(&sched_hotcpu_mutex); |
6309 | detach_destroy_domains(&cpu_online_map); | 6319 | detach_destroy_domains(&cpu_online_map); |
6310 | err = arch_init_sched_domains(&cpu_online_map); | 6320 | err = arch_init_sched_domains(&cpu_online_map); |
6311 | mutex_unlock(&sched_hotcpu_mutex); | 6321 | mutex_unlock(&sched_hotcpu_mutex); |
6312 | 6322 | ||
6313 | return err; | 6323 | return err; |
6314 | } | 6324 | } |
6315 | 6325 | ||
6316 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | 6326 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) |
6317 | { | 6327 | { |
6318 | int ret; | 6328 | int ret; |
6319 | 6329 | ||
6320 | if (buf[0] != '0' && buf[0] != '1') | 6330 | if (buf[0] != '0' && buf[0] != '1') |
6321 | return -EINVAL; | 6331 | return -EINVAL; |
6322 | 6332 | ||
6323 | if (smt) | 6333 | if (smt) |
6324 | sched_smt_power_savings = (buf[0] == '1'); | 6334 | sched_smt_power_savings = (buf[0] == '1'); |
6325 | else | 6335 | else |
6326 | sched_mc_power_savings = (buf[0] == '1'); | 6336 | sched_mc_power_savings = (buf[0] == '1'); |
6327 | 6337 | ||
6328 | ret = arch_reinit_sched_domains(); | 6338 | ret = arch_reinit_sched_domains(); |
6329 | 6339 | ||
6330 | return ret ? ret : count; | 6340 | return ret ? ret : count; |
6331 | } | 6341 | } |
6332 | 6342 | ||
6333 | int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | 6343 | int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) |
6334 | { | 6344 | { |
6335 | int err = 0; | 6345 | int err = 0; |
6336 | 6346 | ||
6337 | #ifdef CONFIG_SCHED_SMT | 6347 | #ifdef CONFIG_SCHED_SMT |
6338 | if (smt_capable()) | 6348 | if (smt_capable()) |
6339 | err = sysfs_create_file(&cls->kset.kobj, | 6349 | err = sysfs_create_file(&cls->kset.kobj, |
6340 | &attr_sched_smt_power_savings.attr); | 6350 | &attr_sched_smt_power_savings.attr); |
6341 | #endif | 6351 | #endif |
6342 | #ifdef CONFIG_SCHED_MC | 6352 | #ifdef CONFIG_SCHED_MC |
6343 | if (!err && mc_capable()) | 6353 | if (!err && mc_capable()) |
6344 | err = sysfs_create_file(&cls->kset.kobj, | 6354 | err = sysfs_create_file(&cls->kset.kobj, |
6345 | &attr_sched_mc_power_savings.attr); | 6355 | &attr_sched_mc_power_savings.attr); |
6346 | #endif | 6356 | #endif |
6347 | return err; | 6357 | return err; |
6348 | } | 6358 | } |
6349 | #endif | 6359 | #endif |
6350 | 6360 | ||
6351 | #ifdef CONFIG_SCHED_MC | 6361 | #ifdef CONFIG_SCHED_MC |
6352 | static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) | 6362 | static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) |
6353 | { | 6363 | { |
6354 | return sprintf(page, "%u\n", sched_mc_power_savings); | 6364 | return sprintf(page, "%u\n", sched_mc_power_savings); |
6355 | } | 6365 | } |
6356 | static ssize_t sched_mc_power_savings_store(struct sys_device *dev, | 6366 | static ssize_t sched_mc_power_savings_store(struct sys_device *dev, |
6357 | const char *buf, size_t count) | 6367 | const char *buf, size_t count) |
6358 | { | 6368 | { |
6359 | return sched_power_savings_store(buf, count, 0); | 6369 | return sched_power_savings_store(buf, count, 0); |
6360 | } | 6370 | } |
6361 | SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, | 6371 | SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, |
6362 | sched_mc_power_savings_store); | 6372 | sched_mc_power_savings_store); |
6363 | #endif | 6373 | #endif |
6364 | 6374 | ||
6365 | #ifdef CONFIG_SCHED_SMT | 6375 | #ifdef CONFIG_SCHED_SMT |
6366 | static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) | 6376 | static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) |
6367 | { | 6377 | { |
6368 | return sprintf(page, "%u\n", sched_smt_power_savings); | 6378 | return sprintf(page, "%u\n", sched_smt_power_savings); |
6369 | } | 6379 | } |
6370 | static ssize_t sched_smt_power_savings_store(struct sys_device *dev, | 6380 | static ssize_t sched_smt_power_savings_store(struct sys_device *dev, |
6371 | const char *buf, size_t count) | 6381 | const char *buf, size_t count) |
6372 | { | 6382 | { |
6373 | return sched_power_savings_store(buf, count, 1); | 6383 | return sched_power_savings_store(buf, count, 1); |
6374 | } | 6384 | } |
6375 | SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, | 6385 | SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, |
6376 | sched_smt_power_savings_store); | 6386 | sched_smt_power_savings_store); |
6377 | #endif | 6387 | #endif |
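Assuming the standard cpu sysdev class is what gets passed to sched_create_sysfs_power_savings_entries(), these attributes should surface as /sys/devices/system/cpu/sched_mc_power_savings and /sys/devices/system/cpu/sched_smt_power_savings; writing '0' or '1' to either goes through sched_power_savings_store() above and triggers the full arch_reinit_sched_domains() rebuild.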
6378 | 6388 | ||
6379 | /* | 6389 | /* |
6380 | * Force a reinitialization of the sched domains hierarchy. The domains | 6390 | * Force a reinitialization of the sched domains hierarchy. The domains |
6381 | * and groups cannot be updated in place without racing with the balancing | 6391 | * and groups cannot be updated in place without racing with the balancing |
6382 | * code, so we temporarily attach all running cpus to the NULL domain | 6392 | * code, so we temporarily attach all running cpus to the NULL domain |
6383 | * which will prevent rebalancing while the sched domains are recalculated. | 6393 | * which will prevent rebalancing while the sched domains are recalculated. |
6384 | */ | 6394 | */ |
6385 | static int update_sched_domains(struct notifier_block *nfb, | 6395 | static int update_sched_domains(struct notifier_block *nfb, |
6386 | unsigned long action, void *hcpu) | 6396 | unsigned long action, void *hcpu) |
6387 | { | 6397 | { |
6388 | switch (action) { | 6398 | switch (action) { |
6389 | case CPU_UP_PREPARE: | 6399 | case CPU_UP_PREPARE: |
6390 | case CPU_UP_PREPARE_FROZEN: | 6400 | case CPU_UP_PREPARE_FROZEN: |
6391 | case CPU_DOWN_PREPARE: | 6401 | case CPU_DOWN_PREPARE: |
6392 | case CPU_DOWN_PREPARE_FROZEN: | 6402 | case CPU_DOWN_PREPARE_FROZEN: |
6393 | detach_destroy_domains(&cpu_online_map); | 6403 | detach_destroy_domains(&cpu_online_map); |
6394 | return NOTIFY_OK; | 6404 | return NOTIFY_OK; |
6395 | 6405 | ||
6396 | case CPU_UP_CANCELED: | 6406 | case CPU_UP_CANCELED: |
6397 | case CPU_UP_CANCELED_FROZEN: | 6407 | case CPU_UP_CANCELED_FROZEN: |
6398 | case CPU_DOWN_FAILED: | 6408 | case CPU_DOWN_FAILED: |
6399 | case CPU_DOWN_FAILED_FROZEN: | 6409 | case CPU_DOWN_FAILED_FROZEN: |
6400 | case CPU_ONLINE: | 6410 | case CPU_ONLINE: |
6401 | case CPU_ONLINE_FROZEN: | 6411 | case CPU_ONLINE_FROZEN: |
6402 | case CPU_DEAD: | 6412 | case CPU_DEAD: |
6403 | case CPU_DEAD_FROZEN: | 6413 | case CPU_DEAD_FROZEN: |
6404 | /* | 6414 | /* |
6405 | * Fall through and re-initialise the domains. | 6415 | * Fall through and re-initialise the domains. |
6406 | */ | 6416 | */ |
6407 | break; | 6417 | break; |
6408 | default: | 6418 | default: |
6409 | return NOTIFY_DONE; | 6419 | return NOTIFY_DONE; |
6410 | } | 6420 | } |
6411 | 6421 | ||
6412 | /* The hotplug lock is already held by cpu_up/cpu_down */ | 6422 | /* The hotplug lock is already held by cpu_up/cpu_down */ |
6413 | arch_init_sched_domains(&cpu_online_map); | 6423 | arch_init_sched_domains(&cpu_online_map); |
6414 | 6424 | ||
6415 | return NOTIFY_OK; | 6425 | return NOTIFY_OK; |
6416 | } | 6426 | } |
6417 | 6427 | ||
6418 | void __init sched_init_smp(void) | 6428 | void __init sched_init_smp(void) |
6419 | { | 6429 | { |
6420 | cpumask_t non_isolated_cpus; | 6430 | cpumask_t non_isolated_cpus; |
6421 | 6431 | ||
6422 | mutex_lock(&sched_hotcpu_mutex); | 6432 | mutex_lock(&sched_hotcpu_mutex); |
6423 | arch_init_sched_domains(&cpu_online_map); | 6433 | arch_init_sched_domains(&cpu_online_map); |
6424 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); | 6434 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); |
6425 | if (cpus_empty(non_isolated_cpus)) | 6435 | if (cpus_empty(non_isolated_cpus)) |
6426 | cpu_set(smp_processor_id(), non_isolated_cpus); | 6436 | cpu_set(smp_processor_id(), non_isolated_cpus); |
6427 | mutex_unlock(&sched_hotcpu_mutex); | 6437 | mutex_unlock(&sched_hotcpu_mutex); |
6428 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 6438 | /* XXX: Theoretical race here - CPU may be hotplugged now */ |
6429 | hotcpu_notifier(update_sched_domains, 0); | 6439 | hotcpu_notifier(update_sched_domains, 0); |
6430 | 6440 | ||
6431 | init_sched_domain_sysctl(); | 6441 | init_sched_domain_sysctl(); |
6432 | 6442 | ||
6433 | /* Move init over to a non-isolated CPU */ | 6443 | /* Move init over to a non-isolated CPU */ |
6434 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) | 6444 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) |
6435 | BUG(); | 6445 | BUG(); |
6436 | sched_init_granularity(); | 6446 | sched_init_granularity(); |
6437 | } | 6447 | } |
6438 | #else | 6448 | #else |
6439 | void __init sched_init_smp(void) | 6449 | void __init sched_init_smp(void) |
6440 | { | 6450 | { |
6441 | sched_init_granularity(); | 6451 | sched_init_granularity(); |
6442 | } | 6452 | } |
6443 | #endif /* CONFIG_SMP */ | 6453 | #endif /* CONFIG_SMP */ |
6444 | 6454 | ||
6445 | int in_sched_functions(unsigned long addr) | 6455 | int in_sched_functions(unsigned long addr) |
6446 | { | 6456 | { |
6447 | /* Linker adds these: start and end of __sched functions */ | 6457 | /* Linker adds these: start and end of __sched functions */ |
6448 | extern char __sched_text_start[], __sched_text_end[]; | 6458 | extern char __sched_text_start[], __sched_text_end[]; |
6449 | 6459 | ||
6450 | return in_lock_functions(addr) || | 6460 | return in_lock_functions(addr) || |
6451 | (addr >= (unsigned long)__sched_text_start | 6461 | (addr >= (unsigned long)__sched_text_start |
6452 | && addr < (unsigned long)__sched_text_end); | 6462 | && addr < (unsigned long)__sched_text_end); |
6453 | } | 6463 | } |
6454 | 6464 | ||
6455 | static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | 6465 | static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) |
6456 | { | 6466 | { |
6457 | cfs_rq->tasks_timeline = RB_ROOT; | 6467 | cfs_rq->tasks_timeline = RB_ROOT; |
6458 | cfs_rq->fair_clock = 1; | 6468 | cfs_rq->fair_clock = 1; |
6459 | #ifdef CONFIG_FAIR_GROUP_SCHED | 6469 | #ifdef CONFIG_FAIR_GROUP_SCHED |
6460 | cfs_rq->rq = rq; | 6470 | cfs_rq->rq = rq; |
6461 | #endif | 6471 | #endif |
6462 | } | 6472 | } |
6463 | 6473 | ||
6464 | void __init sched_init(void) | 6474 | void __init sched_init(void) |
6465 | { | 6475 | { |
6466 | u64 now = sched_clock(); | 6476 | u64 now = sched_clock(); |
6467 | int highest_cpu = 0; | 6477 | int highest_cpu = 0; |
6468 | int i, j; | 6478 | int i, j; |
6469 | 6479 | ||
6470 | /* | 6480 | /* |
6471 | * Link up the scheduling class hierarchy: | 6481 | * Link up the scheduling class hierarchy: |
6472 | */ | 6482 | */ |
6473 | rt_sched_class.next = &fair_sched_class; | 6483 | rt_sched_class.next = &fair_sched_class; |
6474 | fair_sched_class.next = &idle_sched_class; | 6484 | fair_sched_class.next = &idle_sched_class; |
6475 | idle_sched_class.next = NULL; | 6485 | idle_sched_class.next = NULL; |
6476 | 6486 | ||
6477 | for_each_possible_cpu(i) { | 6487 | for_each_possible_cpu(i) { |
6478 | struct rt_prio_array *array; | 6488 | struct rt_prio_array *array; |
6479 | struct rq *rq; | 6489 | struct rq *rq; |
6480 | 6490 | ||
6481 | rq = cpu_rq(i); | 6491 | rq = cpu_rq(i); |
6482 | spin_lock_init(&rq->lock); | 6492 | spin_lock_init(&rq->lock); |
6483 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); | 6493 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); |
6484 | rq->nr_running = 0; | 6494 | rq->nr_running = 0; |
6485 | rq->clock = 1; | 6495 | rq->clock = 1; |
6486 | init_cfs_rq(&rq->cfs, rq); | 6496 | init_cfs_rq(&rq->cfs, rq); |
6487 | #ifdef CONFIG_FAIR_GROUP_SCHED | 6497 | #ifdef CONFIG_FAIR_GROUP_SCHED |
6488 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 6498 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
6489 | list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | 6499 | list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); |
6490 | #endif | 6500 | #endif |
6491 | rq->ls.load_update_last = now; | 6501 | rq->ls.load_update_last = now; |
6492 | rq->ls.load_update_start = now; | 6502 | rq->ls.load_update_start = now; |
6493 | 6503 | ||
6494 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 6504 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
6495 | rq->cpu_load[j] = 0; | 6505 | rq->cpu_load[j] = 0; |
6496 | #ifdef CONFIG_SMP | 6506 | #ifdef CONFIG_SMP |
6497 | rq->sd = NULL; | 6507 | rq->sd = NULL; |
6498 | rq->active_balance = 0; | 6508 | rq->active_balance = 0; |
6499 | rq->next_balance = jiffies; | 6509 | rq->next_balance = jiffies; |
6500 | rq->push_cpu = 0; | 6510 | rq->push_cpu = 0; |
6501 | rq->cpu = i; | 6511 | rq->cpu = i; |
6502 | rq->migration_thread = NULL; | 6512 | rq->migration_thread = NULL; |
6503 | INIT_LIST_HEAD(&rq->migration_queue); | 6513 | INIT_LIST_HEAD(&rq->migration_queue); |
6504 | #endif | 6514 | #endif |
6505 | atomic_set(&rq->nr_iowait, 0); | 6515 | atomic_set(&rq->nr_iowait, 0); |
6506 | 6516 | ||
6507 | array = &rq->rt.active; | 6517 | array = &rq->rt.active; |
6508 | for (j = 0; j < MAX_RT_PRIO; j++) { | 6518 | for (j = 0; j < MAX_RT_PRIO; j++) { |
6509 | INIT_LIST_HEAD(array->queue + j); | 6519 | INIT_LIST_HEAD(array->queue + j); |
6510 | __clear_bit(j, array->bitmap); | 6520 | __clear_bit(j, array->bitmap); |
6511 | } | 6521 | } |
6512 | highest_cpu = i; | 6522 | highest_cpu = i; |
6513 | /* delimiter for bitsearch: */ | 6523 | /* delimiter for bitsearch: */ |
6514 | __set_bit(MAX_RT_PRIO, array->bitmap); | 6524 | __set_bit(MAX_RT_PRIO, array->bitmap); |
6515 | } | 6525 | } |
6516 | 6526 | ||
6517 | set_load_weight(&init_task); | 6527 | set_load_weight(&init_task); |
6518 | 6528 | ||
6519 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 6529 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
6520 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); | 6530 | INIT_HLIST_HEAD(&init_task.preempt_notifiers); |
6521 | #endif | 6531 | #endif |
6522 | 6532 | ||
6523 | #ifdef CONFIG_SMP | 6533 | #ifdef CONFIG_SMP |
6524 | nr_cpu_ids = highest_cpu + 1; | 6534 | nr_cpu_ids = highest_cpu + 1; |
6525 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); | 6535 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); |
6526 | #endif | 6536 | #endif |
6527 | 6537 | ||
6528 | #ifdef CONFIG_RT_MUTEXES | 6538 | #ifdef CONFIG_RT_MUTEXES |
6529 | plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); | 6539 | plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); |
6530 | #endif | 6540 | #endif |
6531 | 6541 | ||
6532 | /* | 6542 | /* |
6533 | * The boot idle thread does lazy MMU switching as well: | 6543 | * The boot idle thread does lazy MMU switching as well: |
6534 | */ | 6544 | */ |
6535 | atomic_inc(&init_mm.mm_count); | 6545 | atomic_inc(&init_mm.mm_count); |
6536 | enter_lazy_tlb(&init_mm, current); | 6546 | enter_lazy_tlb(&init_mm, current); |
6537 | 6547 | ||
6538 | /* | 6548 | /* |
6539 | * Make us the idle thread. Technically, schedule() should not be | 6549 | * Make us the idle thread. Technically, schedule() should not be |
6540 | * called from this thread, however somewhere below it might be, | 6550 | * called from this thread, however somewhere below it might be, |
6541 | * but because we are the idle thread, we just pick up running again | 6551 | * but because we are the idle thread, we just pick up running again |
6542 | * when this runqueue becomes "idle". | 6552 | * when this runqueue becomes "idle". |
6543 | */ | 6553 | */ |
6544 | init_idle(current, smp_processor_id()); | 6554 | init_idle(current, smp_processor_id()); |
6545 | /* | 6555 | /* |
6546 | * During early bootup we pretend to be a normal task: | 6556 | * During early bootup we pretend to be a normal task: |
6547 | */ | 6557 | */ |
6548 | current->sched_class = &fair_sched_class; | 6558 | current->sched_class = &fair_sched_class; |
6549 | } | 6559 | } |
6550 | 6560 | ||
6551 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 6561 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
6552 | void __might_sleep(char *file, int line) | 6562 | void __might_sleep(char *file, int line) |
6553 | { | 6563 | { |
6554 | #ifdef in_atomic | 6564 | #ifdef in_atomic |
6555 | static unsigned long prev_jiffy; /* ratelimiting */ | 6565 | static unsigned long prev_jiffy; /* ratelimiting */ |
6556 | 6566 | ||
6557 | if ((in_atomic() || irqs_disabled()) && | 6567 | if ((in_atomic() || irqs_disabled()) && |
6558 | system_state == SYSTEM_RUNNING && !oops_in_progress) { | 6568 | system_state == SYSTEM_RUNNING && !oops_in_progress) { |
6559 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) | 6569 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) |
6560 | return; | 6570 | return; |
6561 | prev_jiffy = jiffies; | 6571 | prev_jiffy = jiffies; |
6562 | printk(KERN_ERR "BUG: sleeping function called from invalid" | 6572 | printk(KERN_ERR "BUG: sleeping function called from invalid" |
6563 | " context at %s:%d\n", file, line); | 6573 | " context at %s:%d\n", file, line); |
6564 | printk("in_atomic():%d, irqs_disabled():%d\n", | 6574 | printk("in_atomic():%d, irqs_disabled():%d\n", |
6565 | in_atomic(), irqs_disabled()); | 6575 | in_atomic(), irqs_disabled()); |
6566 | debug_show_held_locks(current); | 6576 | debug_show_held_locks(current); |
6567 | if (irqs_disabled()) | 6577 | if (irqs_disabled()) |
6568 | print_irqtrace_events(current); | 6578 | print_irqtrace_events(current); |
6569 | dump_stack(); | 6579 | dump_stack(); |
6570 | } | 6580 | } |
6571 | #endif | 6581 | #endif |
6572 | } | 6582 | } |
6573 | EXPORT_SYMBOL(__might_sleep); | 6583 | EXPORT_SYMBOL(__might_sleep); |
6574 | #endif | 6584 | #endif |
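__might_sleep() backs the might_sleep() debug check; the classic way to trigger the "BUG: sleeping function called from invalid context" report above is to call a sleeping primitive while atomic. A minimal illustrative sketch (not from this patch), assuming CONFIG_DEBUG_SPINLOCK_SLEEP and a preemptible kernel so that spin_lock() raises the preempt count:

    #include <linux/spinlock.h>
    #include <linux/mutex.h>

    static DEFINE_SPINLOCK(example_lock);
    static DEFINE_MUTEX(example_mutex);

    static void buggy_example(void)
    {
            spin_lock(&example_lock);       /* enter atomic context */
            mutex_lock(&example_mutex);     /* mutex_lock() calls might_sleep(),
                                             * so __might_sleep() prints the
                                             * report and a stack dump */
            mutex_unlock(&example_mutex);
            spin_unlock(&example_lock);
    }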
6575 | 6585 | ||
6576 | #ifdef CONFIG_MAGIC_SYSRQ | 6586 | #ifdef CONFIG_MAGIC_SYSRQ |
6577 | void normalize_rt_tasks(void) | 6587 | void normalize_rt_tasks(void) |
6578 | { | 6588 | { |
6579 | struct task_struct *g, *p; | 6589 | struct task_struct *g, *p; |
6580 | unsigned long flags; | 6590 | unsigned long flags; |
6581 | struct rq *rq; | 6591 | struct rq *rq; |
6582 | int on_rq; | 6592 | int on_rq; |
6583 | 6593 | ||
6584 | read_lock_irq(&tasklist_lock); | 6594 | read_lock_irq(&tasklist_lock); |
6585 | do_each_thread(g, p) { | 6595 | do_each_thread(g, p) { |
6586 | p->se.fair_key = 0; | 6596 | p->se.fair_key = 0; |
6587 | p->se.wait_runtime = 0; | 6597 | p->se.wait_runtime = 0; |
6588 | p->se.exec_start = 0; | 6598 | p->se.exec_start = 0; |
6589 | p->se.wait_start_fair = 0; | 6599 | p->se.wait_start_fair = 0; |
6590 | p->se.sleep_start_fair = 0; | 6600 | p->se.sleep_start_fair = 0; |
6591 | #ifdef CONFIG_SCHEDSTATS | 6601 | #ifdef CONFIG_SCHEDSTATS |
6592 | p->se.wait_start = 0; | 6602 | p->se.wait_start = 0; |
6593 | p->se.sleep_start = 0; | 6603 | p->se.sleep_start = 0; |
6594 | p->se.block_start = 0; | 6604 | p->se.block_start = 0; |
6595 | #endif | 6605 | #endif |
6596 | task_rq(p)->cfs.fair_clock = 0; | 6606 | task_rq(p)->cfs.fair_clock = 0; |
6597 | task_rq(p)->clock = 0; | 6607 | task_rq(p)->clock = 0; |
6598 | 6608 | ||
6599 | if (!rt_task(p)) { | 6609 | if (!rt_task(p)) { |
6600 | /* | 6610 | /* |
6601 | * Renice negative nice level userspace | 6611 | * Renice negative nice level userspace |
6602 | * tasks back to 0: | 6612 | * tasks back to 0: |
6603 | */ | 6613 | */ |
6604 | if (TASK_NICE(p) < 0 && p->mm) | 6614 | if (TASK_NICE(p) < 0 && p->mm) |
6605 | set_user_nice(p, 0); | 6615 | set_user_nice(p, 0); |
6606 | continue; | 6616 | continue; |
6607 | } | 6617 | } |
6608 | 6618 | ||
6609 | spin_lock_irqsave(&p->pi_lock, flags); | 6619 | spin_lock_irqsave(&p->pi_lock, flags); |
6610 | rq = __task_rq_lock(p); | 6620 | rq = __task_rq_lock(p); |
6611 | #ifdef CONFIG_SMP | 6621 | #ifdef CONFIG_SMP |
6612 | /* | 6622 | /* |
6613 | * Do not touch the migration thread: | 6623 | * Do not touch the migration thread: |
6614 | */ | 6624 | */ |
6615 | if (p == rq->migration_thread) | 6625 | if (p == rq->migration_thread) |
6616 | goto out_unlock; | 6626 | goto out_unlock; |
6617 | #endif | 6627 | #endif |
6618 | 6628 | ||
6619 | on_rq = p->se.on_rq; | 6629 | on_rq = p->se.on_rq; |
6620 | if (on_rq) | 6630 | if (on_rq) |
6621 | deactivate_task(task_rq(p), p, 0); | 6631 | deactivate_task(task_rq(p), p, 0); |
6622 | __setscheduler(rq, p, SCHED_NORMAL, 0); | 6632 | __setscheduler(rq, p, SCHED_NORMAL, 0); |
6623 | if (on_rq) { | 6633 | if (on_rq) { |
6624 | activate_task(task_rq(p), p, 0); | 6634 | activate_task(task_rq(p), p, 0); |
6625 | resched_task(rq->curr); | 6635 | resched_task(rq->curr); |
6626 | } | 6636 | } |
6627 | #ifdef CONFIG_SMP | 6637 | #ifdef CONFIG_SMP |
6628 | out_unlock: | 6638 | out_unlock: |
6629 | #endif | 6639 | #endif |
6630 | __task_rq_unlock(rq); | 6640 | __task_rq_unlock(rq); |
6631 | spin_unlock_irqrestore(&p->pi_lock, flags); | 6641 | spin_unlock_irqrestore(&p->pi_lock, flags); |
6632 | } while_each_thread(g, p); | 6642 | } while_each_thread(g, p); |
6633 | 6643 | ||
6634 | read_unlock_irq(&tasklist_lock); | 6644 | read_unlock_irq(&tasklist_lock); |
6635 | } | 6645 | } |
6636 | 6646 | ||
6637 | #endif /* CONFIG_MAGIC_SYSRQ */ | 6647 | #endif /* CONFIG_MAGIC_SYSRQ */ |
6638 | 6648 | ||
6639 | #ifdef CONFIG_IA64 | 6649 | #ifdef CONFIG_IA64 |
6640 | /* | 6650 | /* |
6641 | * These functions are only useful for the IA64 MCA handling. | 6651 | * These functions are only useful for the IA64 MCA handling. |
6642 | * | 6652 | * |
6643 | * They can only be called when the whole system has been | 6653 | * They can only be called when the whole system has been |
6644 | * stopped - every CPU needs to be quiescent, and no scheduling | 6654 | * stopped - every CPU needs to be quiescent, and no scheduling |
6645 | * activity can take place. Using them for anything else would | 6655 | * activity can take place. Using them for anything else would |
6646 | * be a serious bug, and as a result, they aren't even visible | 6656 | * be a serious bug, and as a result, they aren't even visible |
6647 | * under any other configuration. | 6657 | * under any other configuration. |
6648 | */ | 6658 | */ |
6649 | 6659 | ||
6650 | /** | 6660 | /** |
6651 | * curr_task - return the current task for a given cpu. | 6661 | * curr_task - return the current task for a given cpu. |
6652 | * @cpu: the processor in question. | 6662 | * @cpu: the processor in question. |
6653 | * | 6663 | * |
6654 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 6664 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
6655 | */ | 6665 | */ |
6656 | struct task_struct *curr_task(int cpu) | 6666 | struct task_struct *curr_task(int cpu) |
6657 | { | 6667 | { |
6658 | return cpu_curr(cpu); | 6668 | return cpu_curr(cpu); |
6659 | } | 6669 | } |
6660 | 6670 | ||
6661 | /** | 6671 | /** |
6662 | * set_curr_task - set the current task for a given cpu. | 6672 | * set_curr_task - set the current task for a given cpu. |
6663 | * @cpu: the processor in question. | 6673 | * @cpu: the processor in question. |
6664 | * @p: the task pointer to set. | 6674 | * @p: the task pointer to set. |
6665 | * | 6675 | * |
6666 | * Description: This function must only be used when non-maskable interrupts | 6676 | * Description: This function must only be used when non-maskable interrupts |
6667 | * are serviced on a separate stack. It allows the architecture to switch the | 6677 | * are serviced on a separate stack. It allows the architecture to switch the |
6668 | * notion of the current task on a cpu in a non-blocking manner. This function | 6678 | * notion of the current task on a cpu in a non-blocking manner. This function |
6669 | * must be called with all CPUs synchronized and interrupts disabled; the | 6679 | * must be called with all CPUs synchronized and interrupts disabled; the |
6670 | * caller must save the original value of the current task (see | 6680 | * caller must save the original value of the current task (see |
6671 | * curr_task() above) and restore that value before reenabling interrupts and | 6681 | * curr_task() above) and restore that value before reenabling interrupts and |
6672 | * re-starting the system. | 6682 | * re-starting the system. |
kernel/sched_fair.c
1 | /* | 1 | /* |
2 | * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH) | 2 | * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH) |
3 | * | 3 | * |
4 | * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | 4 | * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> |
5 | * | 5 | * |
6 | * Interactivity improvements by Mike Galbraith | 6 | * Interactivity improvements by Mike Galbraith |
7 | * (C) 2007 Mike Galbraith <efault@gmx.de> | 7 | * (C) 2007 Mike Galbraith <efault@gmx.de> |
8 | * | 8 | * |
9 | * Various enhancements by Dmitry Adamushko. | 9 | * Various enhancements by Dmitry Adamushko. |
10 | * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com> | 10 | * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com> |
11 | * | 11 | * |
12 | * Group scheduling enhancements by Srivatsa Vaddagiri | 12 | * Group scheduling enhancements by Srivatsa Vaddagiri |
13 | * Copyright IBM Corporation, 2007 | 13 | * Copyright IBM Corporation, 2007 |
14 | * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> | 14 | * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> |
15 | * | 15 | * |
16 | * Scaled math optimizations by Thomas Gleixner | 16 | * Scaled math optimizations by Thomas Gleixner |
17 | * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> | 17 | * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> |
18 | */ | 18 | */ |
19 | 19 | ||
20 | /* | 20 | /* |
21 | * Preemption granularity: | 21 | * Preemption granularity: |
22 | * (default: 2 msec, units: nanoseconds) | 22 | * (default: 2 msec, units: nanoseconds) |
23 | * | 23 | * |
24 | * NOTE: this granularity value is not the same as the concept of | 24 | * NOTE: this granularity value is not the same as the concept of |
25 | * 'timeslice length' - timeslices in CFS will typically be somewhat | 25 | * 'timeslice length' - timeslices in CFS will typically be somewhat |
26 | * larger than this value. (to see the precise effective timeslice | 26 | * larger than this value. (to see the precise effective timeslice |
27 | * length of your workload, run vmstat and monitor the context-switches | 27 | * length of your workload, run vmstat and monitor the context-switches |
28 | * field) | 28 | * field) |
29 | * | 29 | * |
30 | * On SMP systems the value of this is multiplied by 1 + the log2 of the | 30 | * On SMP systems the value of this is multiplied by 1 + the log2 of the |
31 | * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way | 31 | * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way |
32 | * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) | 32 | * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) |
33 | */ | 33 | */ |
34 | unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ; | 34 | unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ; |
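The SMP scaling described in the comment is applied at boot time by code outside this hunk; a minimal sketch of that scaling as described, i.e. a factor of 1 + log2 of the CPU count, using the kernel's ilog2() and num_online_cpus() helpers. The real initialization code may differ in detail:

    #include <linux/log2.h>
    #include <linux/cpumask.h>

    /* Sketch only: 2x on 2-way, 3x on 4-way, 4x on 8-way, ... */
    static void __init scale_granularity_sketch(void)
    {
            unsigned int factor = 1 + ilog2(num_online_cpus());

            sysctl_sched_granularity *= factor;
    }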
35 | 35 | ||
36 | /* | 36 | /* |
37 | * SCHED_BATCH wake-up granularity. | 37 | * SCHED_BATCH wake-up granularity. |
38 | * (default: 10 msec, units: nanoseconds) | 38 | * (default: 10 msec, units: nanoseconds) |
39 | * | 39 | * |
40 | * This option delays the preemption effects of decoupled workloads | 40 | * This option delays the preemption effects of decoupled workloads |
41 | * and reduces their over-scheduling. Synchronous workloads will still | 41 | * and reduces their over-scheduling. Synchronous workloads will still |
42 | * have immediate wakeup/sleep latencies. | 42 | * have immediate wakeup/sleep latencies. |
43 | */ | 43 | */ |
44 | unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = | 44 | unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = |
45 | 10000000000ULL/HZ; | 45 | 10000000000ULL/HZ; |
46 | 46 | ||
47 | /* | 47 | /* |
48 | * SCHED_OTHER wake-up granularity. | 48 | * SCHED_OTHER wake-up granularity. |
49 | * (default: 1 msec, units: nanoseconds) | 49 | * (default: 1 msec, units: nanoseconds) |
50 | * | 50 | * |
51 | * This option delays the preemption effects of decoupled workloads | 51 | * This option delays the preemption effects of decoupled workloads |
52 | * and reduces their over-scheduling. Synchronous workloads will still | 52 | * and reduces their over-scheduling. Synchronous workloads will still |
53 | * have immediate wakeup/sleep latencies. | 53 | * have immediate wakeup/sleep latencies. |
54 | */ | 54 | */ |
55 | unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ; | 55 | unsigned int sysctl_sched_wakeup_granularity __read_mostly = 1000000000ULL/HZ; |
56 | 56 | ||
57 | unsigned int sysctl_sched_stat_granularity __read_mostly; | 57 | unsigned int sysctl_sched_stat_granularity __read_mostly; |
58 | 58 | ||
59 | /* | 59 | /* |
60 | * Initialized in sched_init_granularity(): | 60 | * Initialized in sched_init_granularity(): |
61 | */ | 61 | */ |
62 | unsigned int sysctl_sched_runtime_limit __read_mostly; | 62 | unsigned int sysctl_sched_runtime_limit __read_mostly; |
63 | 63 | ||
64 | /* | 64 | /* |
65 | * Debugging: various feature bits | 65 | * Debugging: various feature bits |
66 | */ | 66 | */ |
67 | enum { | 67 | enum { |
68 | SCHED_FEAT_FAIR_SLEEPERS = 1, | 68 | SCHED_FEAT_FAIR_SLEEPERS = 1, |
69 | SCHED_FEAT_SLEEPER_AVG = 2, | 69 | SCHED_FEAT_SLEEPER_AVG = 2, |
70 | SCHED_FEAT_SLEEPER_LOAD_AVG = 4, | 70 | SCHED_FEAT_SLEEPER_LOAD_AVG = 4, |
71 | SCHED_FEAT_PRECISE_CPU_LOAD = 8, | 71 | SCHED_FEAT_PRECISE_CPU_LOAD = 8, |
72 | SCHED_FEAT_START_DEBIT = 16, | 72 | SCHED_FEAT_START_DEBIT = 16, |
73 | SCHED_FEAT_SKIP_INITIAL = 32, | 73 | SCHED_FEAT_SKIP_INITIAL = 32, |
74 | }; | 74 | }; |
75 | 75 | ||
76 | unsigned int sysctl_sched_features __read_mostly = | 76 | unsigned int sysctl_sched_features __read_mostly = |
77 | SCHED_FEAT_FAIR_SLEEPERS *1 | | 77 | SCHED_FEAT_FAIR_SLEEPERS *1 | |
78 | SCHED_FEAT_SLEEPER_AVG *1 | | 78 | SCHED_FEAT_SLEEPER_AVG *1 | |
79 | SCHED_FEAT_SLEEPER_LOAD_AVG *1 | | 79 | SCHED_FEAT_SLEEPER_LOAD_AVG *1 | |
80 | SCHED_FEAT_PRECISE_CPU_LOAD *1 | | 80 | SCHED_FEAT_PRECISE_CPU_LOAD *1 | |
81 | SCHED_FEAT_START_DEBIT *1 | | 81 | SCHED_FEAT_START_DEBIT *1 | |
82 | SCHED_FEAT_SKIP_INITIAL *0; | 82 | SCHED_FEAT_SKIP_INITIAL *0; |
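The features above are plain bits in one mask, so checks elsewhere in this file are simple bitwise tests (see the SCHED_FEAT_FAIR_SLEEPERS and SCHED_FEAT_SLEEPER_LOAD_AVG tests further down). An illustrative helper, not part of the patch, showing the pattern:

    /* Illustrative only: test one debug feature bit. */
    static inline int sched_feat_enabled(unsigned int feat)
    {
            return (sysctl_sched_features & feat) != 0;
    }

    /* e.g. sched_feat_enabled(SCHED_FEAT_START_DEBIT) */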
83 | 83 | ||
84 | extern struct sched_class fair_sched_class; | 84 | extern struct sched_class fair_sched_class; |
85 | 85 | ||
86 | /************************************************************** | 86 | /************************************************************** |
87 | * CFS operations on generic schedulable entities: | 87 | * CFS operations on generic schedulable entities: |
88 | */ | 88 | */ |
89 | 89 | ||
90 | #ifdef CONFIG_FAIR_GROUP_SCHED | 90 | #ifdef CONFIG_FAIR_GROUP_SCHED |
91 | 91 | ||
92 | /* cpu runqueue to which this cfs_rq is attached */ | 92 | /* cpu runqueue to which this cfs_rq is attached */ |
93 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | 93 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) |
94 | { | 94 | { |
95 | return cfs_rq->rq; | 95 | return cfs_rq->rq; |
96 | } | 96 | } |
97 | 97 | ||
98 | /* currently running entity (if any) on this cfs_rq */ | 98 | /* currently running entity (if any) on this cfs_rq */ |
99 | static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) | 99 | static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) |
100 | { | 100 | { |
101 | return cfs_rq->curr; | 101 | return cfs_rq->curr; |
102 | } | 102 | } |
103 | 103 | ||
104 | /* An entity is a task if it doesn't "own" a runqueue */ | 104 | /* An entity is a task if it doesn't "own" a runqueue */ |
105 | #define entity_is_task(se) (!se->my_q) | 105 | #define entity_is_task(se) (!se->my_q) |
106 | 106 | ||
107 | static inline void | 107 | static inline void |
108 | set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) | 108 | set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) |
109 | { | 109 | { |
110 | cfs_rq->curr = se; | 110 | cfs_rq->curr = se; |
111 | } | 111 | } |
112 | 112 | ||
113 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 113 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
114 | 114 | ||
115 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) | 115 | static inline struct rq *rq_of(struct cfs_rq *cfs_rq) |
116 | { | 116 | { |
117 | return container_of(cfs_rq, struct rq, cfs); | 117 | return container_of(cfs_rq, struct rq, cfs); |
118 | } | 118 | } |
119 | 119 | ||
120 | static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) | 120 | static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq) |
121 | { | 121 | { |
122 | struct rq *rq = rq_of(cfs_rq); | 122 | struct rq *rq = rq_of(cfs_rq); |
123 | 123 | ||
124 | if (unlikely(rq->curr->sched_class != &fair_sched_class)) | 124 | if (unlikely(rq->curr->sched_class != &fair_sched_class)) |
125 | return NULL; | 125 | return NULL; |
126 | 126 | ||
127 | return &rq->curr->se; | 127 | return &rq->curr->se; |
128 | } | 128 | } |
129 | 129 | ||
130 | #define entity_is_task(se) 1 | 130 | #define entity_is_task(se) 1 |
131 | 131 | ||
132 | static inline void | 132 | static inline void |
133 | set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { } | 133 | set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { } |
134 | 134 | ||
135 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 135 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
136 | 136 | ||
137 | static inline struct task_struct *task_of(struct sched_entity *se) | 137 | static inline struct task_struct *task_of(struct sched_entity *se) |
138 | { | 138 | { |
139 | return container_of(se, struct task_struct, se); | 139 | return container_of(se, struct task_struct, se); |
140 | } | 140 | } |
141 | 141 | ||
142 | 142 | ||
143 | /************************************************************** | 143 | /************************************************************** |
144 | * Scheduling class tree data structure manipulation methods: | 144 | * Scheduling class tree data structure manipulation methods: |
145 | */ | 145 | */ |
146 | 146 | ||
147 | /* | 147 | /* |
148 | * Enqueue an entity into the rb-tree: | 148 | * Enqueue an entity into the rb-tree: |
149 | */ | 149 | */ |
150 | static inline void | 150 | static inline void |
151 | __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | 151 | __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) |
152 | { | 152 | { |
153 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; | 153 | struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; |
154 | struct rb_node *parent = NULL; | 154 | struct rb_node *parent = NULL; |
155 | struct sched_entity *entry; | 155 | struct sched_entity *entry; |
156 | s64 key = se->fair_key; | 156 | s64 key = se->fair_key; |
157 | int leftmost = 1; | 157 | int leftmost = 1; |
158 | 158 | ||
159 | /* | 159 | /* |
160 | * Find the right place in the rbtree: | 160 | * Find the right place in the rbtree: |
161 | */ | 161 | */ |
162 | while (*link) { | 162 | while (*link) { |
163 | parent = *link; | 163 | parent = *link; |
164 | entry = rb_entry(parent, struct sched_entity, run_node); | 164 | entry = rb_entry(parent, struct sched_entity, run_node); |
165 | /* | 165 | /* |
166 | * We don't care about collisions. Nodes with | 166 | * We don't care about collisions. Nodes with |
167 | * the same key stay together. | 167 | * the same key stay together. |
168 | */ | 168 | */ |
169 | if (key - entry->fair_key < 0) { | 169 | if (key - entry->fair_key < 0) { |
170 | link = &parent->rb_left; | 170 | link = &parent->rb_left; |
171 | } else { | 171 | } else { |
172 | link = &parent->rb_right; | 172 | link = &parent->rb_right; |
173 | leftmost = 0; | 173 | leftmost = 0; |
174 | } | 174 | } |
175 | } | 175 | } |
176 | 176 | ||
177 | /* | 177 | /* |
178 | * Maintain a cache of leftmost tree entries (it is frequently | 178 | * Maintain a cache of leftmost tree entries (it is frequently |
179 | * used): | 179 | * used): |
180 | */ | 180 | */ |
181 | if (leftmost) | 181 | if (leftmost) |
182 | cfs_rq->rb_leftmost = &se->run_node; | 182 | cfs_rq->rb_leftmost = &se->run_node; |
183 | 183 | ||
184 | rb_link_node(&se->run_node, parent, link); | 184 | rb_link_node(&se->run_node, parent, link); |
185 | rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); | 185 | rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); |
186 | update_load_add(&cfs_rq->load, se->load.weight); | 186 | update_load_add(&cfs_rq->load, se->load.weight); |
187 | cfs_rq->nr_running++; | 187 | cfs_rq->nr_running++; |
188 | se->on_rq = 1; | 188 | se->on_rq = 1; |
189 | } | 189 | } |
190 | 190 | ||
191 | static inline void | 191 | static inline void |
192 | __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | 192 | __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) |
193 | { | 193 | { |
194 | if (cfs_rq->rb_leftmost == &se->run_node) | 194 | if (cfs_rq->rb_leftmost == &se->run_node) |
195 | cfs_rq->rb_leftmost = rb_next(&se->run_node); | 195 | cfs_rq->rb_leftmost = rb_next(&se->run_node); |
196 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); | 196 | rb_erase(&se->run_node, &cfs_rq->tasks_timeline); |
197 | update_load_sub(&cfs_rq->load, se->load.weight); | 197 | update_load_sub(&cfs_rq->load, se->load.weight); |
198 | cfs_rq->nr_running--; | 198 | cfs_rq->nr_running--; |
199 | se->on_rq = 0; | 199 | se->on_rq = 0; |
200 | } | 200 | } |
201 | 201 | ||
202 | static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) | 202 | static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq) |
203 | { | 203 | { |
204 | return cfs_rq->rb_leftmost; | 204 | return cfs_rq->rb_leftmost; |
205 | } | 205 | } |
206 | 206 | ||
207 | static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) | 207 | static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) |
208 | { | 208 | { |
209 | return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); | 209 | return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node); |
210 | } | 210 | } |
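Because __enqueue_entity()/__dequeue_entity() keep rb_leftmost up to date, first_fair() and therefore __pick_next_entity() are O(1). For comparison, the uncached equivalent would walk down the tree with rb_first(); a small sketch, assuming the standard <linux/rbtree.h> API:

    #include <linux/rbtree.h>

    /* Uncached equivalent of first_fair(): O(log n) walk to the leftmost node. */
    static struct sched_entity *pick_first_uncached(struct cfs_rq *cfs_rq)
    {
            struct rb_node *left = rb_first(&cfs_rq->tasks_timeline);

            return left ? rb_entry(left, struct sched_entity, run_node) : NULL;
    }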
211 | 211 | ||
212 | /************************************************************** | 212 | /************************************************************** |
213 | * Scheduling class statistics methods: | 213 | * Scheduling class statistics methods: |
214 | */ | 214 | */ |
215 | 215 | ||
216 | /* | 216 | /* |
217 | * We rescale the rescheduling granularity of tasks according to their | 217 | * We rescale the rescheduling granularity of tasks according to their |
218 | * nice level, but only linearly, not exponentially: | 218 | * nice level, but only linearly, not exponentially: |
219 | */ | 219 | */ |
220 | static long | 220 | static long |
221 | niced_granularity(struct sched_entity *curr, unsigned long granularity) | 221 | niced_granularity(struct sched_entity *curr, unsigned long granularity) |
222 | { | 222 | { |
223 | u64 tmp; | 223 | u64 tmp; |
224 | 224 | ||
225 | /* | 225 | /* |
226 | * Negative nice levels get the same granularity as nice-0: | 226 | * Negative nice levels get the same granularity as nice-0: |
227 | */ | 227 | */ |
228 | if (likely(curr->load.weight >= NICE_0_LOAD)) | 228 | if (likely(curr->load.weight >= NICE_0_LOAD)) |
229 | return granularity; | 229 | return granularity; |
230 | /* | 230 | /* |
231 | * Positive nice level tasks get linearly finer | 231 | * Positive nice level tasks get linearly finer |
232 | * granularity: | 232 | * granularity: |
233 | */ | 233 | */ |
234 | tmp = curr->load.weight * (u64)granularity; | 234 | tmp = curr->load.weight * (u64)granularity; |
235 | 235 | ||
236 | /* | 236 | /* |
237 | * It will always fit into 'long': | 237 | * It will always fit into 'long': |
238 | */ | 238 | */ |
239 | return (long) (tmp >> NICE_0_SHIFT); | 239 | return (long) (tmp >> NICE_0_SHIFT); |
240 | } | 240 | } |
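A worked example of the rescaling, assuming the usual nice-to-weight table where NICE_0_LOAD is 1024 (NICE_0_SHIFT == 10) and a nice +5 task weighs roughly 335; both values come from tables outside this hunk and are assumptions here:

    /*
     * granularity = 2000000 ns (the 2 ms default with HZ=1000)
     *   nice  0 (weight 1024): >= NICE_0_LOAD, returned unchanged -> 2000000 ns
     *   nice +5 (weight ~335): (335 * 2000000) >> 10 ~= 654000 ns
     * so a positively reniced current task can be preempted once the
     * leftmost task leads it by only about a third of the nice-0 value.
     */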
241 | 241 | ||
242 | static inline void | 242 | static inline void |
243 | limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se) | 243 | limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se) |
244 | { | 244 | { |
245 | long limit = sysctl_sched_runtime_limit; | 245 | long limit = sysctl_sched_runtime_limit; |
246 | 246 | ||
247 | /* | 247 | /* |
248 | * Niced tasks have the same history dynamic range as | 248 | * Niced tasks have the same history dynamic range as |
249 | * non-niced tasks: | 249 | * non-niced tasks: |
250 | */ | 250 | */ |
251 | if (unlikely(se->wait_runtime > limit)) { | 251 | if (unlikely(se->wait_runtime > limit)) { |
252 | se->wait_runtime = limit; | 252 | se->wait_runtime = limit; |
253 | schedstat_inc(se, wait_runtime_overruns); | 253 | schedstat_inc(se, wait_runtime_overruns); |
254 | schedstat_inc(cfs_rq, wait_runtime_overruns); | 254 | schedstat_inc(cfs_rq, wait_runtime_overruns); |
255 | } | 255 | } |
256 | if (unlikely(se->wait_runtime < -limit)) { | 256 | if (unlikely(se->wait_runtime < -limit)) { |
257 | se->wait_runtime = -limit; | 257 | se->wait_runtime = -limit; |
258 | schedstat_inc(se, wait_runtime_underruns); | 258 | schedstat_inc(se, wait_runtime_underruns); |
259 | schedstat_inc(cfs_rq, wait_runtime_underruns); | 259 | schedstat_inc(cfs_rq, wait_runtime_underruns); |
260 | } | 260 | } |
261 | } | 261 | } |
262 | 262 | ||
263 | static inline void | 263 | static inline void |
264 | __add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) | 264 | __add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) |
265 | { | 265 | { |
266 | se->wait_runtime += delta; | 266 | se->wait_runtime += delta; |
267 | schedstat_add(se, sum_wait_runtime, delta); | 267 | schedstat_add(se, sum_wait_runtime, delta); |
268 | limit_wait_runtime(cfs_rq, se); | 268 | limit_wait_runtime(cfs_rq, se); |
269 | } | 269 | } |
270 | 270 | ||
271 | static void | 271 | static void |
272 | add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) | 272 | add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta) |
273 | { | 273 | { |
274 | schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); | 274 | schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime); |
275 | __add_wait_runtime(cfs_rq, se, delta); | 275 | __add_wait_runtime(cfs_rq, se, delta); |
276 | schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); | 276 | schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); |
277 | } | 277 | } |
278 | 278 | ||
279 | /* | 279 | /* |
280 | * Update the current task's runtime statistics. Skip current tasks that | 280 | * Update the current task's runtime statistics. Skip current tasks that |
281 | * are not in our scheduling class. | 281 | * are not in our scheduling class. |
282 | */ | 282 | */ |
283 | static inline void | 283 | static inline void |
284 | __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now) | 284 | __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now) |
285 | { | 285 | { |
286 | unsigned long delta, delta_exec, delta_fair; | 286 | unsigned long delta, delta_exec, delta_fair; |
287 | long delta_mine; | 287 | long delta_mine; |
288 | struct load_weight *lw = &cfs_rq->load; | 288 | struct load_weight *lw = &cfs_rq->load; |
289 | unsigned long load = lw->weight; | 289 | unsigned long load = lw->weight; |
290 | 290 | ||
291 | if (unlikely(!load)) | 291 | if (unlikely(!load)) |
292 | return; | 292 | return; |
293 | 293 | ||
294 | delta_exec = curr->delta_exec; | 294 | delta_exec = curr->delta_exec; |
295 | schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); | 295 | schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); |
296 | 296 | ||
297 | curr->sum_exec_runtime += delta_exec; | 297 | curr->sum_exec_runtime += delta_exec; |
298 | cfs_rq->exec_clock += delta_exec; | 298 | cfs_rq->exec_clock += delta_exec; |
299 | 299 | ||
300 | delta_fair = calc_delta_fair(delta_exec, lw); | 300 | delta_fair = calc_delta_fair(delta_exec, lw); |
301 | delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); | 301 | delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); |
302 | 302 | ||
303 | if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) { | 303 | if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) { |
304 | delta = calc_delta_mine(cfs_rq->sleeper_bonus, | 304 | delta = calc_delta_mine(cfs_rq->sleeper_bonus, |
305 | curr->load.weight, lw); | 305 | curr->load.weight, lw); |
306 | if (unlikely(delta > cfs_rq->sleeper_bonus)) | 306 | if (unlikely(delta > cfs_rq->sleeper_bonus)) |
307 | delta = cfs_rq->sleeper_bonus; | 307 | delta = cfs_rq->sleeper_bonus; |
308 | 308 | ||
309 | cfs_rq->sleeper_bonus -= delta; | 309 | cfs_rq->sleeper_bonus -= delta; |
310 | delta_mine -= delta; | 310 | delta_mine -= delta; |
311 | } | 311 | } |
312 | 312 | ||
313 | cfs_rq->fair_clock += delta_fair; | 313 | cfs_rq->fair_clock += delta_fair; |
314 | /* | 314 | /* |
315 | * We executed delta_exec amount of time on the CPU, | 315 | * We executed delta_exec amount of time on the CPU, |
316 | * but we were only entitled to delta_mine amount of | 316 | * but we were only entitled to delta_mine amount of |
317 | * time during that period (if nr_running == 1 then | 317 | * time during that period (if nr_running == 1 then |
318 | * the two values are equal) | 318 | * the two values are equal) |
319 | * [Note: delta_mine - delta_exec is negative]: | 319 | * [Note: delta_mine - delta_exec is negative]: |
320 | */ | 320 | */ |
321 | add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); | 321 | add_wait_runtime(cfs_rq, curr, delta_mine - delta_exec); |
322 | } | 322 | } |
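A worked example of the delta_exec/delta_mine split above, with calc_delta_fair()/calc_delta_mine() assumed to scale by NICE_0_LOAD/lw->weight and curr->load.weight/lw->weight respectively (they are defined outside this hunk):

    /*
     * Two runnable nice-0 tasks (cfs_rq load 2048), curr ran delta_exec = 10 ms:
     *   delta_fair ~= 10 ms * 1024/2048 = 5 ms   (fair clock advance)
     *   delta_mine ~= 10 ms * 1024/2048 = 5 ms   (curr's entitled share)
     *   add_wait_runtime(..., delta_mine - delta_exec) then subtracts ~5 ms
     *   from curr->wait_runtime: it used 10 ms of CPU but was entitled to
     *   only 5 ms, so it now owes that much to the other task.
     */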
323 | 323 | ||
324 | static void update_curr(struct cfs_rq *cfs_rq, u64 now) | 324 | static void update_curr(struct cfs_rq *cfs_rq, u64 now) |
325 | { | 325 | { |
326 | struct sched_entity *curr = cfs_rq_curr(cfs_rq); | 326 | struct sched_entity *curr = cfs_rq_curr(cfs_rq); |
327 | unsigned long delta_exec; | 327 | unsigned long delta_exec; |
328 | 328 | ||
329 | if (unlikely(!curr)) | 329 | if (unlikely(!curr)) |
330 | return; | 330 | return; |
331 | 331 | ||
332 | /* | 332 | /* |
333 | * Get the amount of time the current task was running | 333 | * Get the amount of time the current task was running |
334 | * since the last time we changed load (this cannot | 334 | * since the last time we changed load (this cannot |
335 | * overflow on 32 bits): | 335 | * overflow on 32 bits): |
336 | */ | 336 | */ |
337 | delta_exec = (unsigned long)(now - curr->exec_start); | 337 | delta_exec = (unsigned long)(now - curr->exec_start); |
338 | 338 | ||
339 | curr->delta_exec += delta_exec; | 339 | curr->delta_exec += delta_exec; |
340 | 340 | ||
341 | if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) { | 341 | if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) { |
342 | __update_curr(cfs_rq, curr, now); | 342 | __update_curr(cfs_rq, curr, now); |
343 | curr->delta_exec = 0; | 343 | curr->delta_exec = 0; |
344 | } | 344 | } |
345 | curr->exec_start = now; | 345 | curr->exec_start = now; |
346 | } | 346 | } |
347 | 347 | ||
348 | static inline void | 348 | static inline void |
349 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | 349 | update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) |
350 | { | 350 | { |
351 | se->wait_start_fair = cfs_rq->fair_clock; | 351 | se->wait_start_fair = cfs_rq->fair_clock; |
352 | schedstat_set(se->wait_start, now); | 352 | schedstat_set(se->wait_start, now); |
353 | } | 353 | } |
354 | 354 | ||
355 | /* | 355 | /* |
356 | * We calculate fair deltas here, so protect against the random effects | 356 | * We calculate fair deltas here, so protect against the random effects |
357 | * of a multiplication overflow by capping it to the runtime limit: | 357 | * of a multiplication overflow by capping it to the runtime limit: |
358 | */ | 358 | */ |
359 | #if BITS_PER_LONG == 32 | 359 | #if BITS_PER_LONG == 32 |
360 | static inline unsigned long | 360 | static inline unsigned long |
361 | calc_weighted(unsigned long delta, unsigned long weight, int shift) | 361 | calc_weighted(unsigned long delta, unsigned long weight, int shift) |
362 | { | 362 | { |
363 | u64 tmp = (u64)delta * weight >> shift; | 363 | u64 tmp = (u64)delta * weight >> shift; |
364 | 364 | ||
365 | if (unlikely(tmp > sysctl_sched_runtime_limit*2)) | 365 | if (unlikely(tmp > sysctl_sched_runtime_limit*2)) |
366 | return sysctl_sched_runtime_limit*2; | 366 | return sysctl_sched_runtime_limit*2; |
367 | return tmp; | 367 | return tmp; |
368 | } | 368 | } |
369 | #else | 369 | #else |
370 | static inline unsigned long | 370 | static inline unsigned long |
371 | calc_weighted(unsigned long delta, unsigned long weight, int shift) | 371 | calc_weighted(unsigned long delta, unsigned long weight, int shift) |
372 | { | 372 | { |
373 | return delta * weight >> shift; | 373 | return delta * weight >> shift; |
374 | } | 374 | } |
375 | #endif | 375 | #endif |
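A numeric illustration of why the 32-bit variant clamps the result; the weight used is an assumed large negative-nice value, chosen only to show the magnitude:

    /*
     * delta = 100000000 ns (100 ms), weight = 88761 (assumed), shift = 10:
     *   (u64)delta * weight >> 10 ~= 8.7e9, which does not fit in a 32-bit
     *   unsigned long; without the clamp the return value would silently
     *   truncate, so it is capped at 2*sysctl_sched_runtime_limit instead.
     */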
376 | 376 | ||
377 | /* | 377 | /* |
378 | * Task is being enqueued - update stats: | 378 | * Task is being enqueued - update stats: |
379 | */ | 379 | */ |
380 | static void | 380 | static void |
381 | update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | 381 | update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) |
382 | { | 382 | { |
383 | s64 key; | 383 | s64 key; |
384 | 384 | ||
385 | /* | 385 | /* |
386 | * Are we enqueueing a waiting task? (for current tasks | 386 | * Are we enqueueing a waiting task? (for current tasks |
387 | * a dequeue/enqueue event is a NOP) | 387 | * a dequeue/enqueue event is a NOP) |
388 | */ | 388 | */ |
389 | if (se != cfs_rq_curr(cfs_rq)) | 389 | if (se != cfs_rq_curr(cfs_rq)) |
390 | update_stats_wait_start(cfs_rq, se, now); | 390 | update_stats_wait_start(cfs_rq, se, now); |
391 | /* | 391 | /* |
392 | * Update the key: | 392 | * Update the key: |
393 | */ | 393 | */ |
394 | key = cfs_rq->fair_clock; | 394 | key = cfs_rq->fair_clock; |
395 | 395 | ||
396 | /* | 396 | /* |
397 | * Optimize the common nice 0 case: | 397 | * Optimize the common nice 0 case: |
398 | */ | 398 | */ |
399 | if (likely(se->load.weight == NICE_0_LOAD)) { | 399 | if (likely(se->load.weight == NICE_0_LOAD)) { |
400 | key -= se->wait_runtime; | 400 | key -= se->wait_runtime; |
401 | } else { | 401 | } else { |
402 | u64 tmp; | 402 | u64 tmp; |
403 | 403 | ||
404 | if (se->wait_runtime < 0) { | 404 | if (se->wait_runtime < 0) { |
405 | tmp = -se->wait_runtime; | 405 | tmp = -se->wait_runtime; |
406 | key += (tmp * se->load.inv_weight) >> | 406 | key += (tmp * se->load.inv_weight) >> |
407 | (WMULT_SHIFT - NICE_0_SHIFT); | 407 | (WMULT_SHIFT - NICE_0_SHIFT); |
408 | } else { | 408 | } else { |
409 | tmp = se->wait_runtime; | 409 | tmp = se->wait_runtime; |
410 | key -= (tmp * se->load.weight) >> NICE_0_SHIFT; | 410 | key -= (tmp * se->load.weight) >> NICE_0_SHIFT; |
411 | } | 411 | } |
412 | } | 412 | } |
413 | 413 | ||
414 | se->fair_key = key; | 414 | se->fair_key = key; |
415 | } | 415 | } |
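The key computed above decides the task's position in the timeline rbtree: positive wait_runtime (time the task is still owed) pulls the key below the current fair clock, so the task sorts further left and is picked sooner. A small nice-0 illustration:

    /*
     * cfs_rq->fair_clock = 1000 (arbitrary units):
     *   task A, wait_runtime = +300 -> key =  700  (leftmost, picked first)
     *   task B, wait_runtime =    0 -> key = 1000
     *   task C, wait_runtime = -200 -> key = 1200  (rightmost)
     */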
416 | 416 | ||
417 | /* | 417 | /* |
418 | * Note: must be called with a freshly updated rq->fair_clock. | 418 | * Note: must be called with a freshly updated rq->fair_clock. |
419 | */ | 419 | */ |
420 | static inline void | 420 | static inline void |
421 | __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | 421 | __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) |
422 | { | 422 | { |
423 | unsigned long delta_fair = se->delta_fair_run; | 423 | unsigned long delta_fair = se->delta_fair_run; |
424 | 424 | ||
425 | schedstat_set(se->wait_max, max(se->wait_max, now - se->wait_start)); | 425 | schedstat_set(se->wait_max, max(se->wait_max, now - se->wait_start)); |
426 | 426 | ||
427 | if (unlikely(se->load.weight != NICE_0_LOAD)) | 427 | if (unlikely(se->load.weight != NICE_0_LOAD)) |
428 | delta_fair = calc_weighted(delta_fair, se->load.weight, | 428 | delta_fair = calc_weighted(delta_fair, se->load.weight, |
429 | NICE_0_SHIFT); | 429 | NICE_0_SHIFT); |
430 | 430 | ||
431 | add_wait_runtime(cfs_rq, se, delta_fair); | 431 | add_wait_runtime(cfs_rq, se, delta_fair); |
432 | } | 432 | } |
433 | 433 | ||
434 | static void | 434 | static void |
435 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | 435 | update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) |
436 | { | 436 | { |
437 | unsigned long delta_fair; | 437 | unsigned long delta_fair; |
438 | 438 | ||
439 | delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), | 439 | delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), |
440 | (u64)(cfs_rq->fair_clock - se->wait_start_fair)); | 440 | (u64)(cfs_rq->fair_clock - se->wait_start_fair)); |
441 | 441 | ||
442 | se->delta_fair_run += delta_fair; | 442 | se->delta_fair_run += delta_fair; |
443 | if (unlikely(abs(se->delta_fair_run) >= | 443 | if (unlikely(abs(se->delta_fair_run) >= |
444 | sysctl_sched_stat_granularity)) { | 444 | sysctl_sched_stat_granularity)) { |
445 | __update_stats_wait_end(cfs_rq, se, now); | 445 | __update_stats_wait_end(cfs_rq, se, now); |
446 | se->delta_fair_run = 0; | 446 | se->delta_fair_run = 0; |
447 | } | 447 | } |
448 | 448 | ||
449 | se->wait_start_fair = 0; | 449 | se->wait_start_fair = 0; |
450 | schedstat_set(se->wait_start, 0); | 450 | schedstat_set(se->wait_start, 0); |
451 | } | 451 | } |
452 | 452 | ||
453 | static inline void | 453 | static inline void |
454 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | 454 | update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) |
455 | { | 455 | { |
456 | update_curr(cfs_rq, now); | 456 | update_curr(cfs_rq, now); |
457 | /* | 457 | /* |
458 | * Mark the end of the wait period if dequeueing a | 458 | * Mark the end of the wait period if dequeueing a |
459 | * waiting task: | 459 | * waiting task: |
460 | */ | 460 | */ |
461 | if (se != cfs_rq_curr(cfs_rq)) | 461 | if (se != cfs_rq_curr(cfs_rq)) |
462 | update_stats_wait_end(cfs_rq, se, now); | 462 | update_stats_wait_end(cfs_rq, se, now); |
463 | } | 463 | } |
464 | 464 | ||
465 | /* | 465 | /* |
466 | * We are picking a new current task - update its stats: | 466 | * We are picking a new current task - update its stats: |
467 | */ | 467 | */ |
468 | static inline void | 468 | static inline void |
469 | update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | 469 | update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) |
470 | { | 470 | { |
471 | /* | 471 | /* |
472 | * We are starting a new run period: | 472 | * We are starting a new run period: |
473 | */ | 473 | */ |
474 | se->exec_start = now; | 474 | se->exec_start = now; |
475 | } | 475 | } |
476 | 476 | ||
477 | /* | 477 | /* |
478 | * We are descheduling a task - update its stats: | 478 | * We are descheduling a task - update its stats: |
479 | */ | 479 | */ |
480 | static inline void | 480 | static inline void |
481 | update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | 481 | update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) |
482 | { | 482 | { |
483 | se->exec_start = 0; | 483 | se->exec_start = 0; |
484 | } | 484 | } |
485 | 485 | ||
486 | /************************************************** | 486 | /************************************************** |
487 | * Scheduling class queueing methods: | 487 | * Scheduling class queueing methods: |
488 | */ | 488 | */ |
489 | 489 | ||
490 | static void | 490 | static void |
491 | __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | 491 | __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) |
492 | { | 492 | { |
493 | unsigned long load = cfs_rq->load.weight, delta_fair; | 493 | unsigned long load = cfs_rq->load.weight, delta_fair; |
494 | long prev_runtime; | 494 | long prev_runtime; |
495 | 495 | ||
496 | if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) | 496 | if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG) |
497 | load = rq_of(cfs_rq)->cpu_load[2]; | 497 | load = rq_of(cfs_rq)->cpu_load[2]; |
498 | 498 | ||
499 | delta_fair = se->delta_fair_sleep; | 499 | delta_fair = se->delta_fair_sleep; |
500 | 500 | ||
501 | /* | 501 | /* |
502 | * Fix up delta_fair with the effect of us running | 502 | * Fix up delta_fair with the effect of us running |
503 | * during the whole sleep period: | 503 | * during the whole sleep period: |
504 | */ | 504 | */ |
505 | if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG) | 505 | if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG) |
506 | delta_fair = div64_likely32((u64)delta_fair * load, | 506 | delta_fair = div64_likely32((u64)delta_fair * load, |
507 | load + se->load.weight); | 507 | load + se->load.weight); |
508 | 508 | ||
509 | if (unlikely(se->load.weight != NICE_0_LOAD)) | 509 | if (unlikely(se->load.weight != NICE_0_LOAD)) |
510 | delta_fair = calc_weighted(delta_fair, se->load.weight, | 510 | delta_fair = calc_weighted(delta_fair, se->load.weight, |
511 | NICE_0_SHIFT); | 511 | NICE_0_SHIFT); |
512 | 512 | ||
513 | prev_runtime = se->wait_runtime; | 513 | prev_runtime = se->wait_runtime; |
514 | __add_wait_runtime(cfs_rq, se, delta_fair); | 514 | __add_wait_runtime(cfs_rq, se, delta_fair); |
515 | delta_fair = se->wait_runtime - prev_runtime; | 515 | delta_fair = se->wait_runtime - prev_runtime; |
516 | 516 | ||
517 | /* | 517 | /* |
518 | * Track the amount of bonus we've given to sleepers: | 518 | * Track the amount of bonus we've given to sleepers: |
519 | */ | 519 | */ |
520 | cfs_rq->sleeper_bonus += delta_fair; | 520 | cfs_rq->sleeper_bonus += delta_fair; |
521 | 521 | ||
522 | schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); | 522 | schedstat_add(cfs_rq, wait_runtime, se->wait_runtime); |
523 | } | 523 | } |
524 | 524 | ||
525 | static void | 525 | static void |
526 | enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | 526 | enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) |
527 | { | 527 | { |
528 | struct task_struct *tsk = task_of(se); | 528 | struct task_struct *tsk = task_of(se); |
529 | unsigned long delta_fair; | 529 | unsigned long delta_fair; |
530 | 530 | ||
531 | if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) || | 531 | if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) || |
532 | !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS)) | 532 | !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS)) |
533 | return; | 533 | return; |
534 | 534 | ||
535 | delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), | 535 | delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit), |
536 | (u64)(cfs_rq->fair_clock - se->sleep_start_fair)); | 536 | (u64)(cfs_rq->fair_clock - se->sleep_start_fair)); |
537 | 537 | ||
538 | se->delta_fair_sleep += delta_fair; | 538 | se->delta_fair_sleep += delta_fair; |
539 | if (unlikely(abs(se->delta_fair_sleep) >= | 539 | if (unlikely(abs(se->delta_fair_sleep) >= |
540 | sysctl_sched_stat_granularity)) { | 540 | sysctl_sched_stat_granularity)) { |
541 | __enqueue_sleeper(cfs_rq, se, now); | 541 | __enqueue_sleeper(cfs_rq, se, now); |
542 | se->delta_fair_sleep = 0; | 542 | se->delta_fair_sleep = 0; |
543 | } | 543 | } |
544 | 544 | ||
545 | se->sleep_start_fair = 0; | 545 | se->sleep_start_fair = 0; |
546 | 546 | ||
547 | #ifdef CONFIG_SCHEDSTATS | 547 | #ifdef CONFIG_SCHEDSTATS |
548 | if (se->sleep_start) { | 548 | if (se->sleep_start) { |
549 | u64 delta = now - se->sleep_start; | 549 | u64 delta = now - se->sleep_start; |
550 | 550 | ||
551 | if ((s64)delta < 0) | 551 | if ((s64)delta < 0) |
552 | delta = 0; | 552 | delta = 0; |
553 | 553 | ||
554 | if (unlikely(delta > se->sleep_max)) | 554 | if (unlikely(delta > se->sleep_max)) |
555 | se->sleep_max = delta; | 555 | se->sleep_max = delta; |
556 | 556 | ||
557 | se->sleep_start = 0; | 557 | se->sleep_start = 0; |
558 | se->sum_sleep_runtime += delta; | 558 | se->sum_sleep_runtime += delta; |
559 | } | 559 | } |
560 | if (se->block_start) { | 560 | if (se->block_start) { |
561 | u64 delta = now - se->block_start; | 561 | u64 delta = now - se->block_start; |
562 | 562 | ||
563 | if ((s64)delta < 0) | 563 | if ((s64)delta < 0) |
564 | delta = 0; | 564 | delta = 0; |
565 | 565 | ||
566 | if (unlikely(delta > se->block_max)) | 566 | if (unlikely(delta > se->block_max)) |
567 | se->block_max = delta; | 567 | se->block_max = delta; |
568 | 568 | ||
569 | se->block_start = 0; | 569 | se->block_start = 0; |
570 | se->sum_sleep_runtime += delta; | 570 | se->sum_sleep_runtime += delta; |
571 | } | 571 | } |
572 | #endif | 572 | #endif |
573 | } | 573 | } |
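enqueue_sleeper() credits a waking task for the fair-clock time that passed while it slept, which is where the wakeup boost for interactive tasks comes from. A rough worked example for a nice-0 task, taking load as 1024 for simplicity (the exact source of 'load' depends on SCHED_FEAT_SLEEPER_LOAD_AVG):

    /*
     * fair_clock advanced by 4 ms during the sleep:
     *   delta_fair ~= 4 ms (capped at 2*sysctl_sched_runtime_limit)
     *   with SLEEPER_AVG: scaled by load/(load + se->load.weight),
     *     e.g. 4 ms * 1024/2048 = 2 ms
     *   __add_wait_runtime() adds ~2 ms to wait_runtime (clamped by
     *   limit_wait_runtime()) and the amount actually granted is also
     *   accumulated in cfs_rq->sleeper_bonus.
     */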
574 | 574 | ||
575 | static void | 575 | static void |
576 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | 576 | enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, |
577 | int wakeup, u64 now) | 577 | int wakeup, u64 now) |
578 | { | 578 | { |
579 | /* | 579 | /* |
580 | * Update the fair clock. | 580 | * Update the fair clock. |
581 | */ | 581 | */ |
582 | update_curr(cfs_rq, now); | 582 | update_curr(cfs_rq, now); |
583 | 583 | ||
584 | if (wakeup) | 584 | if (wakeup) |
585 | enqueue_sleeper(cfs_rq, se, now); | 585 | enqueue_sleeper(cfs_rq, se, now); |
586 | 586 | ||
587 | update_stats_enqueue(cfs_rq, se, now); | 587 | update_stats_enqueue(cfs_rq, se, now); |
588 | __enqueue_entity(cfs_rq, se); | 588 | __enqueue_entity(cfs_rq, se); |
589 | } | 589 | } |
590 | 590 | ||
591 | static void | 591 | static void |
592 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | 592 | dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, |
593 | int sleep, u64 now) | 593 | int sleep, u64 now) |
594 | { | 594 | { |
595 | update_stats_dequeue(cfs_rq, se, now); | 595 | update_stats_dequeue(cfs_rq, se, now); |
596 | if (sleep) { | 596 | if (sleep) { |
597 | se->sleep_start_fair = cfs_rq->fair_clock; | 597 | se->sleep_start_fair = cfs_rq->fair_clock; |
598 | #ifdef CONFIG_SCHEDSTATS | 598 | #ifdef CONFIG_SCHEDSTATS |
599 | if (entity_is_task(se)) { | 599 | if (entity_is_task(se)) { |
600 | struct task_struct *tsk = task_of(se); | 600 | struct task_struct *tsk = task_of(se); |
601 | 601 | ||
602 | if (tsk->state & TASK_INTERRUPTIBLE) | 602 | if (tsk->state & TASK_INTERRUPTIBLE) |
603 | se->sleep_start = now; | 603 | se->sleep_start = now; |
604 | if (tsk->state & TASK_UNINTERRUPTIBLE) | 604 | if (tsk->state & TASK_UNINTERRUPTIBLE) |
605 | se->block_start = now; | 605 | se->block_start = now; |
606 | } | 606 | } |
607 | cfs_rq->wait_runtime -= se->wait_runtime; | 607 | cfs_rq->wait_runtime -= se->wait_runtime; |
608 | #endif | 608 | #endif |
609 | } | 609 | } |
610 | __dequeue_entity(cfs_rq, se); | 610 | __dequeue_entity(cfs_rq, se); |
611 | } | 611 | } |
612 | 612 | ||
613 | /* | 613 | /* |
614 | * Preempt the current task with a newly woken task if needed: | 614 | * Preempt the current task with a newly woken task if needed: |
615 | */ | 615 | */ |
616 | static void | 616 | static void |
617 | __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, | 617 | __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, |
618 | struct sched_entity *curr, unsigned long granularity) | 618 | struct sched_entity *curr, unsigned long granularity) |
619 | { | 619 | { |
620 | s64 __delta = curr->fair_key - se->fair_key; | 620 | s64 __delta = curr->fair_key - se->fair_key; |
621 | 621 | ||
622 | /* | 622 | /* |
623 | * Take scheduling granularity into account - do not | 623 | * Take scheduling granularity into account - do not |
624 | * preempt the current task unless the best task has | 624 | * preempt the current task unless the best task has |
625 | * a larger than sched_granularity fairness advantage: | 625 | * a larger than sched_granularity fairness advantage: |
626 | */ | 626 | */ |
627 | if (__delta > niced_granularity(curr, granularity)) | 627 | if (__delta > niced_granularity(curr, granularity)) |
628 | resched_task(rq_of(cfs_rq)->curr); | 628 | resched_task(rq_of(cfs_rq)->curr); |
629 | } | 629 | } |
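A worked example of the preemption test above, using the default 2 ms granularity and a nice-0 current task so that niced_granularity() returns the granularity unchanged:

    /*
     * sysctl_sched_granularity = 2 ms:
     *   curr->fair_key = 5.0 ms, se->fair_key = 2.0 ms
     *     -> __delta = 3.0 ms >  2 ms -> resched_task(), the wakee preempts
     *   curr->fair_key = 3.5 ms, se->fair_key = 2.0 ms
     *     -> __delta = 1.5 ms <= 2 ms -> curr keeps running for now
     */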
630 | 630 | ||
631 | static inline void | 631 | static inline void |
632 | set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) | 632 | set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now) |
633 | { | 633 | { |
634 | /* | 634 | /* |
635 | * Any task has to be enqueued before it gets to execute on | 635 | * Any task has to be enqueued before it gets to execute on |
636 | * a CPU. So account for the time it spent waiting on the | 636 | * a CPU. So account for the time it spent waiting on the |
637 | * runqueue. (note, here we rely on pick_next_task() having | 637 | * runqueue. (note, here we rely on pick_next_task() having |
638 | * done a put_prev_task_fair() shortly before this, which | 638 | * done a put_prev_task_fair() shortly before this, which |
639 | * updated rq->fair_clock - used by update_stats_wait_end()) | 639 | * updated rq->fair_clock - used by update_stats_wait_end()) |
640 | */ | 640 | */ |
641 | update_stats_wait_end(cfs_rq, se, now); | 641 | update_stats_wait_end(cfs_rq, se, now); |
642 | update_stats_curr_start(cfs_rq, se, now); | 642 | update_stats_curr_start(cfs_rq, se, now); |
643 | set_cfs_rq_curr(cfs_rq, se); | 643 | set_cfs_rq_curr(cfs_rq, se); |
644 | } | 644 | } |
645 | 645 | ||
646 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq, u64 now) | 646 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq, u64 now) |
647 | { | 647 | { |
648 | struct sched_entity *se = __pick_next_entity(cfs_rq); | 648 | struct sched_entity *se = __pick_next_entity(cfs_rq); |
649 | 649 | ||
650 | set_next_entity(cfs_rq, se, now); | 650 | set_next_entity(cfs_rq, se, now); |
651 | 651 | ||
652 | return se; | 652 | return se; |
653 | } | 653 | } |
654 | 654 | ||
655 | static void | 655 | static void |
656 | put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev, u64 now) | 656 | put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev, u64 now) |
657 | { | 657 | { |
658 | /* | 658 | /* |
659 | * If still on the runqueue then deactivate_task() | 659 | * If still on the runqueue then deactivate_task() |
660 | * was not called and update_curr() has to be done: | 660 | * was not called and update_curr() has to be done: |
661 | */ | 661 | */ |
662 | if (prev->on_rq) | 662 | if (prev->on_rq) |
663 | update_curr(cfs_rq, now); | 663 | update_curr(cfs_rq, now); |
664 | 664 | ||
665 | update_stats_curr_end(cfs_rq, prev, now); | 665 | update_stats_curr_end(cfs_rq, prev, now); |
666 | 666 | ||
667 | if (prev->on_rq) | 667 | if (prev->on_rq) |
668 | update_stats_wait_start(cfs_rq, prev, now); | 668 | update_stats_wait_start(cfs_rq, prev, now); |
669 | set_cfs_rq_curr(cfs_rq, NULL); | 669 | set_cfs_rq_curr(cfs_rq, NULL); |
670 | } | 670 | } |
671 | 671 | ||
672 | static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | 672 | static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) |
673 | { | 673 | { |
674 | struct rq *rq = rq_of(cfs_rq); | 674 | struct rq *rq = rq_of(cfs_rq); |
675 | struct sched_entity *next; | 675 | struct sched_entity *next; |
676 | u64 now = __rq_clock(rq); | 676 | u64 now = __rq_clock(rq); |
677 | 677 | ||
678 | /* | 678 | /* |
679 | * Dequeue and enqueue the task to update its | 679 | * Dequeue and enqueue the task to update its |
680 | * position within the tree: | 680 | * position within the tree: |
681 | */ | 681 | */ |
682 | dequeue_entity(cfs_rq, curr, 0, now); | 682 | dequeue_entity(cfs_rq, curr, 0, now); |
683 | enqueue_entity(cfs_rq, curr, 0, now); | 683 | enqueue_entity(cfs_rq, curr, 0, now); |
684 | 684 | ||
685 | /* | 685 | /* |
686 | * Reschedule if another task tops the current one. | 686 | * Reschedule if another task tops the current one. |
687 | */ | 687 | */ |
688 | next = __pick_next_entity(cfs_rq); | 688 | next = __pick_next_entity(cfs_rq); |
689 | if (next == curr) | 689 | if (next == curr) |
690 | return; | 690 | return; |
691 | 691 | ||
692 | __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity); | 692 | __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity); |
693 | } | 693 | } |
694 | 694 | ||
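entity_tick() above re-keys the running entity with a dequeue/enqueue pair and then compares it against the new leftmost entity. A minimal standalone sketch of that idea follows; the array-backed "queue", the fair_key values, and the preemption rule (reschedule only when the leftmost entity leads by more than the granularity, which is what __check_preempt_curr_fair() is assumed to do) are illustrative simplifications, not the kernel's data structures.

    /* Standalone sketch: re-key the current entity, then apply a
     * granularity-based preemption check (simplified assumption). */
    #include <stdio.h>
    #include <stdint.h>

    struct entity { const char *name; int64_t fair_key; int on_rq; };

    static struct entity *pick_leftmost(struct entity *e, int n)
    {
        struct entity *best = NULL;
        for (int i = 0; i < n; i++)
            if (e[i].on_rq && (!best || e[i].fair_key < best->fair_key))
                best = &e[i];
        return best;
    }

    int main(void)
    {
        struct entity rq[] = {
            { "curr", 100, 1 }, { "waiter", 90, 1 },
        };
        int64_t granularity = 5;    /* made-up value */
        struct entity *curr = &rq[0];

        /* the "dequeue + enqueue" pair: here just pretend the key was refreshed */
        curr->fair_key += 3;

        struct entity *next = pick_leftmost(rq, 2);
        if (next != curr && curr->fair_key - next->fair_key > granularity)
            printf("resched: %s should preempt %s\n", next->name, curr->name);
        return 0;
    }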
695 | /************************************************** | 695 | /************************************************** |
696 | * CFS operations on tasks: | 696 | * CFS operations on tasks: |
697 | */ | 697 | */ |
698 | 698 | ||
699 | #ifdef CONFIG_FAIR_GROUP_SCHED | 699 | #ifdef CONFIG_FAIR_GROUP_SCHED |
700 | 700 | ||
701 | /* Walk up scheduling entities hierarchy */ | 701 | /* Walk up scheduling entities hierarchy */ |
702 | #define for_each_sched_entity(se) \ | 702 | #define for_each_sched_entity(se) \ |
703 | for (; se; se = se->parent) | 703 | for (; se; se = se->parent) |
704 | 704 | ||
705 | static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) | 705 | static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) |
706 | { | 706 | { |
707 | return p->se.cfs_rq; | 707 | return p->se.cfs_rq; |
708 | } | 708 | } |
709 | 709 | ||
710 | /* runqueue on which this entity is (to be) queued */ | 710 | /* runqueue on which this entity is (to be) queued */ |
711 | static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) | 711 | static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) |
712 | { | 712 | { |
713 | return se->cfs_rq; | 713 | return se->cfs_rq; |
714 | } | 714 | } |
715 | 715 | ||
716 | /* runqueue "owned" by this group */ | 716 | /* runqueue "owned" by this group */ |
717 | static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | 717 | static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) |
718 | { | 718 | { |
719 | return grp->my_q; | 719 | return grp->my_q; |
720 | } | 720 | } |
721 | 721 | ||
722 | /* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on | 722 | /* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on |
723 | * another cpu ('this_cpu') | 723 | * another cpu ('this_cpu') |
724 | */ | 724 | */ |
725 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | 725 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) |
726 | { | 726 | { |
727 | /* A later patch will take group into account */ | 727 | /* A later patch will take group into account */ |
728 | return &cpu_rq(this_cpu)->cfs; | 728 | return &cpu_rq(this_cpu)->cfs; |
729 | } | 729 | } |
730 | 730 | ||
731 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ | 731 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ |
732 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 732 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
733 | list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | 733 | list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) |
734 | 734 | ||
735 | /* Do the two (enqueued) tasks belong to the same group ? */ | 735 | /* Do the two (enqueued) tasks belong to the same group ? */ |
736 | static inline int is_same_group(struct task_struct *curr, struct task_struct *p) | 736 | static inline int is_same_group(struct task_struct *curr, struct task_struct *p) |
737 | { | 737 | { |
738 | if (curr->se.cfs_rq == p->se.cfs_rq) | 738 | if (curr->se.cfs_rq == p->se.cfs_rq) |
739 | return 1; | 739 | return 1; |
740 | 740 | ||
741 | return 0; | 741 | return 0; |
742 | } | 742 | } |
743 | 743 | ||
744 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 744 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
745 | 745 | ||
746 | #define for_each_sched_entity(se) \ | 746 | #define for_each_sched_entity(se) \ |
747 | for (; se; se = NULL) | 747 | for (; se; se = NULL) |
748 | 748 | ||
749 | static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) | 749 | static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) |
750 | { | 750 | { |
751 | return &task_rq(p)->cfs; | 751 | return &task_rq(p)->cfs; |
752 | } | 752 | } |
753 | 753 | ||
754 | static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) | 754 | static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) |
755 | { | 755 | { |
756 | struct task_struct *p = task_of(se); | 756 | struct task_struct *p = task_of(se); |
757 | struct rq *rq = task_rq(p); | 757 | struct rq *rq = task_rq(p); |
758 | 758 | ||
759 | return &rq->cfs; | 759 | return &rq->cfs; |
760 | } | 760 | } |
761 | 761 | ||
762 | /* runqueue "owned" by this group */ | 762 | /* runqueue "owned" by this group */ |
763 | static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | 763 | static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) |
764 | { | 764 | { |
765 | return NULL; | 765 | return NULL; |
766 | } | 766 | } |
767 | 767 | ||
768 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | 768 | static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) |
769 | { | 769 | { |
770 | return &cpu_rq(this_cpu)->cfs; | 770 | return &cpu_rq(this_cpu)->cfs; |
771 | } | 771 | } |
772 | 772 | ||
773 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 773 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
774 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) | 774 | for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) |
775 | 775 | ||
776 | static inline int is_same_group(struct task_struct *curr, struct task_struct *p) | 776 | static inline int is_same_group(struct task_struct *curr, struct task_struct *p) |
777 | { | 777 | { |
778 | return 1; | 778 | return 1; |
779 | } | 779 | } |
780 | 780 | ||
781 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 781 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
782 | 782 | ||
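The two definitions of for_each_sched_entity() give the same loop body either a walk up the parent chain (CONFIG_FAIR_GROUP_SCHED) or exactly one iteration (the flat case). A standalone illustration of that degenerate-vs-hierarchical loop, with a made-up entity type:

    /* Standalone sketch of the two for_each_sched_entity() variants. */
    #include <stdio.h>

    struct entity { const char *name; struct entity *parent; };

    #define for_each_entity_group(se)   for (; se; se = se->parent)
    #define for_each_entity_flat(se)    for (; se; se = NULL)

    int main(void)
    {
        struct entity root = { "root", NULL };
        struct entity grp  = { "group A", &root };
        struct entity task = { "task", &grp };

        struct entity *se = &task;
        for_each_entity_group(se)       /* task -> group A -> root */
            printf("group walk: %s\n", se->name);

        se = &task;
        for_each_entity_flat(se)        /* task only */
            printf("flat walk: %s\n", se->name);
        return 0;
    }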
783 | /* | 783 | /* |
784 | * The enqueue_task method is called before nr_running is | 784 | * The enqueue_task method is called before nr_running is |
785 | * increased. Here we update the fair scheduling stats and | 785 | * increased. Here we update the fair scheduling stats and |
786 | * then put the task into the rbtree: | 786 | * then put the task into the rbtree: |
787 | */ | 787 | */ |
788 | static void | 788 | static void |
789 | enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now) | 789 | enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now) |
790 | { | 790 | { |
791 | struct cfs_rq *cfs_rq; | 791 | struct cfs_rq *cfs_rq; |
792 | struct sched_entity *se = &p->se; | 792 | struct sched_entity *se = &p->se; |
793 | 793 | ||
794 | for_each_sched_entity(se) { | 794 | for_each_sched_entity(se) { |
795 | if (se->on_rq) | 795 | if (se->on_rq) |
796 | break; | 796 | break; |
797 | cfs_rq = cfs_rq_of(se); | 797 | cfs_rq = cfs_rq_of(se); |
798 | enqueue_entity(cfs_rq, se, wakeup, now); | 798 | enqueue_entity(cfs_rq, se, wakeup, now); |
799 | } | 799 | } |
800 | } | 800 | } |
801 | 801 | ||
802 | /* | 802 | /* |
803 | * The dequeue_task method is called before nr_running is | 803 | * The dequeue_task method is called before nr_running is |
804 | * decreased. We remove the task from the rbtree and | 804 | * decreased. We remove the task from the rbtree and |
805 | * update the fair scheduling stats: | 805 | * update the fair scheduling stats: |
806 | */ | 806 | */ |
807 | static void | 807 | static void |
808 | dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now) | 808 | dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now) |
809 | { | 809 | { |
810 | struct cfs_rq *cfs_rq; | 810 | struct cfs_rq *cfs_rq; |
811 | struct sched_entity *se = &p->se; | 811 | struct sched_entity *se = &p->se; |
812 | 812 | ||
813 | for_each_sched_entity(se) { | 813 | for_each_sched_entity(se) { |
814 | cfs_rq = cfs_rq_of(se); | 814 | cfs_rq = cfs_rq_of(se); |
815 | dequeue_entity(cfs_rq, se, sleep, now); | 815 | dequeue_entity(cfs_rq, se, sleep, now); |
816 | /* Don't dequeue parent if it has other entities besides us */ | 816 | /* Don't dequeue parent if it has other entities besides us */ |
817 | if (cfs_rq->load.weight) | 817 | if (cfs_rq->load.weight) |
818 | break; | 818 | break; |
819 | } | 819 | } |
820 | } | 820 | } |
821 | 821 | ||
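Note the asymmetry in the two loops above: enqueue stops as soon as an ancestor entity is already queued, while dequeue keeps walking up only while the level it just left holds no weight from other children. A standalone sketch of those stop conditions, using invented fields (on_rq and weight live on one toy struct here, unlike the kernel's sched_entity/cfs_rq split):

    /* Standalone sketch: stop conditions of the hierarchical enqueue/dequeue. */
    #include <stdio.h>

    struct grp { int on_rq; unsigned long weight; struct grp *parent; };

    static void enqueue_up(struct grp *g)
    {
        for (; g; g = g->parent) {
            if (g->on_rq)
                break;                  /* ancestor already queued */
            g->on_rq = 1;
            g->weight += 10;
            printf("enqueued a level\n");
        }
    }

    static void dequeue_up(struct grp *g)
    {
        for (; g; g = g->parent) {
            g->weight -= 10;
            printf("dequeued a level\n");
            if (g->weight)
                break;                  /* siblings keep this level queued */
            g->on_rq = 0;
        }
    }

    int main(void)
    {
        struct grp root = { 1, 20, NULL };      /* root already carries other load */
        struct grp leaf = { 0, 0, &root };

        enqueue_up(&leaf);      /* queues the leaf, stops at the queued root */
        dequeue_up(&leaf);      /* empties the leaf, root keeps its other weight */
        return 0;
    }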
822 | /* | 822 | /* |
823 | * sched_yield() support is very simple - we dequeue and enqueue | 823 | * sched_yield() support is very simple - we dequeue and enqueue |
824 | */ | 824 | */ |
825 | static void yield_task_fair(struct rq *rq, struct task_struct *p) | 825 | static void yield_task_fair(struct rq *rq, struct task_struct *p) |
826 | { | 826 | { |
827 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 827 | struct cfs_rq *cfs_rq = task_cfs_rq(p); |
828 | u64 now = __rq_clock(rq); | 828 | u64 now = __rq_clock(rq); |
829 | 829 | ||
830 | /* | 830 | /* |
831 | * Dequeue and enqueue the task to update its | 831 | * Dequeue and enqueue the task to update its |
832 | * position within the tree: | 832 | * position within the tree: |
833 | */ | 833 | */ |
834 | dequeue_entity(cfs_rq, &p->se, 0, now); | 834 | dequeue_entity(cfs_rq, &p->se, 0, now); |
835 | enqueue_entity(cfs_rq, &p->se, 0, now); | 835 | enqueue_entity(cfs_rq, &p->se, 0, now); |
836 | } | 836 | } |
837 | 837 | ||
838 | /* | 838 | /* |
839 | * Preempt the current task with a newly woken task if needed: | 839 | * Preempt the current task with a newly woken task if needed: |
840 | */ | 840 | */ |
841 | static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) | 841 | static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) |
842 | { | 842 | { |
843 | struct task_struct *curr = rq->curr; | 843 | struct task_struct *curr = rq->curr; |
844 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 844 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
845 | unsigned long gran; | 845 | unsigned long gran; |
846 | 846 | ||
847 | if (unlikely(rt_prio(p->prio))) { | 847 | if (unlikely(rt_prio(p->prio))) { |
848 | update_curr(cfs_rq, rq_clock(rq)); | 848 | update_curr(cfs_rq, rq_clock(rq)); |
849 | resched_task(curr); | 849 | resched_task(curr); |
850 | return; | 850 | return; |
851 | } | 851 | } |
852 | 852 | ||
853 | gran = sysctl_sched_wakeup_granularity; | 853 | gran = sysctl_sched_wakeup_granularity; |
854 | /* | 854 | /* |
855 | * Batch tasks prefer throughput over latency: | 855 | * Batch tasks prefer throughput over latency: |
856 | */ | 856 | */ |
857 | if (unlikely(p->policy == SCHED_BATCH)) | 857 | if (unlikely(p->policy == SCHED_BATCH)) |
858 | gran = sysctl_sched_batch_wakeup_granularity; | 858 | gran = sysctl_sched_batch_wakeup_granularity; |
859 | 859 | ||
860 | if (is_same_group(curr, p)) | 860 | if (is_same_group(curr, p)) |
861 | __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); | 861 | __check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran); |
862 | } | 862 | } |
863 | 863 | ||
864 | static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now) | 864 | static struct task_struct *pick_next_task_fair(struct rq *rq, u64 now) |
865 | { | 865 | { |
866 | struct cfs_rq *cfs_rq = &rq->cfs; | 866 | struct cfs_rq *cfs_rq = &rq->cfs; |
867 | struct sched_entity *se; | 867 | struct sched_entity *se; |
868 | 868 | ||
869 | if (unlikely(!cfs_rq->nr_running)) | 869 | if (unlikely(!cfs_rq->nr_running)) |
870 | return NULL; | 870 | return NULL; |
871 | 871 | ||
872 | do { | 872 | do { |
873 | se = pick_next_entity(cfs_rq, now); | 873 | se = pick_next_entity(cfs_rq, now); |
874 | cfs_rq = group_cfs_rq(se); | 874 | cfs_rq = group_cfs_rq(se); |
875 | } while (cfs_rq); | 875 | } while (cfs_rq); |
876 | 876 | ||
877 | return task_of(se); | 877 | return task_of(se); |
878 | } | 878 | } |
879 | 879 | ||
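pick_next_task_fair() descends the hierarchy: it takes the leftmost entity of a queue and, if that entity is a group (group_cfs_rq() returns the group's own queue), repeats on that queue until a real task is reached. A standalone model of the descent loop, with invented node types standing in for sched_entity and cfs_rq:

    /* Standalone sketch of the pick_next_task_fair() descent loop. */
    #include <stdio.h>

    struct cfs_q;
    struct ent { const char *name; struct cfs_q *my_q; };  /* my_q set => group entity */
    struct cfs_q { struct ent *leftmost; };

    static struct ent *pick(struct cfs_q *q)
    {
        struct ent *se;
        do {
            se = q->leftmost;   /* pick_next_entity() stand-in */
            q = se->my_q;       /* group_cfs_rq() stand-in */
        } while (q);
        return se;
    }

    int main(void)
    {
        struct ent task  = { "task T", NULL };
        struct cfs_q gq  = { &task };
        struct ent grp   = { "group G", &gq };
        struct cfs_q top = { &grp };

        printf("picked: %s\n", pick(&top)->name);   /* "task T" */
        return 0;
    }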
880 | /* | 880 | /* |
881 | * Account for a descheduled task: | 881 | * Account for a descheduled task: |
882 | */ | 882 | */ |
883 | static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now) | 883 | static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now) |
884 | { | 884 | { |
885 | struct sched_entity *se = &prev->se; | 885 | struct sched_entity *se = &prev->se; |
886 | struct cfs_rq *cfs_rq; | 886 | struct cfs_rq *cfs_rq; |
887 | 887 | ||
888 | for_each_sched_entity(se) { | 888 | for_each_sched_entity(se) { |
889 | cfs_rq = cfs_rq_of(se); | 889 | cfs_rq = cfs_rq_of(se); |
890 | put_prev_entity(cfs_rq, se, now); | 890 | put_prev_entity(cfs_rq, se, now); |
891 | } | 891 | } |
892 | } | 892 | } |
893 | 893 | ||
894 | /************************************************** | 894 | /************************************************** |
895 | * Fair scheduling class load-balancing methods: | 895 | * Fair scheduling class load-balancing methods: |
896 | */ | 896 | */ |
897 | 897 | ||
898 | /* | 898 | /* |
899 | * Load-balancing iterator. Note: while the runqueue stays locked | 899 | * Load-balancing iterator. Note: while the runqueue stays locked |
900 | * during the whole iteration, the current task might be | 900 | * during the whole iteration, the current task might be |
901 | * dequeued so the iterator has to be dequeue-safe. Here we | 901 | * dequeued so the iterator has to be dequeue-safe. Here we |
902 | * achieve that by always pre-iterating before returning | 902 | * achieve that by always pre-iterating before returning |
903 | * the current task: | 903 | * the current task: |
904 | */ | 904 | */ |
905 | static inline struct task_struct * | 905 | static inline struct task_struct * |
906 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) | 906 | __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) |
907 | { | 907 | { |
908 | struct task_struct *p; | 908 | struct task_struct *p; |
909 | 909 | ||
910 | if (!curr) | 910 | if (!curr) |
911 | return NULL; | 911 | return NULL; |
912 | 912 | ||
913 | p = rb_entry(curr, struct task_struct, se.run_node); | 913 | p = rb_entry(curr, struct task_struct, se.run_node); |
914 | cfs_rq->rb_load_balance_curr = rb_next(curr); | 914 | cfs_rq->rb_load_balance_curr = rb_next(curr); |
915 | 915 | ||
916 | return p; | 916 | return p; |
917 | } | 917 | } |
918 | 918 | ||
919 | static struct task_struct *load_balance_start_fair(void *arg) | 919 | static struct task_struct *load_balance_start_fair(void *arg) |
920 | { | 920 | { |
921 | struct cfs_rq *cfs_rq = arg; | 921 | struct cfs_rq *cfs_rq = arg; |
922 | 922 | ||
923 | return __load_balance_iterator(cfs_rq, first_fair(cfs_rq)); | 923 | return __load_balance_iterator(cfs_rq, first_fair(cfs_rq)); |
924 | } | 924 | } |
925 | 925 | ||
926 | static struct task_struct *load_balance_next_fair(void *arg) | 926 | static struct task_struct *load_balance_next_fair(void *arg) |
927 | { | 927 | { |
928 | struct cfs_rq *cfs_rq = arg; | 928 | struct cfs_rq *cfs_rq = arg; |
929 | 929 | ||
930 | return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); | 930 | return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); |
931 | } | 931 | } |
932 | 932 | ||
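The comment above is the important property of these iterators: the task handed back may immediately be dequeued by the caller, so the cursor is advanced before returning. A standalone sketch of the same pattern, with the rbtree and cfs_rq bookkeeping replaced by a plain singly linked list:

    /* Standalone sketch: a dequeue-safe iterator that pre-iterates. */
    #include <stdio.h>

    struct task { const char *name; struct task *next; };
    struct iter { struct task *cursor; };

    static struct task *advance(struct iter *it)
    {
        struct task *p = it->cursor;

        if (!p)
            return NULL;
        it->cursor = p->next;   /* pre-iterate: the caller may detach p */
        return p;
    }

    int main(void)
    {
        struct task c = { "C", NULL };
        struct task b = { "B", &c };
        struct task a = { "A", &b };
        struct iter it = { &a };

        for (struct task *p = advance(&it); p; p = advance(&it)) {
            p->next = NULL;     /* "dequeue" the returned task; iteration is unharmed */
            printf("visited %s\n", p->name);
        }
        return 0;
    }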
933 | static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) | 933 | static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) |
934 | { | 934 | { |
935 | struct sched_entity *curr; | 935 | struct sched_entity *curr; |
936 | struct task_struct *p; | 936 | struct task_struct *p; |
937 | 937 | ||
938 | if (!cfs_rq->nr_running) | 938 | if (!cfs_rq->nr_running) |
939 | return MAX_PRIO; | 939 | return MAX_PRIO; |
940 | 940 | ||
941 | curr = __pick_next_entity(cfs_rq); | 941 | curr = __pick_next_entity(cfs_rq); |
942 | p = task_of(curr); | 942 | p = task_of(curr); |
943 | 943 | ||
944 | return p->prio; | 944 | return p->prio; |
945 | } | 945 | } |
946 | 946 | ||
947 | static int | 947 | static unsigned long |
948 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 948 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
949 | unsigned long max_nr_move, unsigned long max_load_move, | 949 | unsigned long max_nr_move, unsigned long max_load_move, |
950 | struct sched_domain *sd, enum cpu_idle_type idle, | 950 | struct sched_domain *sd, enum cpu_idle_type idle, |
951 | int *all_pinned, unsigned long *total_load_moved) | 951 | int *all_pinned) |
952 | { | 952 | { |
953 | struct cfs_rq *busy_cfs_rq; | 953 | struct cfs_rq *busy_cfs_rq; |
954 | unsigned long load_moved, total_nr_moved = 0, nr_moved; | 954 | unsigned long load_moved, total_nr_moved = 0, nr_moved; |
955 | long rem_load_move = max_load_move; | 955 | long rem_load_move = max_load_move; |
956 | struct rq_iterator cfs_rq_iterator; | 956 | struct rq_iterator cfs_rq_iterator; |
957 | 957 | ||
958 | cfs_rq_iterator.start = load_balance_start_fair; | 958 | cfs_rq_iterator.start = load_balance_start_fair; |
959 | cfs_rq_iterator.next = load_balance_next_fair; | 959 | cfs_rq_iterator.next = load_balance_next_fair; |
960 | 960 | ||
961 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { | 961 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { |
962 | struct cfs_rq *this_cfs_rq; | 962 | struct cfs_rq *this_cfs_rq; |
963 | long imbalance; | 963 | long imbalance; |
964 | unsigned long maxload; | 964 | unsigned long maxload; |
965 | int this_best_prio, best_prio, best_prio_seen = 0; | 965 | int this_best_prio, best_prio, best_prio_seen = 0; |
966 | 966 | ||
967 | this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); | 967 | this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); |
968 | 968 | ||
969 | imbalance = busy_cfs_rq->load.weight - | 969 | imbalance = busy_cfs_rq->load.weight - |
970 | this_cfs_rq->load.weight; | 970 | this_cfs_rq->load.weight; |
971 | /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ | 971 | /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ |
972 | if (imbalance <= 0) | 972 | if (imbalance <= 0) |
973 | continue; | 973 | continue; |
974 | 974 | ||
975 | /* Don't pull more than imbalance/2 */ | 975 | /* Don't pull more than imbalance/2 */ |
976 | imbalance /= 2; | 976 | imbalance /= 2; |
977 | maxload = min(rem_load_move, imbalance); | 977 | maxload = min(rem_load_move, imbalance); |
978 | 978 | ||
979 | this_best_prio = cfs_rq_best_prio(this_cfs_rq); | 979 | this_best_prio = cfs_rq_best_prio(this_cfs_rq); |
980 | best_prio = cfs_rq_best_prio(busy_cfs_rq); | 980 | best_prio = cfs_rq_best_prio(busy_cfs_rq); |
981 | 981 | ||
982 | /* | 982 | /* |
983 | * Enable handling of the case where there is more than one task | 983 | * Enable handling of the case where there is more than one task |
984 | * with the best priority. If the current running task is one | 984 | * with the best priority. If the current running task is one |
985 | * of those with prio==best_prio we know it won't be moved | 985 | * of those with prio==best_prio we know it won't be moved |
986 | * and therefore it's safe to override the skip (based on load) | 986 | * and therefore it's safe to override the skip (based on load) |
987 | * of any task we find with that prio. | 987 | * of any task we find with that prio. |
988 | */ | 988 | */ |
989 | if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se) | 989 | if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se) |
990 | best_prio_seen = 1; | 990 | best_prio_seen = 1; |
991 | 991 | ||
992 | /* pass busy_cfs_rq argument into | 992 | /* pass busy_cfs_rq argument into |
993 | * load_balance_[start|next]_fair iterators | 993 | * load_balance_[start|next]_fair iterators |
994 | */ | 994 | */ |
995 | cfs_rq_iterator.arg = busy_cfs_rq; | 995 | cfs_rq_iterator.arg = busy_cfs_rq; |
996 | nr_moved = balance_tasks(this_rq, this_cpu, busiest, | 996 | nr_moved = balance_tasks(this_rq, this_cpu, busiest, |
997 | max_nr_move, maxload, sd, idle, all_pinned, | 997 | max_nr_move, maxload, sd, idle, all_pinned, |
998 | &load_moved, this_best_prio, best_prio, | 998 | &load_moved, this_best_prio, best_prio, |
999 | best_prio_seen, &cfs_rq_iterator); | 999 | best_prio_seen, &cfs_rq_iterator); |
1000 | 1000 | ||
1001 | total_nr_moved += nr_moved; | 1001 | total_nr_moved += nr_moved; |
1002 | max_nr_move -= nr_moved; | 1002 | max_nr_move -= nr_moved; |
1003 | rem_load_move -= load_moved; | 1003 | rem_load_move -= load_moved; |
1004 | 1004 | ||
1005 | if (max_nr_move <= 0 || rem_load_move <= 0) | 1005 | if (max_nr_move <= 0 || rem_load_move <= 0) |
1006 | break; | 1006 | break; |
1007 | } | 1007 | } |
1008 | 1008 | ||
1009 | *total_load_moved = max_load_move - rem_load_move; | 1009 | return max_load_move - rem_load_move; |
1010 | |||
1011 | return total_nr_moved; | ||
1012 | } | 1010 | } |
1013 | 1011 | ||
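With this change load_balance_fair() reports the moved weight as its return value instead of passing a task count back through a pointer. A simplified, userspace-only model of how a caller can consume that contract; the names below are placeholders and the loop is only a loose stand-in for the simplified move_tasks(), not the kernel code:

    /* Standalone sketch of the simplified contract: each class returns the
     * amount of weighted load it moved; the caller only needs "moved or not". */
    #include <stdio.h>

    struct fake_class {
        const char *name;
        unsigned long (*load_balance)(unsigned long max_load_move);
    };

    static unsigned long fair_pull(unsigned long max_load_move)
    {
        return max_load_move >= 300 ? 300 : max_load_move;  /* weight, not a count */
    }

    static unsigned long rt_pull(unsigned long max_load_move)
    {
        (void)max_load_move;
        return 0;                       /* nothing to pull */
    }

    /* Rough stand-in for a single-purpose move_tasks(): 1 on success, 0 otherwise. */
    static int move_tasks_sketch(struct fake_class *classes, int n,
                                 unsigned long max_load_move)
    {
        unsigned long total = 0;

        for (int i = 0; i < n && total < max_load_move; i++)
            total += classes[i].load_balance(max_load_move - total);

        return total > 0;
    }

    int main(void)
    {
        struct fake_class classes[] = {
            { "fair", fair_pull }, { "rt", rt_pull },
        };
        printf("moved anything: %d\n", move_tasks_sketch(classes, 2, 500));
        return 0;
    }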
1014 | /* | 1012 | /* |
1015 | * scheduler tick hitting a task of our scheduling class: | 1013 | * scheduler tick hitting a task of our scheduling class: |
1016 | */ | 1014 | */ |
1017 | static void task_tick_fair(struct rq *rq, struct task_struct *curr) | 1015 | static void task_tick_fair(struct rq *rq, struct task_struct *curr) |
1018 | { | 1016 | { |
1019 | struct cfs_rq *cfs_rq; | 1017 | struct cfs_rq *cfs_rq; |
1020 | struct sched_entity *se = &curr->se; | 1018 | struct sched_entity *se = &curr->se; |
1021 | 1019 | ||
1022 | for_each_sched_entity(se) { | 1020 | for_each_sched_entity(se) { |
1023 | cfs_rq = cfs_rq_of(se); | 1021 | cfs_rq = cfs_rq_of(se); |
1024 | entity_tick(cfs_rq, se); | 1022 | entity_tick(cfs_rq, se); |
1025 | } | 1023 | } |
1026 | } | 1024 | } |
1027 | 1025 | ||
1028 | /* | 1026 | /* |
1029 | * Share the fairness runtime between parent and child, thus the | 1027 | * Share the fairness runtime between parent and child, thus the |
1030 | * total amount of pressure for CPU stays equal - new tasks | 1028 | * total amount of pressure for CPU stays equal - new tasks |
1031 | * get a chance to run but frequent forkers are not allowed to | 1029 | * get a chance to run but frequent forkers are not allowed to |
1032 | * monopolize the CPU. Note: the parent runqueue is locked, | 1030 | * monopolize the CPU. Note: the parent runqueue is locked, |
1033 | * the child is not running yet. | 1031 | * the child is not running yet. |
1034 | */ | 1032 | */ |
1035 | static void task_new_fair(struct rq *rq, struct task_struct *p, u64 now) | 1033 | static void task_new_fair(struct rq *rq, struct task_struct *p, u64 now) |
1036 | { | 1034 | { |
1037 | struct cfs_rq *cfs_rq = task_cfs_rq(p); | 1035 | struct cfs_rq *cfs_rq = task_cfs_rq(p); |
1038 | struct sched_entity *se = &p->se; | 1036 | struct sched_entity *se = &p->se; |
1039 | 1037 | ||
1040 | sched_info_queued(p); | 1038 | sched_info_queued(p); |
1041 | 1039 | ||
1042 | update_stats_enqueue(cfs_rq, se, now); | 1040 | update_stats_enqueue(cfs_rq, se, now); |
1043 | /* | 1041 | /* |
1044 | * Child runs first: we let it run before the parent | 1042 | * Child runs first: we let it run before the parent |
1045 | * until it reschedules once. We set up the key so that | 1043 | * until it reschedules once. We set up the key so that |
1046 | * it will preempt the parent: | 1044 | * it will preempt the parent: |
1047 | */ | 1045 | */ |
1048 | p->se.fair_key = current->se.fair_key - | 1046 | p->se.fair_key = current->se.fair_key - |
1049 | niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1; | 1047 | niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1; |
1050 | /* | 1048 | /* |
1051 | * The first wait is dominated by the child-runs-first logic, | 1049 | * The first wait is dominated by the child-runs-first logic, |
1052 | * so do not credit it with that waiting time yet: | 1050 | * so do not credit it with that waiting time yet: |
1053 | */ | 1051 | */ |
1054 | if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) | 1052 | if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL) |
1055 | p->se.wait_start_fair = 0; | 1053 | p->se.wait_start_fair = 0; |
1056 | 1054 | ||
1057 | /* | 1055 | /* |
1058 | * The statistical average of wait_runtime is about | 1056 | * The statistical average of wait_runtime is about |
1059 | * -granularity/2, so initialize the task with that: | 1057 | * -granularity/2, so initialize the task with that: |
1060 | */ | 1058 | */ |
1061 | if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) | 1059 | if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) |
1062 | p->se.wait_runtime = -(sysctl_sched_granularity / 2); | 1060 | p->se.wait_runtime = -(sysctl_sched_granularity / 2); |
1063 | 1061 | ||
1064 | __enqueue_entity(cfs_rq, se); | 1062 | __enqueue_entity(cfs_rq, se); |
1065 | } | 1063 | } |
1066 | 1064 | ||
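The fork path biases the child's key so it runs before the parent and, with START_DEBIT, starts the child with a small negative wait_runtime. The arithmetic in isolation, with made-up numbers and the niced_granularity() adjustment approximated by the raw granularity:

    /* Standalone sketch of the child-runs-first key bias and start debit. */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        int64_t sched_granularity = 10000000;   /* made-up value, in ns */
        int64_t parent_key = 500000000;

        /* Child is placed just ahead of the parent so it preempts it: */
        int64_t child_key = parent_key - sched_granularity - 1;

        /* START_DEBIT: begin with roughly the average wait_runtime: */
        int64_t child_wait_runtime = -(sched_granularity / 2);

        printf("parent key %lld, child key %lld, child debit %lld\n",
               (long long)parent_key, (long long)child_key,
               (long long)child_wait_runtime);
        return 0;
    }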
1067 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1065 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1068 | /* Account for a task changing its policy or group. | 1066 | /* Account for a task changing its policy or group. |
1069 | * | 1067 | * |
1070 | * This routine is mostly called to set cfs_rq->curr field when a task | 1068 | * This routine is mostly called to set cfs_rq->curr field when a task |
1071 | * migrates between groups/classes. | 1069 | * migrates between groups/classes. |
1072 | */ | 1070 | */ |
1073 | static void set_curr_task_fair(struct rq *rq) | 1071 | static void set_curr_task_fair(struct rq *rq) |
1074 | { | 1072 | { |
1075 | struct task_struct *curr = rq->curr; | 1073 | struct task_struct *curr = rq->curr; |
1076 | struct sched_entity *se = &curr->se; | 1074 | struct sched_entity *se = &curr->se; |
1077 | u64 now = rq_clock(rq); | 1075 | u64 now = rq_clock(rq); |
1078 | struct cfs_rq *cfs_rq; | 1076 | struct cfs_rq *cfs_rq; |
1079 | 1077 | ||
1080 | for_each_sched_entity(se) { | 1078 | for_each_sched_entity(se) { |
1081 | cfs_rq = cfs_rq_of(se); | 1079 | cfs_rq = cfs_rq_of(se); |
1082 | set_next_entity(cfs_rq, se, now); | 1080 | set_next_entity(cfs_rq, se, now); |
1083 | } | 1081 | } |
1084 | } | 1082 | } |
1085 | #else | 1083 | #else |
1086 | static void set_curr_task_fair(struct rq *rq) | 1084 | static void set_curr_task_fair(struct rq *rq) |
1087 | { | 1085 | { |
1088 | } | 1086 | } |
1089 | #endif | 1087 | #endif |
1090 | 1088 | ||
1091 | /* | 1089 | /* |
1092 | * All the scheduling class methods: | 1090 | * All the scheduling class methods: |
1093 | */ | 1091 | */ |
1094 | struct sched_class fair_sched_class __read_mostly = { | 1092 | struct sched_class fair_sched_class __read_mostly = { |
1095 | .enqueue_task = enqueue_task_fair, | 1093 | .enqueue_task = enqueue_task_fair, |
1096 | .dequeue_task = dequeue_task_fair, | 1094 | .dequeue_task = dequeue_task_fair, |
1097 | .yield_task = yield_task_fair, | 1095 | .yield_task = yield_task_fair, |
1098 | 1096 | ||
1099 | .check_preempt_curr = check_preempt_curr_fair, | 1097 | .check_preempt_curr = check_preempt_curr_fair, |
1100 | 1098 | ||
1101 | .pick_next_task = pick_next_task_fair, | 1099 | .pick_next_task = pick_next_task_fair, |
1102 | .put_prev_task = put_prev_task_fair, | 1100 | .put_prev_task = put_prev_task_fair, |
1103 | 1101 | ||
1104 | .load_balance = load_balance_fair, | 1102 | .load_balance = load_balance_fair, |
1105 | 1103 | ||
1106 | .set_curr_task = set_curr_task_fair, | 1104 | .set_curr_task = set_curr_task_fair, |
1107 | .task_tick = task_tick_fair, | 1105 | .task_tick = task_tick_fair, |
1108 | .task_new = task_new_fair, | 1106 | .task_new = task_new_fair, |
1109 | }; | 1107 | }; |
1110 | 1108 | ||
1111 | #ifdef CONFIG_SCHED_DEBUG | 1109 | #ifdef CONFIG_SCHED_DEBUG |
1112 | void print_cfs_stats(struct seq_file *m, int cpu, u64 now) | 1110 | void print_cfs_stats(struct seq_file *m, int cpu, u64 now) |
1113 | { | 1111 | { |
1114 | struct rq *rq = cpu_rq(cpu); | 1112 | struct rq *rq = cpu_rq(cpu); |
1115 | struct cfs_rq *cfs_rq; | 1113 | struct cfs_rq *cfs_rq; |
1116 | 1114 | ||
1117 | for_each_leaf_cfs_rq(rq, cfs_rq) | 1115 | for_each_leaf_cfs_rq(rq, cfs_rq) |
1118 | print_cfs_rq(m, cpu, cfs_rq, now); | 1116 | print_cfs_rq(m, cpu, cfs_rq, now); |
1119 | } | 1117 | } |
1120 | #endif | 1118 | #endif |
1121 | 1119 |
kernel/sched_idletask.c
1 | /* | 1 | /* |
2 | * idle-task scheduling class. | 2 | * idle-task scheduling class. |
3 | * | 3 | * |
4 | * (NOTE: these are not related to SCHED_IDLE tasks which are | 4 | * (NOTE: these are not related to SCHED_IDLE tasks which are |
5 | * handled in sched_fair.c) | 5 | * handled in sched_fair.c) |
6 | */ | 6 | */ |
7 | 7 | ||
8 | /* | 8 | /* |
9 | * Idle tasks are unconditionally rescheduled: | 9 | * Idle tasks are unconditionally rescheduled: |
10 | */ | 10 | */ |
11 | static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) | 11 | static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) |
12 | { | 12 | { |
13 | resched_task(rq->idle); | 13 | resched_task(rq->idle); |
14 | } | 14 | } |
15 | 15 | ||
16 | static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now) | 16 | static struct task_struct *pick_next_task_idle(struct rq *rq, u64 now) |
17 | { | 17 | { |
18 | schedstat_inc(rq, sched_goidle); | 18 | schedstat_inc(rq, sched_goidle); |
19 | 19 | ||
20 | return rq->idle; | 20 | return rq->idle; |
21 | } | 21 | } |
22 | 22 | ||
23 | /* | 23 | /* |
24 | * It is not legal to sleep in the idle task - print a warning | 24 | * It is not legal to sleep in the idle task - print a warning |
25 | * message if some code attempts to do it: | 25 | * message if some code attempts to do it: |
26 | */ | 26 | */ |
27 | static void | 27 | static void |
28 | dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now) | 28 | dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep, u64 now) |
29 | { | 29 | { |
30 | spin_unlock_irq(&rq->lock); | 30 | spin_unlock_irq(&rq->lock); |
31 | printk(KERN_ERR "bad: scheduling from the idle thread!\n"); | 31 | printk(KERN_ERR "bad: scheduling from the idle thread!\n"); |
32 | dump_stack(); | 32 | dump_stack(); |
33 | spin_lock_irq(&rq->lock); | 33 | spin_lock_irq(&rq->lock); |
34 | } | 34 | } |
35 | 35 | ||
36 | static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, u64 now) | 36 | static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, u64 now) |
37 | { | 37 | { |
38 | } | 38 | } |
39 | 39 | ||
40 | static int | 40 | static unsigned long |
41 | load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, | 41 | load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, |
42 | unsigned long max_nr_move, unsigned long max_load_move, | 42 | unsigned long max_nr_move, unsigned long max_load_move, |
43 | struct sched_domain *sd, enum cpu_idle_type idle, | 43 | struct sched_domain *sd, enum cpu_idle_type idle, |
44 | int *all_pinned, unsigned long *total_load_moved) | 44 | int *all_pinned) |
45 | { | 45 | { |
46 | return 0; | 46 | return 0; |
47 | } | 47 | } |
48 | 48 | ||
49 | static void task_tick_idle(struct rq *rq, struct task_struct *curr) | 49 | static void task_tick_idle(struct rq *rq, struct task_struct *curr) |
50 | { | 50 | { |
51 | } | 51 | } |
52 | 52 | ||
53 | /* | 53 | /* |
54 | * Simple, special scheduling class for the per-CPU idle tasks: | 54 | * Simple, special scheduling class for the per-CPU idle tasks: |
55 | */ | 55 | */ |
56 | static struct sched_class idle_sched_class __read_mostly = { | 56 | static struct sched_class idle_sched_class __read_mostly = { |
57 | /* no enqueue/yield_task for idle tasks */ | 57 | /* no enqueue/yield_task for idle tasks */ |
58 | 58 | ||
59 | /* dequeue is not valid, we print a debug message there: */ | 59 | /* dequeue is not valid, we print a debug message there: */ |
60 | .dequeue_task = dequeue_task_idle, | 60 | .dequeue_task = dequeue_task_idle, |
61 | 61 | ||
62 | .check_preempt_curr = check_preempt_curr_idle, | 62 | .check_preempt_curr = check_preempt_curr_idle, |
63 | 63 | ||
64 | .pick_next_task = pick_next_task_idle, | 64 | .pick_next_task = pick_next_task_idle, |
65 | .put_prev_task = put_prev_task_idle, | 65 | .put_prev_task = put_prev_task_idle, |
66 | 66 | ||
67 | .load_balance = load_balance_idle, | 67 | .load_balance = load_balance_idle, |
68 | 68 | ||
69 | .task_tick = task_tick_idle, | 69 | .task_tick = task_tick_idle, |
70 | /* no .task_new for idle tasks */ | 70 | /* no .task_new for idle tasks */ |
71 | }; | 71 | }; |
72 | 72 |
kernel/sched_rt.c
1 | /* | 1 | /* |
2 | * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR | 2 | * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR |
3 | * policies) | 3 | * policies) |
4 | */ | 4 | */ |
5 | 5 | ||
6 | /* | 6 | /* |
7 | * Update the current task's runtime statistics. Skip current tasks that | 7 | * Update the current task's runtime statistics. Skip current tasks that |
8 | * are not in our scheduling class. | 8 | * are not in our scheduling class. |
9 | */ | 9 | */ |
10 | static inline void update_curr_rt(struct rq *rq, u64 now) | 10 | static inline void update_curr_rt(struct rq *rq, u64 now) |
11 | { | 11 | { |
12 | struct task_struct *curr = rq->curr; | 12 | struct task_struct *curr = rq->curr; |
13 | u64 delta_exec; | 13 | u64 delta_exec; |
14 | 14 | ||
15 | if (!task_has_rt_policy(curr)) | 15 | if (!task_has_rt_policy(curr)) |
16 | return; | 16 | return; |
17 | 17 | ||
18 | delta_exec = now - curr->se.exec_start; | 18 | delta_exec = now - curr->se.exec_start; |
19 | if (unlikely((s64)delta_exec < 0)) | 19 | if (unlikely((s64)delta_exec < 0)) |
20 | delta_exec = 0; | 20 | delta_exec = 0; |
21 | 21 | ||
22 | schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); | 22 | schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); |
23 | 23 | ||
24 | curr->se.sum_exec_runtime += delta_exec; | 24 | curr->se.sum_exec_runtime += delta_exec; |
25 | curr->se.exec_start = now; | 25 | curr->se.exec_start = now; |
26 | } | 26 | } |
27 | 27 | ||
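The accounting in update_curr_rt() is: take the delta since exec_start, clamp a negative delta (a clock that appears to run backwards) to zero, add it to sum_exec_runtime and restart the window. The same arithmetic as a standalone sketch with plain integer types:

    /* Standalone sketch of the exec-time accounting in update_curr_rt(). */
    #include <stdio.h>
    #include <stdint.h>

    struct acct { uint64_t exec_start; uint64_t sum_exec_runtime; };

    static void account(struct acct *a, uint64_t now)
    {
        uint64_t delta = now - a->exec_start;

        if ((int64_t)delta < 0)         /* clock went backwards: ignore */
            delta = 0;

        a->sum_exec_runtime += delta;
        a->exec_start = now;            /* start the next accounting window */
    }

    int main(void)
    {
        struct acct a = { .exec_start = 1000, .sum_exec_runtime = 0 };

        account(&a, 1500);              /* +500 */
        account(&a, 1400);              /* backwards: +0 */
        printf("sum_exec_runtime = %llu\n",
               (unsigned long long)a.sum_exec_runtime);
        return 0;
    }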
28 | static void | 28 | static void |
29 | enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now) | 29 | enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now) |
30 | { | 30 | { |
31 | struct rt_prio_array *array = &rq->rt.active; | 31 | struct rt_prio_array *array = &rq->rt.active; |
32 | 32 | ||
33 | list_add_tail(&p->run_list, array->queue + p->prio); | 33 | list_add_tail(&p->run_list, array->queue + p->prio); |
34 | __set_bit(p->prio, array->bitmap); | 34 | __set_bit(p->prio, array->bitmap); |
35 | } | 35 | } |
36 | 36 | ||
37 | /* | 37 | /* |
38 | * Adding/removing a task to/from a priority array: | 38 | * Adding/removing a task to/from a priority array: |
39 | */ | 39 | */ |
40 | static void | 40 | static void |
41 | dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep, u64 now) | 41 | dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep, u64 now) |
42 | { | 42 | { |
43 | struct rt_prio_array *array = &rq->rt.active; | 43 | struct rt_prio_array *array = &rq->rt.active; |
44 | 44 | ||
45 | update_curr_rt(rq, now); | 45 | update_curr_rt(rq, now); |
46 | 46 | ||
47 | list_del(&p->run_list); | 47 | list_del(&p->run_list); |
48 | if (list_empty(array->queue + p->prio)) | 48 | if (list_empty(array->queue + p->prio)) |
49 | __clear_bit(p->prio, array->bitmap); | 49 | __clear_bit(p->prio, array->bitmap); |
50 | } | 50 | } |
51 | 51 | ||
52 | /* | 52 | /* |
53 | * Put task to the end of the run list without the overhead of dequeue | 53 | * Put task to the end of the run list without the overhead of dequeue |
54 | * followed by enqueue. | 54 | * followed by enqueue. |
55 | */ | 55 | */ |
56 | static void requeue_task_rt(struct rq *rq, struct task_struct *p) | 56 | static void requeue_task_rt(struct rq *rq, struct task_struct *p) |
57 | { | 57 | { |
58 | struct rt_prio_array *array = &rq->rt.active; | 58 | struct rt_prio_array *array = &rq->rt.active; |
59 | 59 | ||
60 | list_move_tail(&p->run_list, array->queue + p->prio); | 60 | list_move_tail(&p->run_list, array->queue + p->prio); |
61 | } | 61 | } |
62 | 62 | ||
63 | static void | 63 | static void |
64 | yield_task_rt(struct rq *rq, struct task_struct *p) | 64 | yield_task_rt(struct rq *rq, struct task_struct *p) |
65 | { | 65 | { |
66 | requeue_task_rt(rq, p); | 66 | requeue_task_rt(rq, p); |
67 | } | 67 | } |
68 | 68 | ||
69 | /* | 69 | /* |
70 | * Preempt the current task with a newly woken task if needed: | 70 | * Preempt the current task with a newly woken task if needed: |
71 | */ | 71 | */ |
72 | static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) | 72 | static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) |
73 | { | 73 | { |
74 | if (p->prio < rq->curr->prio) | 74 | if (p->prio < rq->curr->prio) |
75 | resched_task(rq->curr); | 75 | resched_task(rq->curr); |
76 | } | 76 | } |
77 | 77 | ||
78 | static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now) | 78 | static struct task_struct *pick_next_task_rt(struct rq *rq, u64 now) |
79 | { | 79 | { |
80 | struct rt_prio_array *array = &rq->rt.active; | 80 | struct rt_prio_array *array = &rq->rt.active; |
81 | struct task_struct *next; | 81 | struct task_struct *next; |
82 | struct list_head *queue; | 82 | struct list_head *queue; |
83 | int idx; | 83 | int idx; |
84 | 84 | ||
85 | idx = sched_find_first_bit(array->bitmap); | 85 | idx = sched_find_first_bit(array->bitmap); |
86 | if (idx >= MAX_RT_PRIO) | 86 | if (idx >= MAX_RT_PRIO) |
87 | return NULL; | 87 | return NULL; |
88 | 88 | ||
89 | queue = array->queue + idx; | 89 | queue = array->queue + idx; |
90 | next = list_entry(queue->next, struct task_struct, run_list); | 90 | next = list_entry(queue->next, struct task_struct, run_list); |
91 | 91 | ||
92 | next->se.exec_start = now; | 92 | next->se.exec_start = now; |
93 | 93 | ||
94 | return next; | 94 | return next; |
95 | } | 95 | } |
96 | 96 | ||
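Picking the next RT task is a bitmap scan: the lowest set bit is the highest priority with runnable tasks, and the head of that queue runs next. A standalone sketch with a small bitmap and plain arrays; MAX_RT_PRIO, the list machinery and sched_find_first_bit() are simplified away:

    /* Standalone sketch of the priority-bitmap pick in pick_next_task_rt(). */
    #include <stdio.h>

    #define NPRIO 8

    int main(void)
    {
        const char *queue_head[NPRIO] = { 0 };  /* head task name per priority */
        unsigned int bitmap = 0;

        /* enqueue two tasks: prio 5 and prio 2 (lower number = higher prio) */
        queue_head[5] = "batch worker";  bitmap |= 1u << 5;
        queue_head[2] = "irq thread";    bitmap |= 1u << 2;

        /* sched_find_first_bit() stand-in: find the lowest set bit */
        int idx;
        for (idx = 0; idx < NPRIO; idx++)
            if (bitmap & (1u << idx))
                break;

        if (idx < NPRIO)
            printf("next RT task: %s (prio %d)\n", queue_head[idx], idx);
        else
            printf("no RT task runnable\n");
        return 0;
    }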
97 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now) | 97 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now) |
98 | { | 98 | { |
99 | update_curr_rt(rq, now); | 99 | update_curr_rt(rq, now); |
100 | p->se.exec_start = 0; | 100 | p->se.exec_start = 0; |
101 | } | 101 | } |
102 | 102 | ||
103 | /* | 103 | /* |
104 | * Load-balancing iterator. Note: while the runqueue stays locked | 104 | * Load-balancing iterator. Note: while the runqueue stays locked |
105 | * during the whole iteration, the current task might be | 105 | * during the whole iteration, the current task might be |
106 | * dequeued so the iterator has to be dequeue-safe. Here we | 106 | * dequeued so the iterator has to be dequeue-safe. Here we |
107 | * achieve that by always pre-iterating before returning | 107 | * achieve that by always pre-iterating before returning |
108 | * the current task: | 108 | * the current task: |
109 | */ | 109 | */ |
110 | static struct task_struct *load_balance_start_rt(void *arg) | 110 | static struct task_struct *load_balance_start_rt(void *arg) |
111 | { | 111 | { |
112 | struct rq *rq = arg; | 112 | struct rq *rq = arg; |
113 | struct rt_prio_array *array = &rq->rt.active; | 113 | struct rt_prio_array *array = &rq->rt.active; |
114 | struct list_head *head, *curr; | 114 | struct list_head *head, *curr; |
115 | struct task_struct *p; | 115 | struct task_struct *p; |
116 | int idx; | 116 | int idx; |
117 | 117 | ||
118 | idx = sched_find_first_bit(array->bitmap); | 118 | idx = sched_find_first_bit(array->bitmap); |
119 | if (idx >= MAX_RT_PRIO) | 119 | if (idx >= MAX_RT_PRIO) |
120 | return NULL; | 120 | return NULL; |
121 | 121 | ||
122 | head = array->queue + idx; | 122 | head = array->queue + idx; |
123 | curr = head->prev; | 123 | curr = head->prev; |
124 | 124 | ||
125 | p = list_entry(curr, struct task_struct, run_list); | 125 | p = list_entry(curr, struct task_struct, run_list); |
126 | 126 | ||
127 | curr = curr->prev; | 127 | curr = curr->prev; |
128 | 128 | ||
129 | rq->rt.rt_load_balance_idx = idx; | 129 | rq->rt.rt_load_balance_idx = idx; |
130 | rq->rt.rt_load_balance_head = head; | 130 | rq->rt.rt_load_balance_head = head; |
131 | rq->rt.rt_load_balance_curr = curr; | 131 | rq->rt.rt_load_balance_curr = curr; |
132 | 132 | ||
133 | return p; | 133 | return p; |
134 | } | 134 | } |
135 | 135 | ||
136 | static struct task_struct *load_balance_next_rt(void *arg) | 136 | static struct task_struct *load_balance_next_rt(void *arg) |
137 | { | 137 | { |
138 | struct rq *rq = arg; | 138 | struct rq *rq = arg; |
139 | struct rt_prio_array *array = &rq->rt.active; | 139 | struct rt_prio_array *array = &rq->rt.active; |
140 | struct list_head *head, *curr; | 140 | struct list_head *head, *curr; |
141 | struct task_struct *p; | 141 | struct task_struct *p; |
142 | int idx; | 142 | int idx; |
143 | 143 | ||
144 | idx = rq->rt.rt_load_balance_idx; | 144 | idx = rq->rt.rt_load_balance_idx; |
145 | head = rq->rt.rt_load_balance_head; | 145 | head = rq->rt.rt_load_balance_head; |
146 | curr = rq->rt.rt_load_balance_curr; | 146 | curr = rq->rt.rt_load_balance_curr; |
147 | 147 | ||
148 | /* | 148 | /* |
149 | * If we arrived back to the head again then | 149 | * If we arrived back to the head again then |
150 | * iterate to the next queue (if any): | 150 | * iterate to the next queue (if any): |
151 | */ | 151 | */ |
152 | if (unlikely(head == curr)) { | 152 | if (unlikely(head == curr)) { |
153 | int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); | 153 | int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); |
154 | 154 | ||
155 | if (next_idx >= MAX_RT_PRIO) | 155 | if (next_idx >= MAX_RT_PRIO) |
156 | return NULL; | 156 | return NULL; |
157 | 157 | ||
158 | idx = next_idx; | 158 | idx = next_idx; |
159 | head = array->queue + idx; | 159 | head = array->queue + idx; |
160 | curr = head->prev; | 160 | curr = head->prev; |
161 | 161 | ||
162 | rq->rt.rt_load_balance_idx = idx; | 162 | rq->rt.rt_load_balance_idx = idx; |
163 | rq->rt.rt_load_balance_head = head; | 163 | rq->rt.rt_load_balance_head = head; |
164 | } | 164 | } |
165 | 165 | ||
166 | p = list_entry(curr, struct task_struct, run_list); | 166 | p = list_entry(curr, struct task_struct, run_list); |
167 | 167 | ||
168 | curr = curr->prev; | 168 | curr = curr->prev; |
169 | 169 | ||
170 | rq->rt.rt_load_balance_curr = curr; | 170 | rq->rt.rt_load_balance_curr = curr; |
171 | 171 | ||
172 | return p; | 172 | return p; |
173 | } | 173 | } |
174 | 174 | ||
175 | static int | 175 | static unsigned long |
176 | load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, | 176 | load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, |
177 | unsigned long max_nr_move, unsigned long max_load_move, | 177 | unsigned long max_nr_move, unsigned long max_load_move, |
178 | struct sched_domain *sd, enum cpu_idle_type idle, | 178 | struct sched_domain *sd, enum cpu_idle_type idle, |
179 | int *all_pinned, unsigned long *load_moved) | 179 | int *all_pinned) |
180 | { | 180 | { |
181 | int this_best_prio, best_prio, best_prio_seen = 0; | 181 | int this_best_prio, best_prio, best_prio_seen = 0; |
182 | int nr_moved; | 182 | int nr_moved; |
183 | struct rq_iterator rt_rq_iterator; | 183 | struct rq_iterator rt_rq_iterator; |
| | 184 | unsigned long load_moved;
184 | 185 | ||
185 | best_prio = sched_find_first_bit(busiest->rt.active.bitmap); | 186 | best_prio = sched_find_first_bit(busiest->rt.active.bitmap); |
186 | this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap); | 187 | this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap); |
187 | 188 | ||
188 | /* | 189 | /* |
189 | * Enable handling of the case where there is more than one task | 190 | * Enable handling of the case where there is more than one task |
190 | * with the best priority. If the current running task is one | 191 | * with the best priority. If the current running task is one |
191 | * of those with prio==best_prio we know it won't be moved | 192 | * of those with prio==best_prio we know it won't be moved |
192 | * and therefore it's safe to override the skip (based on load) | 193 | * and therefore it's safe to override the skip (based on load) |
193 | * of any task we find with that prio. | 194 | * of any task we find with that prio. |
194 | */ | 195 | */ |
195 | if (busiest->curr->prio == best_prio) | 196 | if (busiest->curr->prio == best_prio) |
196 | best_prio_seen = 1; | 197 | best_prio_seen = 1; |
197 | 198 | ||
198 | rt_rq_iterator.start = load_balance_start_rt; | 199 | rt_rq_iterator.start = load_balance_start_rt; |
199 | rt_rq_iterator.next = load_balance_next_rt; | 200 | rt_rq_iterator.next = load_balance_next_rt; |
200 | /* pass 'busiest' rq argument into | 201 | /* pass 'busiest' rq argument into |
201 | * load_balance_[start|next]_rt iterators | 202 | * load_balance_[start|next]_rt iterators |
202 | */ | 203 | */ |
203 | rt_rq_iterator.arg = busiest; | 204 | rt_rq_iterator.arg = busiest; |
204 | 205 | ||
205 | nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move, | 206 | nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move, |
206 | max_load_move, sd, idle, all_pinned, load_moved, | 207 | max_load_move, sd, idle, all_pinned, &load_moved, |
207 | this_best_prio, best_prio, best_prio_seen, | 208 | this_best_prio, best_prio, best_prio_seen, |
208 | &rt_rq_iterator); | 209 | &rt_rq_iterator); |
209 | 210 | ||
210 | return nr_moved; | 211 | return load_moved; |
211 | } | 212 | } |
212 | 213 | ||
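Both classes package their runqueue walk as a start/next pair plus an opaque argument, and balance_tasks() consumes that pair without knowing the underlying structure. A standalone sketch of that shape; the field names echo rq_iterator, everything else is invented and the consumer is only a loose stand-in for balance_tasks():

    /* Standalone sketch of the start/next iterator shape fed to balance_tasks(). */
    #include <stdio.h>

    struct it_ops {
        void *arg;
        const char *(*start)(void *arg);
        const char *(*next)(void *arg);
    };

    struct names { const char **v; int pos; };

    static const char *names_next(void *arg)
    {
        struct names *n = arg;
        return n->v[n->pos] ? n->v[n->pos++] : NULL;
    }

    static const char *names_start(void *arg)
    {
        struct names *n = arg;
        n->pos = 0;
        return names_next(arg);
    }

    /* Generic consumer: walks whatever the class hands it and "moves" it all. */
    static int consume_all(struct it_ops *it)
    {
        int moved = 0;
        for (const char *p = it->start(it->arg); p; p = it->next(it->arg)) {
            printf("moving %s\n", p);
            moved++;
        }
        return moved;
    }

    int main(void)
    {
        const char *tasks[] = { "rt task A", "rt task B", NULL };
        struct names n = { tasks, 0 };
        struct it_ops it = { &n, names_start, names_next };

        printf("moved %d tasks\n", consume_all(&it));
        return 0;
    }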
213 | static void task_tick_rt(struct rq *rq, struct task_struct *p) | 214 | static void task_tick_rt(struct rq *rq, struct task_struct *p) |
214 | { | 215 | { |
215 | /* | 216 | /* |
216 | * RR tasks need a special form of timeslice management. | 217 | * RR tasks need a special form of timeslice management. |
217 | * FIFO tasks have no timeslices. | 218 | * FIFO tasks have no timeslices. |
218 | */ | 219 | */ |
219 | if (p->policy != SCHED_RR) | 220 | if (p->policy != SCHED_RR) |
220 | return; | 221 | return; |
221 | 222 | ||
222 | if (--p->time_slice) | 223 | if (--p->time_slice) |
223 | return; | 224 | return; |
224 | 225 | ||
225 | p->time_slice = static_prio_timeslice(p->static_prio); | 226 | p->time_slice = static_prio_timeslice(p->static_prio); |
226 | set_tsk_need_resched(p); | 227 | set_tsk_need_resched(p); |
227 | 228 | ||
228 | /* put it at the end of the queue: */ | 229 | /* put it at the end of the queue: */ |
229 | requeue_task_rt(rq, p); | 230 | requeue_task_rt(rq, p); |
230 | } | 231 | } |
231 | 232 | ||
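task_tick_rt() only manages timeslices for SCHED_RR: decrement the slice and, when it reaches zero, refill it, flag a reschedule and requeue the task at the tail of its queue; SCHED_FIFO tasks pass straight through. A standalone sketch of that per-tick logic with an invented task type and a made-up refill value:

    /* Standalone sketch of the SCHED_RR timeslice handling in task_tick_rt(). */
    #include <stdio.h>

    #define POLICY_FIFO 1
    #define POLICY_RR   2
    #define RR_SLICE    4       /* made-up refill value */

    struct rt_task { const char *name; int policy; int time_slice; int need_resched; };

    static void tick(struct rt_task *p)
    {
        if (p->policy != POLICY_RR)
            return;                     /* FIFO: no timeslices */

        if (--p->time_slice)
            return;                     /* slice not used up yet */

        p->time_slice = RR_SLICE;       /* refill... */
        p->need_resched = 1;            /* ...and go to the end of the queue */
        printf("%s: slice expired, requeued\n", p->name);
    }

    int main(void)
    {
        struct rt_task rr = { "rr task", POLICY_RR, 2, 0 };

        for (int i = 0; i < 3; i++)
            tick(&rr);                  /* expires on the second tick */
        return 0;
    }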
232 | static struct sched_class rt_sched_class __read_mostly = { | 233 | static struct sched_class rt_sched_class __read_mostly = { |
233 | .enqueue_task = enqueue_task_rt, | 234 | .enqueue_task = enqueue_task_rt, |
234 | .dequeue_task = dequeue_task_rt, | 235 | .dequeue_task = dequeue_task_rt, |
235 | .yield_task = yield_task_rt, | 236 | .yield_task = yield_task_rt, |
236 | 237 | ||
237 | .check_preempt_curr = check_preempt_curr_rt, | 238 | .check_preempt_curr = check_preempt_curr_rt, |
238 | 239 | ||
239 | .pick_next_task = pick_next_task_rt, | 240 | .pick_next_task = pick_next_task_rt, |
240 | .put_prev_task = put_prev_task_rt, | 241 | .put_prev_task = put_prev_task_rt, |
241 | 242 | ||
242 | .load_balance = load_balance_rt, | 243 | .load_balance = load_balance_rt, |
243 | 244 | ||
244 | .task_tick = task_tick_rt, | 245 | .task_tick = task_tick_rt, |
245 | }; | 246 | }; |
246 | 247 |