Commit 476f35348eb8d2a827765992899fea78b7dcc46f
Committed by Linus Torvalds
1 parent aee16b3cee
Safer nr_cpu_ids and nr_node_ids determination and initial values
The nr_cpu_ids value is currently only calculated in smp_init(). However, it may be needed earlier (SLUB needs it in kmem_cache_init()), and other kernel components may also want to allocate dynamically sized per-cpu arrays before smp_init(). So move the determination of possible CPUs into sched_init(), where we already loop over all possible CPUs early in boot.

Also initialize both nr_node_ids and nr_cpu_ids with the highest value they could take. If we have accidental users before these values are determined, the current value of 0 may cause too-small per-cpu and per-node arrays to be allocated. If they are set to the maximum possible values instead, we only waste some memory for early boot users.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
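As a rough illustration of the approach described above (a sketch under the stated assumptions, not the literal patch hunks shown below; the helper name determine_nr_cpu_ids() is hypothetical and only gathers the logic the patch folds into sched_init()):

	/* Sketch only: give both counters safe upper-bound defaults, so any
	 * accidental early user sizes its per-cpu/per-node arrays too large
	 * rather than too small. */
	int nr_cpu_ids __read_mostly = NR_CPUS;
	EXPORT_SYMBOL(nr_cpu_ids);

	int nr_node_ids __read_mostly = MAX_NUMNODES;
	EXPORT_SYMBOL(nr_node_ids);

	/* Hypothetical helper showing the computation moved into sched_init():
	 * nr_cpu_ids becomes the highest possible CPU id plus one. */
	static void __init determine_nr_cpu_ids(void)
	{
		int highest_cpu = 0;
		int i;

		for_each_possible_cpu(i)
			highest_cpu = i;

		nr_cpu_ids = highest_cpu + 1;
	}

The corresponding removal of this computation from smp_init() is visible in the init/main.c hunk below.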
Showing 4 changed files with 9 additions and 9 deletions
init/main.c
1 | /* | 1 | /* |
2 | * linux/init/main.c | 2 | * linux/init/main.c |
3 | * | 3 | * |
4 | * Copyright (C) 1991, 1992 Linus Torvalds | 4 | * Copyright (C) 1991, 1992 Linus Torvalds |
5 | * | 5 | * |
6 | * GK 2/5/95 - Changed to support mounting root fs via NFS | 6 | * GK 2/5/95 - Changed to support mounting root fs via NFS |
7 | * Added initrd & change_root: Werner Almesberger & Hans Lermen, Feb '96 | 7 | * Added initrd & change_root: Werner Almesberger & Hans Lermen, Feb '96 |
8 | * Moan early if gcc is old, avoiding bogus kernels - Paul Gortmaker, May '96 | 8 | * Moan early if gcc is old, avoiding bogus kernels - Paul Gortmaker, May '96 |
9 | * Simplified starting of init: Michael A. Griffith <grif@acm.org> | 9 | * Simplified starting of init: Michael A. Griffith <grif@acm.org> |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #include <linux/types.h> | 12 | #include <linux/types.h> |
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/proc_fs.h> | 14 | #include <linux/proc_fs.h> |
15 | #include <linux/kernel.h> | 15 | #include <linux/kernel.h> |
16 | #include <linux/syscalls.h> | 16 | #include <linux/syscalls.h> |
17 | #include <linux/string.h> | 17 | #include <linux/string.h> |
18 | #include <linux/ctype.h> | 18 | #include <linux/ctype.h> |
19 | #include <linux/delay.h> | 19 | #include <linux/delay.h> |
20 | #include <linux/utsname.h> | 20 | #include <linux/utsname.h> |
21 | #include <linux/ioport.h> | 21 | #include <linux/ioport.h> |
22 | #include <linux/init.h> | 22 | #include <linux/init.h> |
23 | #include <linux/smp_lock.h> | 23 | #include <linux/smp_lock.h> |
24 | #include <linux/initrd.h> | 24 | #include <linux/initrd.h> |
25 | #include <linux/hdreg.h> | 25 | #include <linux/hdreg.h> |
26 | #include <linux/bootmem.h> | 26 | #include <linux/bootmem.h> |
27 | #include <linux/tty.h> | 27 | #include <linux/tty.h> |
28 | #include <linux/gfp.h> | 28 | #include <linux/gfp.h> |
29 | #include <linux/percpu.h> | 29 | #include <linux/percpu.h> |
30 | #include <linux/kmod.h> | 30 | #include <linux/kmod.h> |
31 | #include <linux/kernel_stat.h> | 31 | #include <linux/kernel_stat.h> |
32 | #include <linux/start_kernel.h> | 32 | #include <linux/start_kernel.h> |
33 | #include <linux/security.h> | 33 | #include <linux/security.h> |
34 | #include <linux/workqueue.h> | 34 | #include <linux/workqueue.h> |
35 | #include <linux/profile.h> | 35 | #include <linux/profile.h> |
36 | #include <linux/rcupdate.h> | 36 | #include <linux/rcupdate.h> |
37 | #include <linux/moduleparam.h> | 37 | #include <linux/moduleparam.h> |
38 | #include <linux/kallsyms.h> | 38 | #include <linux/kallsyms.h> |
39 | #include <linux/writeback.h> | 39 | #include <linux/writeback.h> |
40 | #include <linux/cpu.h> | 40 | #include <linux/cpu.h> |
41 | #include <linux/cpuset.h> | 41 | #include <linux/cpuset.h> |
42 | #include <linux/efi.h> | 42 | #include <linux/efi.h> |
43 | #include <linux/tick.h> | 43 | #include <linux/tick.h> |
44 | #include <linux/interrupt.h> | 44 | #include <linux/interrupt.h> |
45 | #include <linux/taskstats_kern.h> | 45 | #include <linux/taskstats_kern.h> |
46 | #include <linux/delayacct.h> | 46 | #include <linux/delayacct.h> |
47 | #include <linux/unistd.h> | 47 | #include <linux/unistd.h> |
48 | #include <linux/rmap.h> | 48 | #include <linux/rmap.h> |
49 | #include <linux/mempolicy.h> | 49 | #include <linux/mempolicy.h> |
50 | #include <linux/key.h> | 50 | #include <linux/key.h> |
51 | #include <linux/unwind.h> | 51 | #include <linux/unwind.h> |
52 | #include <linux/buffer_head.h> | 52 | #include <linux/buffer_head.h> |
53 | #include <linux/debug_locks.h> | 53 | #include <linux/debug_locks.h> |
54 | #include <linux/lockdep.h> | 54 | #include <linux/lockdep.h> |
55 | #include <linux/pid_namespace.h> | 55 | #include <linux/pid_namespace.h> |
56 | #include <linux/device.h> | 56 | #include <linux/device.h> |
57 | 57 | ||
58 | #include <asm/io.h> | 58 | #include <asm/io.h> |
59 | #include <asm/bugs.h> | 59 | #include <asm/bugs.h> |
60 | #include <asm/setup.h> | 60 | #include <asm/setup.h> |
61 | #include <asm/sections.h> | 61 | #include <asm/sections.h> |
62 | #include <asm/cacheflush.h> | 62 | #include <asm/cacheflush.h> |
63 | 63 | ||
64 | #ifdef CONFIG_X86_LOCAL_APIC | 64 | #ifdef CONFIG_X86_LOCAL_APIC |
65 | #include <asm/smp.h> | 65 | #include <asm/smp.h> |
66 | #endif | 66 | #endif |
67 | 67 | ||
68 | /* | 68 | /* |
69 | * This is one of the first .c files built. Error out early if we have compiler | 69 | * This is one of the first .c files built. Error out early if we have compiler |
70 | * trouble. | 70 | * trouble. |
71 | * | 71 | * |
72 | * Versions of gcc older than that listed below may actually compile and link | 72 | * Versions of gcc older than that listed below may actually compile and link |
73 | * okay, but the end product can have subtle run time bugs. To avoid associated | 73 | * okay, but the end product can have subtle run time bugs. To avoid associated |
74 | * bogus bug reports, we flatly refuse to compile with a gcc that is known to be | 74 | * bogus bug reports, we flatly refuse to compile with a gcc that is known to be |
75 | * too old from the very beginning. | 75 | * too old from the very beginning. |
76 | */ | 76 | */ |
77 | #if (__GNUC__ < 3) || (__GNUC__ == 3 && __GNUC_MINOR__ < 2) | 77 | #if (__GNUC__ < 3) || (__GNUC__ == 3 && __GNUC_MINOR__ < 2) |
78 | #error Sorry, your GCC is too old. It builds incorrect kernels. | 78 | #error Sorry, your GCC is too old. It builds incorrect kernels. |
79 | #endif | 79 | #endif |
80 | 80 | ||
81 | #if __GNUC__ == 4 && __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ == 0 | 81 | #if __GNUC__ == 4 && __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ == 0 |
82 | #warning gcc-4.1.0 is known to miscompile the kernel. A different compiler version is recommended. | 82 | #warning gcc-4.1.0 is known to miscompile the kernel. A different compiler version is recommended. |
83 | #endif | 83 | #endif |
84 | 84 | ||
85 | static int kernel_init(void *); | 85 | static int kernel_init(void *); |
86 | 86 | ||
87 | extern void init_IRQ(void); | 87 | extern void init_IRQ(void); |
88 | extern void fork_init(unsigned long); | 88 | extern void fork_init(unsigned long); |
89 | extern void mca_init(void); | 89 | extern void mca_init(void); |
90 | extern void sbus_init(void); | 90 | extern void sbus_init(void); |
91 | extern void signals_init(void); | 91 | extern void signals_init(void); |
92 | extern void pidhash_init(void); | 92 | extern void pidhash_init(void); |
93 | extern void pidmap_init(void); | 93 | extern void pidmap_init(void); |
94 | extern void prio_tree_init(void); | 94 | extern void prio_tree_init(void); |
95 | extern void radix_tree_init(void); | 95 | extern void radix_tree_init(void); |
96 | extern void free_initmem(void); | 96 | extern void free_initmem(void); |
97 | extern void prepare_namespace(void); | 97 | extern void prepare_namespace(void); |
98 | #ifdef CONFIG_ACPI | 98 | #ifdef CONFIG_ACPI |
99 | extern void acpi_early_init(void); | 99 | extern void acpi_early_init(void); |
100 | #else | 100 | #else |
101 | static inline void acpi_early_init(void) { } | 101 | static inline void acpi_early_init(void) { } |
102 | #endif | 102 | #endif |
103 | #ifndef CONFIG_DEBUG_RODATA | 103 | #ifndef CONFIG_DEBUG_RODATA |
104 | static inline void mark_rodata_ro(void) { } | 104 | static inline void mark_rodata_ro(void) { } |
105 | #endif | 105 | #endif |
106 | 106 | ||
107 | #ifdef CONFIG_TC | 107 | #ifdef CONFIG_TC |
108 | extern void tc_init(void); | 108 | extern void tc_init(void); |
109 | #endif | 109 | #endif |
110 | 110 | ||
111 | enum system_states system_state; | 111 | enum system_states system_state; |
112 | EXPORT_SYMBOL(system_state); | 112 | EXPORT_SYMBOL(system_state); |
113 | 113 | ||
114 | /* | 114 | /* |
115 | * Boot command-line arguments | 115 | * Boot command-line arguments |
116 | */ | 116 | */ |
117 | #define MAX_INIT_ARGS CONFIG_INIT_ENV_ARG_LIMIT | 117 | #define MAX_INIT_ARGS CONFIG_INIT_ENV_ARG_LIMIT |
118 | #define MAX_INIT_ENVS CONFIG_INIT_ENV_ARG_LIMIT | 118 | #define MAX_INIT_ENVS CONFIG_INIT_ENV_ARG_LIMIT |
119 | 119 | ||
120 | extern void time_init(void); | 120 | extern void time_init(void); |
121 | /* Default late time init is NULL. archs can override this later. */ | 121 | /* Default late time init is NULL. archs can override this later. */ |
122 | void (*late_time_init)(void); | 122 | void (*late_time_init)(void); |
123 | extern void softirq_init(void); | 123 | extern void softirq_init(void); |
124 | 124 | ||
125 | /* Untouched command line saved by arch-specific code. */ | 125 | /* Untouched command line saved by arch-specific code. */ |
126 | char __initdata boot_command_line[COMMAND_LINE_SIZE]; | 126 | char __initdata boot_command_line[COMMAND_LINE_SIZE]; |
127 | /* Untouched saved command line (eg. for /proc) */ | 127 | /* Untouched saved command line (eg. for /proc) */ |
128 | char *saved_command_line; | 128 | char *saved_command_line; |
129 | /* Command line for parameter parsing */ | 129 | /* Command line for parameter parsing */ |
130 | static char *static_command_line; | 130 | static char *static_command_line; |
131 | 131 | ||
132 | static char *execute_command; | 132 | static char *execute_command; |
133 | static char *ramdisk_execute_command; | 133 | static char *ramdisk_execute_command; |
134 | 134 | ||
135 | /* Setup configured maximum number of CPUs to activate */ | 135 | /* Setup configured maximum number of CPUs to activate */ |
136 | static unsigned int max_cpus = NR_CPUS; | 136 | static unsigned int max_cpus = NR_CPUS; |
137 | 137 | ||
138 | /* | 138 | /* |
139 | * If set, this is an indication to the drivers that reset the underlying | 139 | * If set, this is an indication to the drivers that reset the underlying |
140 | * device before going ahead with the initialization otherwise driver might | 140 | * device before going ahead with the initialization otherwise driver might |
141 | * rely on the BIOS and skip the reset operation. | 141 | * rely on the BIOS and skip the reset operation. |
142 | * | 142 | * |
143 | * This is useful if kernel is booting in an unreliable environment. | 143 | * This is useful if kernel is booting in an unreliable environment. |
144 | * For ex. kdump situaiton where previous kernel has crashed, BIOS has been | 144 | * For ex. kdump situaiton where previous kernel has crashed, BIOS has been |
145 | * skipped and devices will be in unknown state. | 145 | * skipped and devices will be in unknown state. |
146 | */ | 146 | */ |
147 | unsigned int reset_devices; | 147 | unsigned int reset_devices; |
148 | EXPORT_SYMBOL(reset_devices); | 148 | EXPORT_SYMBOL(reset_devices); |
149 | 149 | ||
150 | /* | 150 | /* |
151 | * Setup routine for controlling SMP activation | 151 | * Setup routine for controlling SMP activation |
152 | * | 152 | * |
153 | * Command-line option of "nosmp" or "maxcpus=0" will disable SMP | 153 | * Command-line option of "nosmp" or "maxcpus=0" will disable SMP |
154 | * activation entirely (the MPS table probe still happens, though). | 154 | * activation entirely (the MPS table probe still happens, though). |
155 | * | 155 | * |
156 | * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer | 156 | * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer |
157 | * greater than 0, limits the maximum number of CPUs activated in | 157 | * greater than 0, limits the maximum number of CPUs activated in |
158 | * SMP mode to <NUM>. | 158 | * SMP mode to <NUM>. |
159 | */ | 159 | */ |
160 | static int __init nosmp(char *str) | 160 | static int __init nosmp(char *str) |
161 | { | 161 | { |
162 | max_cpus = 0; | 162 | max_cpus = 0; |
163 | return 1; | 163 | return 1; |
164 | } | 164 | } |
165 | 165 | ||
166 | __setup("nosmp", nosmp); | 166 | __setup("nosmp", nosmp); |
167 | 167 | ||
168 | static int __init maxcpus(char *str) | 168 | static int __init maxcpus(char *str) |
169 | { | 169 | { |
170 | get_option(&str, &max_cpus); | 170 | get_option(&str, &max_cpus); |
171 | return 1; | 171 | return 1; |
172 | } | 172 | } |
173 | 173 | ||
174 | __setup("maxcpus=", maxcpus); | 174 | __setup("maxcpus=", maxcpus); |
175 | 175 | ||
176 | static int __init set_reset_devices(char *str) | 176 | static int __init set_reset_devices(char *str) |
177 | { | 177 | { |
178 | reset_devices = 1; | 178 | reset_devices = 1; |
179 | return 1; | 179 | return 1; |
180 | } | 180 | } |
181 | 181 | ||
182 | __setup("reset_devices", set_reset_devices); | 182 | __setup("reset_devices", set_reset_devices); |
183 | 183 | ||
184 | static char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; | 184 | static char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; |
185 | char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; | 185 | char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; |
186 | static const char *panic_later, *panic_param; | 186 | static const char *panic_later, *panic_param; |
187 | 187 | ||
188 | extern struct obs_kernel_param __setup_start[], __setup_end[]; | 188 | extern struct obs_kernel_param __setup_start[], __setup_end[]; |
189 | 189 | ||
190 | static int __init obsolete_checksetup(char *line) | 190 | static int __init obsolete_checksetup(char *line) |
191 | { | 191 | { |
192 | struct obs_kernel_param *p; | 192 | struct obs_kernel_param *p; |
193 | int had_early_param = 0; | 193 | int had_early_param = 0; |
194 | 194 | ||
195 | p = __setup_start; | 195 | p = __setup_start; |
196 | do { | 196 | do { |
197 | int n = strlen(p->str); | 197 | int n = strlen(p->str); |
198 | if (!strncmp(line, p->str, n)) { | 198 | if (!strncmp(line, p->str, n)) { |
199 | if (p->early) { | 199 | if (p->early) { |
200 | /* Already done in parse_early_param? | 200 | /* Already done in parse_early_param? |
201 | * (Needs exact match on param part). | 201 | * (Needs exact match on param part). |
202 | * Keep iterating, as we can have early | 202 | * Keep iterating, as we can have early |
203 | * params and __setups of same names 8( */ | 203 | * params and __setups of same names 8( */ |
204 | if (line[n] == '\0' || line[n] == '=') | 204 | if (line[n] == '\0' || line[n] == '=') |
205 | had_early_param = 1; | 205 | had_early_param = 1; |
206 | } else if (!p->setup_func) { | 206 | } else if (!p->setup_func) { |
207 | printk(KERN_WARNING "Parameter %s is obsolete," | 207 | printk(KERN_WARNING "Parameter %s is obsolete," |
208 | " ignored\n", p->str); | 208 | " ignored\n", p->str); |
209 | return 1; | 209 | return 1; |
210 | } else if (p->setup_func(line + n)) | 210 | } else if (p->setup_func(line + n)) |
211 | return 1; | 211 | return 1; |
212 | } | 212 | } |
213 | p++; | 213 | p++; |
214 | } while (p < __setup_end); | 214 | } while (p < __setup_end); |
215 | 215 | ||
216 | return had_early_param; | 216 | return had_early_param; |
217 | } | 217 | } |
218 | 218 | ||
219 | /* | 219 | /* |
220 | * This should be approx 2 Bo*oMips to start (note initial shift), and will | 220 | * This should be approx 2 Bo*oMips to start (note initial shift), and will |
221 | * still work even if initially too large, it will just take slightly longer | 221 | * still work even if initially too large, it will just take slightly longer |
222 | */ | 222 | */ |
223 | unsigned long loops_per_jiffy = (1<<12); | 223 | unsigned long loops_per_jiffy = (1<<12); |
224 | 224 | ||
225 | EXPORT_SYMBOL(loops_per_jiffy); | 225 | EXPORT_SYMBOL(loops_per_jiffy); |
226 | 226 | ||
227 | static int __init debug_kernel(char *str) | 227 | static int __init debug_kernel(char *str) |
228 | { | 228 | { |
229 | if (*str) | 229 | if (*str) |
230 | return 0; | 230 | return 0; |
231 | console_loglevel = 10; | 231 | console_loglevel = 10; |
232 | return 1; | 232 | return 1; |
233 | } | 233 | } |
234 | 234 | ||
235 | static int __init quiet_kernel(char *str) | 235 | static int __init quiet_kernel(char *str) |
236 | { | 236 | { |
237 | if (*str) | 237 | if (*str) |
238 | return 0; | 238 | return 0; |
239 | console_loglevel = 4; | 239 | console_loglevel = 4; |
240 | return 1; | 240 | return 1; |
241 | } | 241 | } |
242 | 242 | ||
243 | __setup("debug", debug_kernel); | 243 | __setup("debug", debug_kernel); |
244 | __setup("quiet", quiet_kernel); | 244 | __setup("quiet", quiet_kernel); |
245 | 245 | ||
246 | static int __init loglevel(char *str) | 246 | static int __init loglevel(char *str) |
247 | { | 247 | { |
248 | get_option(&str, &console_loglevel); | 248 | get_option(&str, &console_loglevel); |
249 | return 1; | 249 | return 1; |
250 | } | 250 | } |
251 | 251 | ||
252 | __setup("loglevel=", loglevel); | 252 | __setup("loglevel=", loglevel); |
253 | 253 | ||
254 | /* | 254 | /* |
255 | * Unknown boot options get handed to init, unless they look like | 255 | * Unknown boot options get handed to init, unless they look like |
256 | * failed parameters | 256 | * failed parameters |
257 | */ | 257 | */ |
258 | static int __init unknown_bootoption(char *param, char *val) | 258 | static int __init unknown_bootoption(char *param, char *val) |
259 | { | 259 | { |
260 | /* Change NUL term back to "=", to make "param" the whole string. */ | 260 | /* Change NUL term back to "=", to make "param" the whole string. */ |
261 | if (val) { | 261 | if (val) { |
262 | /* param=val or param="val"? */ | 262 | /* param=val or param="val"? */ |
263 | if (val == param+strlen(param)+1) | 263 | if (val == param+strlen(param)+1) |
264 | val[-1] = '='; | 264 | val[-1] = '='; |
265 | else if (val == param+strlen(param)+2) { | 265 | else if (val == param+strlen(param)+2) { |
266 | val[-2] = '='; | 266 | val[-2] = '='; |
267 | memmove(val-1, val, strlen(val)+1); | 267 | memmove(val-1, val, strlen(val)+1); |
268 | val--; | 268 | val--; |
269 | } else | 269 | } else |
270 | BUG(); | 270 | BUG(); |
271 | } | 271 | } |
272 | 272 | ||
273 | /* Handle obsolete-style parameters */ | 273 | /* Handle obsolete-style parameters */ |
274 | if (obsolete_checksetup(param)) | 274 | if (obsolete_checksetup(param)) |
275 | return 0; | 275 | return 0; |
276 | 276 | ||
277 | /* | 277 | /* |
278 | * Preemptive maintenance for "why didn't my mispelled command | 278 | * Preemptive maintenance for "why didn't my mispelled command |
279 | * line work?" | 279 | * line work?" |
280 | */ | 280 | */ |
281 | if (strchr(param, '.') && (!val || strchr(param, '.') < val)) { | 281 | if (strchr(param, '.') && (!val || strchr(param, '.') < val)) { |
282 | printk(KERN_ERR "Unknown boot option `%s': ignoring\n", param); | 282 | printk(KERN_ERR "Unknown boot option `%s': ignoring\n", param); |
283 | return 0; | 283 | return 0; |
284 | } | 284 | } |
285 | 285 | ||
286 | if (panic_later) | 286 | if (panic_later) |
287 | return 0; | 287 | return 0; |
288 | 288 | ||
289 | if (val) { | 289 | if (val) { |
290 | /* Environment option */ | 290 | /* Environment option */ |
291 | unsigned int i; | 291 | unsigned int i; |
292 | for (i = 0; envp_init[i]; i++) { | 292 | for (i = 0; envp_init[i]; i++) { |
293 | if (i == MAX_INIT_ENVS) { | 293 | if (i == MAX_INIT_ENVS) { |
294 | panic_later = "Too many boot env vars at `%s'"; | 294 | panic_later = "Too many boot env vars at `%s'"; |
295 | panic_param = param; | 295 | panic_param = param; |
296 | } | 296 | } |
297 | if (!strncmp(param, envp_init[i], val - param)) | 297 | if (!strncmp(param, envp_init[i], val - param)) |
298 | break; | 298 | break; |
299 | } | 299 | } |
300 | envp_init[i] = param; | 300 | envp_init[i] = param; |
301 | } else { | 301 | } else { |
302 | /* Command line option */ | 302 | /* Command line option */ |
303 | unsigned int i; | 303 | unsigned int i; |
304 | for (i = 0; argv_init[i]; i++) { | 304 | for (i = 0; argv_init[i]; i++) { |
305 | if (i == MAX_INIT_ARGS) { | 305 | if (i == MAX_INIT_ARGS) { |
306 | panic_later = "Too many boot init vars at `%s'"; | 306 | panic_later = "Too many boot init vars at `%s'"; |
307 | panic_param = param; | 307 | panic_param = param; |
308 | } | 308 | } |
309 | } | 309 | } |
310 | argv_init[i] = param; | 310 | argv_init[i] = param; |
311 | } | 311 | } |
312 | return 0; | 312 | return 0; |
313 | } | 313 | } |
314 | 314 | ||
315 | static int __init init_setup(char *str) | 315 | static int __init init_setup(char *str) |
316 | { | 316 | { |
317 | unsigned int i; | 317 | unsigned int i; |
318 | 318 | ||
319 | execute_command = str; | 319 | execute_command = str; |
320 | /* | 320 | /* |
321 | * In case LILO is going to boot us with default command line, | 321 | * In case LILO is going to boot us with default command line, |
322 | * it prepends "auto" before the whole cmdline which makes | 322 | * it prepends "auto" before the whole cmdline which makes |
323 | * the shell think it should execute a script with such name. | 323 | * the shell think it should execute a script with such name. |
324 | * So we ignore all arguments entered _before_ init=... [MJ] | 324 | * So we ignore all arguments entered _before_ init=... [MJ] |
325 | */ | 325 | */ |
326 | for (i = 1; i < MAX_INIT_ARGS; i++) | 326 | for (i = 1; i < MAX_INIT_ARGS; i++) |
327 | argv_init[i] = NULL; | 327 | argv_init[i] = NULL; |
328 | return 1; | 328 | return 1; |
329 | } | 329 | } |
330 | __setup("init=", init_setup); | 330 | __setup("init=", init_setup); |
331 | 331 | ||
332 | static int __init rdinit_setup(char *str) | 332 | static int __init rdinit_setup(char *str) |
333 | { | 333 | { |
334 | unsigned int i; | 334 | unsigned int i; |
335 | 335 | ||
336 | ramdisk_execute_command = str; | 336 | ramdisk_execute_command = str; |
337 | /* See "auto" comment in init_setup */ | 337 | /* See "auto" comment in init_setup */ |
338 | for (i = 1; i < MAX_INIT_ARGS; i++) | 338 | for (i = 1; i < MAX_INIT_ARGS; i++) |
339 | argv_init[i] = NULL; | 339 | argv_init[i] = NULL; |
340 | return 1; | 340 | return 1; |
341 | } | 341 | } |
342 | __setup("rdinit=", rdinit_setup); | 342 | __setup("rdinit=", rdinit_setup); |
343 | 343 | ||
344 | #ifndef CONFIG_SMP | 344 | #ifndef CONFIG_SMP |
345 | 345 | ||
346 | #ifdef CONFIG_X86_LOCAL_APIC | 346 | #ifdef CONFIG_X86_LOCAL_APIC |
347 | static void __init smp_init(void) | 347 | static void __init smp_init(void) |
348 | { | 348 | { |
349 | APIC_init_uniprocessor(); | 349 | APIC_init_uniprocessor(); |
350 | } | 350 | } |
351 | #else | 351 | #else |
352 | #define smp_init() do { } while (0) | 352 | #define smp_init() do { } while (0) |
353 | #endif | 353 | #endif |
354 | 354 | ||
355 | static inline void setup_per_cpu_areas(void) { } | 355 | static inline void setup_per_cpu_areas(void) { } |
356 | static inline void smp_prepare_cpus(unsigned int maxcpus) { } | 356 | static inline void smp_prepare_cpus(unsigned int maxcpus) { } |
357 | 357 | ||
358 | #else | 358 | #else |
359 | 359 | ||
360 | #ifdef __GENERIC_PER_CPU | 360 | #ifdef __GENERIC_PER_CPU |
361 | unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; | 361 | unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; |
362 | 362 | ||
363 | EXPORT_SYMBOL(__per_cpu_offset); | 363 | EXPORT_SYMBOL(__per_cpu_offset); |
364 | 364 | ||
365 | static void __init setup_per_cpu_areas(void) | 365 | static void __init setup_per_cpu_areas(void) |
366 | { | 366 | { |
367 | unsigned long size, i; | 367 | unsigned long size, i; |
368 | char *ptr; | 368 | char *ptr; |
369 | unsigned long nr_possible_cpus = num_possible_cpus(); | 369 | unsigned long nr_possible_cpus = num_possible_cpus(); |
370 | 370 | ||
371 | /* Copy section for each CPU (we discard the original) */ | 371 | /* Copy section for each CPU (we discard the original) */ |
372 | size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); | 372 | size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); |
373 | ptr = alloc_bootmem_pages(size * nr_possible_cpus); | 373 | ptr = alloc_bootmem_pages(size * nr_possible_cpus); |
374 | 374 | ||
375 | for_each_possible_cpu(i) { | 375 | for_each_possible_cpu(i) { |
376 | __per_cpu_offset[i] = ptr - __per_cpu_start; | 376 | __per_cpu_offset[i] = ptr - __per_cpu_start; |
377 | memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); | 377 | memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); |
378 | ptr += size; | 378 | ptr += size; |
379 | } | 379 | } |
380 | } | 380 | } |
381 | #endif /* !__GENERIC_PER_CPU */ | 381 | #endif /* !__GENERIC_PER_CPU */ |
382 | 382 | ||
383 | /* Called by boot processor to activate the rest. */ | 383 | /* Called by boot processor to activate the rest. */ |
384 | static void __init smp_init(void) | 384 | static void __init smp_init(void) |
385 | { | 385 | { |
386 | unsigned int cpu; | 386 | unsigned int cpu; |
387 | unsigned highest = 0; | ||
388 | |||
389 | for_each_cpu_mask(cpu, cpu_possible_map) | ||
390 | highest = cpu; | ||
391 | nr_cpu_ids = highest + 1; | ||
392 | 387 | ||
393 | /* FIXME: This should be done in userspace --RR */ | 388 | /* FIXME: This should be done in userspace --RR */ |
394 | for_each_present_cpu(cpu) { | 389 | for_each_present_cpu(cpu) { |
395 | if (num_online_cpus() >= max_cpus) | 390 | if (num_online_cpus() >= max_cpus) |
396 | break; | 391 | break; |
397 | if (!cpu_online(cpu)) | 392 | if (!cpu_online(cpu)) |
398 | cpu_up(cpu); | 393 | cpu_up(cpu); |
399 | } | 394 | } |
400 | 395 | ||
401 | /* Any cleanup work */ | 396 | /* Any cleanup work */ |
402 | printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); | 397 | printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); |
403 | smp_cpus_done(max_cpus); | 398 | smp_cpus_done(max_cpus); |
404 | } | 399 | } |
405 | 400 | ||
406 | #endif | 401 | #endif |
407 | 402 | ||
408 | /* | 403 | /* |
409 | * We need to store the untouched command line for future reference. | 404 | * We need to store the untouched command line for future reference. |
410 | * We also need to store the touched command line since the parameter | 405 | * We also need to store the touched command line since the parameter |
411 | * parsing is performed in place, and we should allow a component to | 406 | * parsing is performed in place, and we should allow a component to |
412 | * store reference of name/value for future reference. | 407 | * store reference of name/value for future reference. |
413 | */ | 408 | */ |
414 | static void __init setup_command_line(char *command_line) | 409 | static void __init setup_command_line(char *command_line) |
415 | { | 410 | { |
416 | saved_command_line = alloc_bootmem(strlen (boot_command_line)+1); | 411 | saved_command_line = alloc_bootmem(strlen (boot_command_line)+1); |
417 | static_command_line = alloc_bootmem(strlen (command_line)+1); | 412 | static_command_line = alloc_bootmem(strlen (command_line)+1); |
418 | strcpy (saved_command_line, boot_command_line); | 413 | strcpy (saved_command_line, boot_command_line); |
419 | strcpy (static_command_line, command_line); | 414 | strcpy (static_command_line, command_line); |
420 | } | 415 | } |
421 | 416 | ||
422 | /* | 417 | /* |
423 | * We need to finalize in a non-__init function or else race conditions | 418 | * We need to finalize in a non-__init function or else race conditions |
424 | * between the root thread and the init thread may cause start_kernel to | 419 | * between the root thread and the init thread may cause start_kernel to |
425 | * be reaped by free_initmem before the root thread has proceeded to | 420 | * be reaped by free_initmem before the root thread has proceeded to |
426 | * cpu_idle. | 421 | * cpu_idle. |
427 | * | 422 | * |
428 | * gcc-3.4 accidentally inlines this function, so use noinline. | 423 | * gcc-3.4 accidentally inlines this function, so use noinline. |
429 | */ | 424 | */ |
430 | 425 | ||
431 | static void noinline rest_init(void) | 426 | static void noinline rest_init(void) |
432 | __releases(kernel_lock) | 427 | __releases(kernel_lock) |
433 | { | 428 | { |
434 | kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND); | 429 | kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND); |
435 | numa_default_policy(); | 430 | numa_default_policy(); |
436 | unlock_kernel(); | 431 | unlock_kernel(); |
437 | 432 | ||
438 | /* | 433 | /* |
439 | * The boot idle thread must execute schedule() | 434 | * The boot idle thread must execute schedule() |
440 | * at least one to get things moving: | 435 | * at least one to get things moving: |
441 | */ | 436 | */ |
442 | preempt_enable_no_resched(); | 437 | preempt_enable_no_resched(); |
443 | schedule(); | 438 | schedule(); |
444 | preempt_disable(); | 439 | preempt_disable(); |
445 | 440 | ||
446 | /* Call into cpu_idle with preempt disabled */ | 441 | /* Call into cpu_idle with preempt disabled */ |
447 | cpu_idle(); | 442 | cpu_idle(); |
448 | } | 443 | } |
449 | 444 | ||
450 | /* Check for early params. */ | 445 | /* Check for early params. */ |
451 | static int __init do_early_param(char *param, char *val) | 446 | static int __init do_early_param(char *param, char *val) |
452 | { | 447 | { |
453 | struct obs_kernel_param *p; | 448 | struct obs_kernel_param *p; |
454 | 449 | ||
455 | for (p = __setup_start; p < __setup_end; p++) { | 450 | for (p = __setup_start; p < __setup_end; p++) { |
456 | if (p->early && strcmp(param, p->str) == 0) { | 451 | if (p->early && strcmp(param, p->str) == 0) { |
457 | if (p->setup_func(val) != 0) | 452 | if (p->setup_func(val) != 0) |
458 | printk(KERN_WARNING | 453 | printk(KERN_WARNING |
459 | "Malformed early option '%s'\n", param); | 454 | "Malformed early option '%s'\n", param); |
460 | } | 455 | } |
461 | } | 456 | } |
462 | /* We accept everything at this stage. */ | 457 | /* We accept everything at this stage. */ |
463 | return 0; | 458 | return 0; |
464 | } | 459 | } |
465 | 460 | ||
466 | /* Arch code calls this early on, or if not, just before other parsing. */ | 461 | /* Arch code calls this early on, or if not, just before other parsing. */ |
467 | void __init parse_early_param(void) | 462 | void __init parse_early_param(void) |
468 | { | 463 | { |
469 | static __initdata int done = 0; | 464 | static __initdata int done = 0; |
470 | static __initdata char tmp_cmdline[COMMAND_LINE_SIZE]; | 465 | static __initdata char tmp_cmdline[COMMAND_LINE_SIZE]; |
471 | 466 | ||
472 | if (done) | 467 | if (done) |
473 | return; | 468 | return; |
474 | 469 | ||
475 | /* All fall through to do_early_param. */ | 470 | /* All fall through to do_early_param. */ |
476 | strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); | 471 | strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); |
477 | parse_args("early options", tmp_cmdline, NULL, 0, do_early_param); | 472 | parse_args("early options", tmp_cmdline, NULL, 0, do_early_param); |
478 | done = 1; | 473 | done = 1; |
479 | } | 474 | } |
480 | 475 | ||
481 | /* | 476 | /* |
482 | * Activate the first processor. | 477 | * Activate the first processor. |
483 | */ | 478 | */ |
484 | 479 | ||
485 | static void __init boot_cpu_init(void) | 480 | static void __init boot_cpu_init(void) |
486 | { | 481 | { |
487 | int cpu = smp_processor_id(); | 482 | int cpu = smp_processor_id(); |
488 | /* Mark the boot cpu "present", "online" etc for SMP and UP case */ | 483 | /* Mark the boot cpu "present", "online" etc for SMP and UP case */ |
489 | cpu_set(cpu, cpu_online_map); | 484 | cpu_set(cpu, cpu_online_map); |
490 | cpu_set(cpu, cpu_present_map); | 485 | cpu_set(cpu, cpu_present_map); |
491 | cpu_set(cpu, cpu_possible_map); | 486 | cpu_set(cpu, cpu_possible_map); |
492 | } | 487 | } |
493 | 488 | ||
494 | void __init __attribute__((weak)) smp_setup_processor_id(void) | 489 | void __init __attribute__((weak)) smp_setup_processor_id(void) |
495 | { | 490 | { |
496 | } | 491 | } |
497 | 492 | ||
498 | asmlinkage void __init start_kernel(void) | 493 | asmlinkage void __init start_kernel(void) |
499 | { | 494 | { |
500 | char * command_line; | 495 | char * command_line; |
501 | extern struct kernel_param __start___param[], __stop___param[]; | 496 | extern struct kernel_param __start___param[], __stop___param[]; |
502 | 497 | ||
503 | smp_setup_processor_id(); | 498 | smp_setup_processor_id(); |
504 | 499 | ||
505 | /* | 500 | /* |
506 | * Need to run as early as possible, to initialize the | 501 | * Need to run as early as possible, to initialize the |
507 | * lockdep hash: | 502 | * lockdep hash: |
508 | */ | 503 | */ |
509 | unwind_init(); | 504 | unwind_init(); |
510 | lockdep_init(); | 505 | lockdep_init(); |
511 | 506 | ||
512 | local_irq_disable(); | 507 | local_irq_disable(); |
513 | early_boot_irqs_off(); | 508 | early_boot_irqs_off(); |
514 | early_init_irq_lock_class(); | 509 | early_init_irq_lock_class(); |
515 | 510 | ||
516 | /* | 511 | /* |
517 | * Interrupts are still disabled. Do necessary setups, then | 512 | * Interrupts are still disabled. Do necessary setups, then |
518 | * enable them | 513 | * enable them |
519 | */ | 514 | */ |
520 | lock_kernel(); | 515 | lock_kernel(); |
521 | tick_init(); | 516 | tick_init(); |
522 | boot_cpu_init(); | 517 | boot_cpu_init(); |
523 | page_address_init(); | 518 | page_address_init(); |
524 | printk(KERN_NOTICE); | 519 | printk(KERN_NOTICE); |
525 | printk(linux_banner); | 520 | printk(linux_banner); |
526 | setup_arch(&command_line); | 521 | setup_arch(&command_line); |
527 | setup_command_line(command_line); | 522 | setup_command_line(command_line); |
528 | unwind_setup(); | 523 | unwind_setup(); |
529 | setup_per_cpu_areas(); | 524 | setup_per_cpu_areas(); |
530 | smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ | 525 | smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ |
531 | 526 | ||
532 | /* | 527 | /* |
533 | * Set up the scheduler prior starting any interrupts (such as the | 528 | * Set up the scheduler prior starting any interrupts (such as the |
534 | * timer interrupt). Full topology setup happens at smp_init() | 529 | * timer interrupt). Full topology setup happens at smp_init() |
535 | * time - but meanwhile we still have a functioning scheduler. | 530 | * time - but meanwhile we still have a functioning scheduler. |
536 | */ | 531 | */ |
537 | sched_init(); | 532 | sched_init(); |
538 | /* | 533 | /* |
539 | * Disable preemption - early bootup scheduling is extremely | 534 | * Disable preemption - early bootup scheduling is extremely |
540 | * fragile until we cpu_idle() for the first time. | 535 | * fragile until we cpu_idle() for the first time. |
541 | */ | 536 | */ |
542 | preempt_disable(); | 537 | preempt_disable(); |
543 | build_all_zonelists(); | 538 | build_all_zonelists(); |
544 | page_alloc_init(); | 539 | page_alloc_init(); |
545 | printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); | 540 | printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); |
546 | parse_early_param(); | 541 | parse_early_param(); |
547 | parse_args("Booting kernel", static_command_line, __start___param, | 542 | parse_args("Booting kernel", static_command_line, __start___param, |
548 | __stop___param - __start___param, | 543 | __stop___param - __start___param, |
549 | &unknown_bootoption); | 544 | &unknown_bootoption); |
550 | if (!irqs_disabled()) { | 545 | if (!irqs_disabled()) { |
551 | printk(KERN_WARNING "start_kernel(): bug: interrupts were " | 546 | printk(KERN_WARNING "start_kernel(): bug: interrupts were " |
552 | "enabled *very* early, fixing it\n"); | 547 | "enabled *very* early, fixing it\n"); |
553 | local_irq_disable(); | 548 | local_irq_disable(); |
554 | } | 549 | } |
555 | sort_main_extable(); | 550 | sort_main_extable(); |
556 | trap_init(); | 551 | trap_init(); |
557 | rcu_init(); | 552 | rcu_init(); |
558 | init_IRQ(); | 553 | init_IRQ(); |
559 | pidhash_init(); | 554 | pidhash_init(); |
560 | init_timers(); | 555 | init_timers(); |
561 | hrtimers_init(); | 556 | hrtimers_init(); |
562 | softirq_init(); | 557 | softirq_init(); |
563 | timekeeping_init(); | 558 | timekeeping_init(); |
564 | time_init(); | 559 | time_init(); |
565 | profile_init(); | 560 | profile_init(); |
566 | if (!irqs_disabled()) | 561 | if (!irqs_disabled()) |
567 | printk("start_kernel(): bug: interrupts were enabled early\n"); | 562 | printk("start_kernel(): bug: interrupts were enabled early\n"); |
568 | early_boot_irqs_on(); | 563 | early_boot_irqs_on(); |
569 | local_irq_enable(); | 564 | local_irq_enable(); |
570 | 565 | ||
571 | /* | 566 | /* |
572 | * HACK ALERT! This is early. We're enabling the console before | 567 | * HACK ALERT! This is early. We're enabling the console before |
573 | * we've done PCI setups etc, and console_init() must be aware of | 568 | * we've done PCI setups etc, and console_init() must be aware of |
574 | * this. But we do want output early, in case something goes wrong. | 569 | * this. But we do want output early, in case something goes wrong. |
575 | */ | 570 | */ |
576 | console_init(); | 571 | console_init(); |
577 | if (panic_later) | 572 | if (panic_later) |
578 | panic(panic_later, panic_param); | 573 | panic(panic_later, panic_param); |
579 | 574 | ||
580 | lockdep_info(); | 575 | lockdep_info(); |
581 | 576 | ||
582 | /* | 577 | /* |
583 | * Need to run this when irqs are enabled, because it wants | 578 | * Need to run this when irqs are enabled, because it wants |
584 | * to self-test [hard/soft]-irqs on/off lock inversion bugs | 579 | * to self-test [hard/soft]-irqs on/off lock inversion bugs |
585 | * too: | 580 | * too: |
586 | */ | 581 | */ |
587 | locking_selftest(); | 582 | locking_selftest(); |
588 | 583 | ||
589 | #ifdef CONFIG_BLK_DEV_INITRD | 584 | #ifdef CONFIG_BLK_DEV_INITRD |
590 | if (initrd_start && !initrd_below_start_ok && | 585 | if (initrd_start && !initrd_below_start_ok && |
591 | initrd_start < min_low_pfn << PAGE_SHIFT) { | 586 | initrd_start < min_low_pfn << PAGE_SHIFT) { |
592 | printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - " | 587 | printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - " |
593 | "disabling it.\n",initrd_start,min_low_pfn << PAGE_SHIFT); | 588 | "disabling it.\n",initrd_start,min_low_pfn << PAGE_SHIFT); |
594 | initrd_start = 0; | 589 | initrd_start = 0; |
595 | } | 590 | } |
596 | #endif | 591 | #endif |
597 | vfs_caches_init_early(); | 592 | vfs_caches_init_early(); |
598 | cpuset_init_early(); | 593 | cpuset_init_early(); |
599 | mem_init(); | 594 | mem_init(); |
600 | kmem_cache_init(); | 595 | kmem_cache_init(); |
601 | setup_per_cpu_pageset(); | 596 | setup_per_cpu_pageset(); |
602 | numa_policy_init(); | 597 | numa_policy_init(); |
603 | if (late_time_init) | 598 | if (late_time_init) |
604 | late_time_init(); | 599 | late_time_init(); |
605 | calibrate_delay(); | 600 | calibrate_delay(); |
606 | pidmap_init(); | 601 | pidmap_init(); |
607 | pgtable_cache_init(); | 602 | pgtable_cache_init(); |
608 | prio_tree_init(); | 603 | prio_tree_init(); |
609 | anon_vma_init(); | 604 | anon_vma_init(); |
610 | #ifdef CONFIG_X86 | 605 | #ifdef CONFIG_X86 |
611 | if (efi_enabled) | 606 | if (efi_enabled) |
612 | efi_enter_virtual_mode(); | 607 | efi_enter_virtual_mode(); |
613 | #endif | 608 | #endif |
614 | fork_init(num_physpages); | 609 | fork_init(num_physpages); |
615 | proc_caches_init(); | 610 | proc_caches_init(); |
616 | buffer_init(); | 611 | buffer_init(); |
617 | unnamed_dev_init(); | 612 | unnamed_dev_init(); |
618 | key_init(); | 613 | key_init(); |
619 | security_init(); | 614 | security_init(); |
620 | vfs_caches_init(num_physpages); | 615 | vfs_caches_init(num_physpages); |
621 | radix_tree_init(); | 616 | radix_tree_init(); |
622 | signals_init(); | 617 | signals_init(); |
623 | /* rootfs populating might need page-writeback */ | 618 | /* rootfs populating might need page-writeback */ |
624 | page_writeback_init(); | 619 | page_writeback_init(); |
625 | #ifdef CONFIG_PROC_FS | 620 | #ifdef CONFIG_PROC_FS |
626 | proc_root_init(); | 621 | proc_root_init(); |
627 | #endif | 622 | #endif |
628 | cpuset_init(); | 623 | cpuset_init(); |
629 | taskstats_init_early(); | 624 | taskstats_init_early(); |
630 | delayacct_init(); | 625 | delayacct_init(); |
631 | 626 | ||
632 | check_bugs(); | 627 | check_bugs(); |
633 | 628 | ||
634 | acpi_early_init(); /* before LAPIC and SMP init */ | 629 | acpi_early_init(); /* before LAPIC and SMP init */ |
635 | 630 | ||
636 | /* Do the rest non-__init'ed, we're now alive */ | 631 | /* Do the rest non-__init'ed, we're now alive */ |
637 | rest_init(); | 632 | rest_init(); |
638 | } | 633 | } |
639 | 634 | ||
640 | static int __initdata initcall_debug; | 635 | static int __initdata initcall_debug; |
641 | 636 | ||
642 | static int __init initcall_debug_setup(char *str) | 637 | static int __init initcall_debug_setup(char *str) |
643 | { | 638 | { |
644 | initcall_debug = 1; | 639 | initcall_debug = 1; |
645 | return 1; | 640 | return 1; |
646 | } | 641 | } |
647 | __setup("initcall_debug", initcall_debug_setup); | 642 | __setup("initcall_debug", initcall_debug_setup); |
648 | 643 | ||
649 | extern initcall_t __initcall_start[], __initcall_end[]; | 644 | extern initcall_t __initcall_start[], __initcall_end[]; |
650 | 645 | ||
651 | static void __init do_initcalls(void) | 646 | static void __init do_initcalls(void) |
652 | { | 647 | { |
653 | initcall_t *call; | 648 | initcall_t *call; |
654 | int count = preempt_count(); | 649 | int count = preempt_count(); |
655 | 650 | ||
656 | for (call = __initcall_start; call < __initcall_end; call++) { | 651 | for (call = __initcall_start; call < __initcall_end; call++) { |
657 | char *msg = NULL; | 652 | char *msg = NULL; |
658 | char msgbuf[40]; | 653 | char msgbuf[40]; |
659 | int result; | 654 | int result; |
660 | 655 | ||
661 | if (initcall_debug) { | 656 | if (initcall_debug) { |
662 | printk("Calling initcall 0x%p", *call); | 657 | printk("Calling initcall 0x%p", *call); |
663 | print_fn_descriptor_symbol(": %s()", | 658 | print_fn_descriptor_symbol(": %s()", |
664 | (unsigned long) *call); | 659 | (unsigned long) *call); |
665 | printk("\n"); | 660 | printk("\n"); |
666 | } | 661 | } |
667 | 662 | ||
668 | result = (*call)(); | 663 | result = (*call)(); |
669 | 664 | ||
670 | if (result && result != -ENODEV && initcall_debug) { | 665 | if (result && result != -ENODEV && initcall_debug) { |
671 | sprintf(msgbuf, "error code %d", result); | 666 | sprintf(msgbuf, "error code %d", result); |
672 | msg = msgbuf; | 667 | msg = msgbuf; |
673 | } | 668 | } |
674 | if (preempt_count() != count) { | 669 | if (preempt_count() != count) { |
675 | msg = "preemption imbalance"; | 670 | msg = "preemption imbalance"; |
676 | preempt_count() = count; | 671 | preempt_count() = count; |
677 | } | 672 | } |
678 | if (irqs_disabled()) { | 673 | if (irqs_disabled()) { |
679 | msg = "disabled interrupts"; | 674 | msg = "disabled interrupts"; |
680 | local_irq_enable(); | 675 | local_irq_enable(); |
681 | } | 676 | } |
682 | if (msg) { | 677 | if (msg) { |
683 | printk(KERN_WARNING "initcall at 0x%p", *call); | 678 | printk(KERN_WARNING "initcall at 0x%p", *call); |
684 | print_fn_descriptor_symbol(": %s()", | 679 | print_fn_descriptor_symbol(": %s()", |
685 | (unsigned long) *call); | 680 | (unsigned long) *call); |
686 | printk(": returned with %s\n", msg); | 681 | printk(": returned with %s\n", msg); |
687 | } | 682 | } |
688 | } | 683 | } |
689 | 684 | ||
690 | /* Make sure there is no pending stuff from the initcall sequence */ | 685 | /* Make sure there is no pending stuff from the initcall sequence */ |
691 | flush_scheduled_work(); | 686 | flush_scheduled_work(); |
692 | } | 687 | } |
693 | 688 | ||
694 | /* | 689 | /* |
695 | * Ok, the machine is now initialized. None of the devices | 690 | * Ok, the machine is now initialized. None of the devices |
696 | * have been touched yet, but the CPU subsystem is up and | 691 | * have been touched yet, but the CPU subsystem is up and |
697 | * running, and memory and process management works. | 692 | * running, and memory and process management works. |
698 | * | 693 | * |
699 | * Now we can finally start doing some real work.. | 694 | * Now we can finally start doing some real work.. |
700 | */ | 695 | */ |
701 | static void __init do_basic_setup(void) | 696 | static void __init do_basic_setup(void) |
702 | { | 697 | { |
703 | /* drivers will send hotplug events */ | 698 | /* drivers will send hotplug events */ |
704 | init_workqueues(); | 699 | init_workqueues(); |
705 | usermodehelper_init(); | 700 | usermodehelper_init(); |
706 | driver_init(); | 701 | driver_init(); |
707 | init_irq_proc(); | 702 | init_irq_proc(); |
708 | do_initcalls(); | 703 | do_initcalls(); |
709 | } | 704 | } |
710 | 705 | ||
711 | static void __init do_pre_smp_initcalls(void) | 706 | static void __init do_pre_smp_initcalls(void) |
712 | { | 707 | { |
713 | extern int spawn_ksoftirqd(void); | 708 | extern int spawn_ksoftirqd(void); |
714 | #ifdef CONFIG_SMP | 709 | #ifdef CONFIG_SMP |
715 | extern int migration_init(void); | 710 | extern int migration_init(void); |
716 | 711 | ||
717 | migration_init(); | 712 | migration_init(); |
718 | #endif | 713 | #endif |
719 | spawn_ksoftirqd(); | 714 | spawn_ksoftirqd(); |
720 | spawn_softlockup_task(); | 715 | spawn_softlockup_task(); |
721 | } | 716 | } |
722 | 717 | ||
723 | static void run_init_process(char *init_filename) | 718 | static void run_init_process(char *init_filename) |
724 | { | 719 | { |
725 | argv_init[0] = init_filename; | 720 | argv_init[0] = init_filename; |
726 | kernel_execve(init_filename, argv_init, envp_init); | 721 | kernel_execve(init_filename, argv_init, envp_init); |
727 | } | 722 | } |
728 | 723 | ||
729 | /* This is a non __init function. Force it to be noinline otherwise gcc | 724 | /* This is a non __init function. Force it to be noinline otherwise gcc |
730 | * makes it inline to init() and it becomes part of init.text section | 725 | * makes it inline to init() and it becomes part of init.text section |
731 | */ | 726 | */ |
732 | static int noinline init_post(void) | 727 | static int noinline init_post(void) |
733 | { | 728 | { |
734 | free_initmem(); | 729 | free_initmem(); |
735 | unlock_kernel(); | 730 | unlock_kernel(); |
736 | mark_rodata_ro(); | 731 | mark_rodata_ro(); |
737 | system_state = SYSTEM_RUNNING; | 732 | system_state = SYSTEM_RUNNING; |
738 | numa_default_policy(); | 733 | numa_default_policy(); |
739 | 734 | ||
740 | if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) | 735 | if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) |
741 | printk(KERN_WARNING "Warning: unable to open an initial console.\n"); | 736 | printk(KERN_WARNING "Warning: unable to open an initial console.\n"); |
742 | 737 | ||
743 | (void) sys_dup(0); | 738 | (void) sys_dup(0); |
744 | (void) sys_dup(0); | 739 | (void) sys_dup(0); |
745 | 740 | ||
746 | if (ramdisk_execute_command) { | 741 | if (ramdisk_execute_command) { |
747 | run_init_process(ramdisk_execute_command); | 742 | run_init_process(ramdisk_execute_command); |
748 | printk(KERN_WARNING "Failed to execute %s\n", | 743 | printk(KERN_WARNING "Failed to execute %s\n", |
749 | ramdisk_execute_command); | 744 | ramdisk_execute_command); |
750 | } | 745 | } |
751 | 746 | ||
752 | /* | 747 | /* |
753 | * We try each of these until one succeeds. | 748 | * We try each of these until one succeeds. |
754 | * | 749 | * |
755 | * The Bourne shell can be used instead of init if we are | 750 | * The Bourne shell can be used instead of init if we are |
756 | * trying to recover a really broken machine. | 751 | * trying to recover a really broken machine. |
757 | */ | 752 | */ |
758 | if (execute_command) { | 753 | if (execute_command) { |
759 | run_init_process(execute_command); | 754 | run_init_process(execute_command); |
760 | printk(KERN_WARNING "Failed to execute %s. Attempting " | 755 | printk(KERN_WARNING "Failed to execute %s. Attempting " |
761 | "defaults...\n", execute_command); | 756 | "defaults...\n", execute_command); |
762 | } | 757 | } |
763 | run_init_process("/sbin/init"); | 758 | run_init_process("/sbin/init"); |
764 | run_init_process("/etc/init"); | 759 | run_init_process("/etc/init"); |
765 | run_init_process("/bin/init"); | 760 | run_init_process("/bin/init"); |
766 | run_init_process("/bin/sh"); | 761 | run_init_process("/bin/sh"); |
767 | 762 | ||
768 | panic("No init found. Try passing init= option to kernel."); | 763 | panic("No init found. Try passing init= option to kernel."); |
769 | } | 764 | } |
770 | 765 | ||
771 | static int __init kernel_init(void * unused) | 766 | static int __init kernel_init(void * unused) |
772 | { | 767 | { |
773 | lock_kernel(); | 768 | lock_kernel(); |
774 | /* | 769 | /* |
775 | * init can run on any cpu. | 770 | * init can run on any cpu. |
776 | */ | 771 | */ |
777 | set_cpus_allowed(current, CPU_MASK_ALL); | 772 | set_cpus_allowed(current, CPU_MASK_ALL); |
778 | /* | 773 | /* |
779 | * Tell the world that we're going to be the grim | 774 | * Tell the world that we're going to be the grim |
780 | * reaper of innocent orphaned children. | 775 | * reaper of innocent orphaned children. |
781 | * | 776 | * |
782 | * We don't want people to have to make incorrect | 777 | * We don't want people to have to make incorrect |
783 | * assumptions about where in the task array this | 778 | * assumptions about where in the task array this |
784 | * can be found. | 779 | * can be found. |
785 | */ | 780 | */ |
786 | init_pid_ns.child_reaper = current; | 781 | init_pid_ns.child_reaper = current; |
787 | 782 | ||
788 | cad_pid = task_pid(current); | 783 | cad_pid = task_pid(current); |
789 | 784 | ||
790 | smp_prepare_cpus(max_cpus); | 785 | smp_prepare_cpus(max_cpus); |
791 | 786 | ||
792 | do_pre_smp_initcalls(); | 787 | do_pre_smp_initcalls(); |
793 | 788 | ||
794 | smp_init(); | 789 | smp_init(); |
795 | sched_init_smp(); | 790 | sched_init_smp(); |
796 | 791 | ||
797 | cpuset_init_smp(); | 792 | cpuset_init_smp(); |
798 | 793 | ||
799 | do_basic_setup(); | 794 | do_basic_setup(); |
800 | 795 | ||
801 | /* | 796 | /* |
802 | * check if there is an early userspace init. If yes, let it do all | 797 | * check if there is an early userspace init. If yes, let it do all |
803 | * the work | 798 | * the work |
804 | */ | 799 | */ |
805 | 800 | ||
806 | if (!ramdisk_execute_command) | 801 | if (!ramdisk_execute_command) |
807 | ramdisk_execute_command = "/init"; | 802 | ramdisk_execute_command = "/init"; |
808 | 803 | ||
809 | if (sys_access((const char __user *) ramdisk_execute_command, 0) != 0) { | 804 | if (sys_access((const char __user *) ramdisk_execute_command, 0) != 0) { |
810 | ramdisk_execute_command = NULL; | 805 | ramdisk_execute_command = NULL; |
811 | prepare_namespace(); | 806 | prepare_namespace(); |
812 | } | 807 | } |
813 | 808 | ||
814 | /* | 809 | /* |
815 | * Ok, we have completed the initial bootup, and | 810 | * Ok, we have completed the initial bootup, and |
816 | * we're essentially up and running. Get rid of the | 811 | * we're essentially up and running. Get rid of the |
817 | * initmem segments and start the user-mode stuff.. | 812 | * initmem segments and start the user-mode stuff.. |
818 | */ | 813 | */ |
819 | init_post(); | 814 | init_post(); |
820 | return 0; | 815 | return 0; |
821 | } | 816 | } |
822 | 817 |
kernel/sched.c
1 | /* | 1 | /* |
2 | * kernel/sched.c | 2 | * kernel/sched.c |
3 | * | 3 | * |
4 | * Kernel scheduler and related syscalls | 4 | * Kernel scheduler and related syscalls |
5 | * | 5 | * |
6 | * Copyright (C) 1991-2002 Linus Torvalds | 6 | * Copyright (C) 1991-2002 Linus Torvalds |
7 | * | 7 | * |
8 | * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and | 8 | * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and |
9 | * make semaphores SMP safe | 9 | * make semaphores SMP safe |
10 | * 1998-11-19 Implemented schedule_timeout() and related stuff | 10 | * 1998-11-19 Implemented schedule_timeout() and related stuff |
11 | * by Andrea Arcangeli | 11 | * by Andrea Arcangeli |
12 | * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: | 12 | * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: |
13 | * hybrid priority-list and round-robin design with | 13 | * hybrid priority-list and round-robin design with |
14 | * an array-switch method of distributing timeslices | 14 | * an array-switch method of distributing timeslices |
15 | * and per-CPU runqueues. Cleanups and useful suggestions | 15 | * and per-CPU runqueues. Cleanups and useful suggestions |
16 | * by Davide Libenzi, preemptible kernel bits by Robert Love. | 16 | * by Davide Libenzi, preemptible kernel bits by Robert Love. |
17 | * 2003-09-03 Interactivity tuning by Con Kolivas. | 17 | * 2003-09-03 Interactivity tuning by Con Kolivas. |
18 | * 2004-04-02 Scheduler domains code by Nick Piggin | 18 | * 2004-04-02 Scheduler domains code by Nick Piggin |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
22 | #include <linux/module.h> | 22 | #include <linux/module.h> |
23 | #include <linux/nmi.h> | 23 | #include <linux/nmi.h> |
24 | #include <linux/init.h> | 24 | #include <linux/init.h> |
25 | #include <asm/uaccess.h> | 25 | #include <asm/uaccess.h> |
26 | #include <linux/highmem.h> | 26 | #include <linux/highmem.h> |
27 | #include <linux/smp_lock.h> | 27 | #include <linux/smp_lock.h> |
28 | #include <asm/mmu_context.h> | 28 | #include <asm/mmu_context.h> |
29 | #include <linux/interrupt.h> | 29 | #include <linux/interrupt.h> |
30 | #include <linux/capability.h> | 30 | #include <linux/capability.h> |
31 | #include <linux/completion.h> | 31 | #include <linux/completion.h> |
32 | #include <linux/kernel_stat.h> | 32 | #include <linux/kernel_stat.h> |
33 | #include <linux/debug_locks.h> | 33 | #include <linux/debug_locks.h> |
34 | #include <linux/security.h> | 34 | #include <linux/security.h> |
35 | #include <linux/notifier.h> | 35 | #include <linux/notifier.h> |
36 | #include <linux/profile.h> | 36 | #include <linux/profile.h> |
37 | #include <linux/freezer.h> | 37 | #include <linux/freezer.h> |
38 | #include <linux/vmalloc.h> | 38 | #include <linux/vmalloc.h> |
39 | #include <linux/blkdev.h> | 39 | #include <linux/blkdev.h> |
40 | #include <linux/delay.h> | 40 | #include <linux/delay.h> |
41 | #include <linux/smp.h> | 41 | #include <linux/smp.h> |
42 | #include <linux/threads.h> | 42 | #include <linux/threads.h> |
43 | #include <linux/timer.h> | 43 | #include <linux/timer.h> |
44 | #include <linux/rcupdate.h> | 44 | #include <linux/rcupdate.h> |
45 | #include <linux/cpu.h> | 45 | #include <linux/cpu.h> |
46 | #include <linux/cpuset.h> | 46 | #include <linux/cpuset.h> |
47 | #include <linux/percpu.h> | 47 | #include <linux/percpu.h> |
48 | #include <linux/kthread.h> | 48 | #include <linux/kthread.h> |
49 | #include <linux/seq_file.h> | 49 | #include <linux/seq_file.h> |
50 | #include <linux/syscalls.h> | 50 | #include <linux/syscalls.h> |
51 | #include <linux/times.h> | 51 | #include <linux/times.h> |
52 | #include <linux/tsacct_kern.h> | 52 | #include <linux/tsacct_kern.h> |
53 | #include <linux/kprobes.h> | 53 | #include <linux/kprobes.h> |
54 | #include <linux/delayacct.h> | 54 | #include <linux/delayacct.h> |
55 | #include <asm/tlb.h> | 55 | #include <asm/tlb.h> |
56 | 56 | ||
57 | #include <asm/unistd.h> | 57 | #include <asm/unistd.h> |
58 | 58 | ||
59 | /* | 59 | /* |
60 | * Scheduler clock - returns current time in nanosec units. | 60 | * Scheduler clock - returns current time in nanosec units. |
61 | * This is default implementation. | 61 | * This is default implementation. |
62 | * Architectures and sub-architectures can override this. | 62 | * Architectures and sub-architectures can override this. |
63 | */ | 63 | */ |
64 | unsigned long long __attribute__((weak)) sched_clock(void) | 64 | unsigned long long __attribute__((weak)) sched_clock(void) |
65 | { | 65 | { |
66 | return (unsigned long long)jiffies * (1000000000 / HZ); | 66 | return (unsigned long long)jiffies * (1000000000 / HZ); |
67 | } | 67 | } |
68 | 68 | ||
69 | /* | 69 | /* |
70 | * Convert user-nice values [ -20 ... 0 ... 19 ] | 70 | * Convert user-nice values [ -20 ... 0 ... 19 ] |
71 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], | 71 | * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], |
72 | * and back. | 72 | * and back. |
73 | */ | 73 | */ |
74 | #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) | 74 | #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) |
75 | #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) | 75 | #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) |
76 | #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) | 76 | #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) |
77 | 77 | ||
78 | /* | 78 | /* |
79 | * 'User priority' is the nice value converted to something we | 79 | * 'User priority' is the nice value converted to something we |
80 | * can work with better when scaling various scheduler parameters, | 80 | * can work with better when scaling various scheduler parameters, |
81 | * it's a [ 0 ... 39 ] range. | 81 | * it's a [ 0 ... 39 ] range. |
82 | */ | 82 | */ |
83 | #define USER_PRIO(p) ((p)-MAX_RT_PRIO) | 83 | #define USER_PRIO(p) ((p)-MAX_RT_PRIO) |
84 | #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) | 84 | #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) |
85 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | 85 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) |
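Working these macros through with the usual values MAX_RT_PRIO == 100 and MAX_PRIO == 140 (defined outside this hunk) gives:

    NICE_TO_PRIO(-20) == 100    PRIO_TO_NICE(100) == -20    USER_PRIO(100) ==  0
    NICE_TO_PRIO(  0) == 120    PRIO_TO_NICE(120) ==   0    USER_PRIO(120) == 20
    NICE_TO_PRIO( 19) == 139    PRIO_TO_NICE(139) ==  19    USER_PRIO(139) == 39

so MAX_USER_PRIO == USER_PRIO(140) == 40, exactly the size of the nice range.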
86 | 86 | ||
87 | /* | 87 | /* |
88 | * Some helpers for converting nanosecond timing to jiffy resolution | 88 | * Some helpers for converting nanosecond timing to jiffy resolution |
89 | */ | 89 | */ |
90 | #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) | 90 | #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) |
91 | #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) | 91 | #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) |
92 | 92 | ||
93 | /* | 93 | /* |
94 | * These are the 'tuning knobs' of the scheduler: | 94 | * These are the 'tuning knobs' of the scheduler: |
95 | * | 95 | * |
96 | * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), | 96 | * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), |
97 | * default timeslice is 100 msecs, maximum timeslice is 800 msecs. | 97 | * default timeslice is 100 msecs, maximum timeslice is 800 msecs. |
98 | * Timeslices get refilled after they expire. | 98 | * Timeslices get refilled after they expire. |
99 | */ | 99 | */ |
100 | #define MIN_TIMESLICE max(5 * HZ / 1000, 1) | 100 | #define MIN_TIMESLICE max(5 * HZ / 1000, 1) |
101 | #define DEF_TIMESLICE (100 * HZ / 1000) | 101 | #define DEF_TIMESLICE (100 * HZ / 1000) |
102 | #define ON_RUNQUEUE_WEIGHT 30 | 102 | #define ON_RUNQUEUE_WEIGHT 30 |
103 | #define CHILD_PENALTY 95 | 103 | #define CHILD_PENALTY 95 |
104 | #define PARENT_PENALTY 100 | 104 | #define PARENT_PENALTY 100 |
105 | #define EXIT_WEIGHT 3 | 105 | #define EXIT_WEIGHT 3 |
106 | #define PRIO_BONUS_RATIO 25 | 106 | #define PRIO_BONUS_RATIO 25 |
107 | #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) | 107 | #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) |
108 | #define INTERACTIVE_DELTA 2 | 108 | #define INTERACTIVE_DELTA 2 |
109 | #define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) | 109 | #define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) |
110 | #define STARVATION_LIMIT (MAX_SLEEP_AVG) | 110 | #define STARVATION_LIMIT (MAX_SLEEP_AVG) |
111 | #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) | 111 | #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) |
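Plugging in HZ == 1000 (a common default, assumed here) and MAX_USER_PRIO == 40 from above, the knobs work out to:

    MIN_TIMESLICE    == max(5, 1)            == 5 jiffies   (5 ms)
    DEF_TIMESLICE    == 100 jiffies                          (100 ms)
    MAX_BONUS        == 40 * 25 / 100        == 10
    MAX_SLEEP_AVG    == 100 * 10             == 1000 jiffies (1 s)
    NS_MAX_SLEEP_AVG == JIFFIES_TO_NS(1000)  == 1,000,000,000 ns

i.e. the sleep-average bookkeeping saturates at one second of accumulated sleep.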
112 | 112 | ||
113 | /* | 113 | /* |
114 | * If a task is 'interactive' then we reinsert it in the active | 114 | * If a task is 'interactive' then we reinsert it in the active |
115 | * array after it has expired its current timeslice. (it will not | 115 | * array after it has expired its current timeslice. (it will not |
116 | * continue to run immediately, it will still round-robin with | 116 | * continue to run immediately, it will still round-robin with |
117 | * other interactive tasks.) | 117 | * other interactive tasks.) |
118 | * | 118 | * |
119 | * This part scales the interactivity limit depending on niceness. | 119 | * This part scales the interactivity limit depending on niceness. |
120 | * | 120 | * |
121 | * We scale it linearly, offset by the INTERACTIVE_DELTA delta. | 121 | * We scale it linearly, offset by the INTERACTIVE_DELTA delta. |
122 | * Here are a few examples of different nice levels: | 122 | * Here are a few examples of different nice levels: |
123 | * | 123 | * |
124 | * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] | 124 | * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] |
125 | * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] | 125 | * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] |
126 | * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] | 126 | * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] |
127 | * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] | 127 | * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] |
128 | * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] | 128 | * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] |
129 | * | 129 | * |
130 | * (the X axis represents the possible -5 ... 0 ... +5 dynamic | 130 | * (the X axis represents the possible -5 ... 0 ... +5 dynamic |
131 | * priority range a task can explore, a value of '1' means the | 131 | * priority range a task can explore, a value of '1' means the |
132 | * task is rated interactive.) | 132 | * task is rated interactive.) |
133 | * | 133 | * |
134 | * I.e. nice +19 tasks can never get 'interactive' enough to be | 134 | * I.e. nice +19 tasks can never get 'interactive' enough to be |
135 | * reinserted into the active array, and only heavy CPU-hog nice -20 | 135 | * reinserted into the active array, and only heavy CPU-hog nice -20 |
136 | * tasks will be expired. Default nice 0 tasks are somewhere in between: | 136 | * tasks will be expired. Default nice 0 tasks are somewhere in between: |
137 | * it takes some effort for them to get interactive, but it's not | 137 | * it takes some effort for them to get interactive, but it's not |
138 | * too hard. | 138 | * too hard. |
139 | */ | 139 | */ |
140 | 140 | ||
141 | #define CURRENT_BONUS(p) \ | 141 | #define CURRENT_BONUS(p) \ |
142 | (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ | 142 | (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ |
143 | MAX_SLEEP_AVG) | 143 | MAX_SLEEP_AVG) |
144 | 144 | ||
145 | #define GRANULARITY (10 * HZ / 1000 ? : 1) | 145 | #define GRANULARITY (10 * HZ / 1000 ? : 1) |
146 | 146 | ||
147 | #ifdef CONFIG_SMP | 147 | #ifdef CONFIG_SMP |
148 | #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ | 148 | #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ |
149 | (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ | 149 | (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ |
150 | num_online_cpus()) | 150 | num_online_cpus()) |
151 | #else | 151 | #else |
152 | #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ | 152 | #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ |
153 | (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) | 153 | (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) |
154 | #endif | 154 | #endif |
155 | 155 | ||
156 | #define SCALE(v1,v1_max,v2_max) \ | 156 | #define SCALE(v1,v1_max,v2_max) \ |
157 | (v1) * (v2_max) / (v1_max) | 157 | (v1) * (v2_max) / (v1_max) |
158 | 158 | ||
159 | #define DELTA(p) \ | 159 | #define DELTA(p) \ |
160 | (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ | 160 | (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ |
161 | INTERACTIVE_DELTA) | 161 | INTERACTIVE_DELTA) |
162 | 162 | ||
163 | #define TASK_INTERACTIVE(p) \ | 163 | #define TASK_INTERACTIVE(p) \ |
164 | ((p)->prio <= (p)->static_prio - DELTA(p)) | 164 | ((p)->prio <= (p)->static_prio - DELTA(p)) |
165 | 165 | ||
166 | #define INTERACTIVE_SLEEP(p) \ | 166 | #define INTERACTIVE_SLEEP(p) \ |
167 | (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ | 167 | (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ |
168 | (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) | 168 | (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) |
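Working the interactivity macros for two nice levels (HZ == 1000 and MAX_BONUS == 10 as above, integer division throughout):

    nice   0: DELTA == SCALE(20, 40, 10) - 20*10/40 + 2 == 5 - 5 + 2 == 2
              -> interactive once p->prio <= static_prio - 2, and
                 INTERACTIVE_SLEEP == JIFFIES_TO_NS(1000*(5+2+1)/10 - 1) == JIFFIES_TO_NS(799) (~799 ms)
    nice +19: DELTA == SCALE(39, 40, 10) - 5 + 2 == 9 - 5 + 2 == 6
              -> can never qualify, since the dynamic bonus range is only -5..+5

which matches the 0/1 table in the comment above.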
169 | 169 | ||
170 | #define TASK_PREEMPTS_CURR(p, rq) \ | 170 | #define TASK_PREEMPTS_CURR(p, rq) \ |
171 | ((p)->prio < (rq)->curr->prio) | 171 | ((p)->prio < (rq)->curr->prio) |
172 | 172 | ||
173 | #define SCALE_PRIO(x, prio) \ | 173 | #define SCALE_PRIO(x, prio) \ |
174 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) | 174 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) |
175 | 175 | ||
176 | static unsigned int static_prio_timeslice(int static_prio) | 176 | static unsigned int static_prio_timeslice(int static_prio) |
177 | { | 177 | { |
178 | if (static_prio < NICE_TO_PRIO(0)) | 178 | if (static_prio < NICE_TO_PRIO(0)) |
179 | return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); | 179 | return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); |
180 | else | 180 | else |
181 | return SCALE_PRIO(DEF_TIMESLICE, static_prio); | 181 | return SCALE_PRIO(DEF_TIMESLICE, static_prio); |
182 | } | 182 | } |
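With the HZ == 1000 numbers, SCALE_PRIO() reproduces the [800ms ... 100ms ... 5ms] range quoted in the comment that follows:

    nice -20 (static_prio 100): SCALE_PRIO(400, 100) == max(400 * 40 / 20, 5) == 800 jiffies
    nice   0 (static_prio 120): SCALE_PRIO(100, 120) == max(100 * 20 / 20, 5) == 100 jiffies
    nice +19 (static_prio 139): SCALE_PRIO(100, 139) == max(100 *  1 / 20, 5) ==   5 jiffies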
183 | 183 | ||
184 | /* | 184 | /* |
185 | * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] | 185 | * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] |
186 | * to time slice values: [800ms ... 100ms ... 5ms] | 186 | * to time slice values: [800ms ... 100ms ... 5ms] |
187 | * | 187 | * |
188 | * The higher a thread's priority, the bigger timeslices | 188 | * The higher a thread's priority, the bigger timeslices |
189 | * it gets during one round of execution. But even the lowest | 189 | * it gets during one round of execution. But even the lowest |
190 | * priority thread gets MIN_TIMESLICE worth of execution time. | 190 | * priority thread gets MIN_TIMESLICE worth of execution time. |
191 | */ | 191 | */ |
192 | 192 | ||
193 | static inline unsigned int task_timeslice(struct task_struct *p) | 193 | static inline unsigned int task_timeslice(struct task_struct *p) |
194 | { | 194 | { |
195 | return static_prio_timeslice(p->static_prio); | 195 | return static_prio_timeslice(p->static_prio); |
196 | } | 196 | } |
197 | 197 | ||
198 | /* | 198 | /* |
199 | * These are the runqueue data structures: | 199 | * These are the runqueue data structures: |
200 | */ | 200 | */ |
201 | 201 | ||
202 | struct prio_array { | 202 | struct prio_array { |
203 | unsigned int nr_active; | 203 | unsigned int nr_active; |
204 | DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ | 204 | DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ |
205 | struct list_head queue[MAX_PRIO]; | 205 | struct list_head queue[MAX_PRIO]; |
206 | }; | 206 | }; |
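The bitmap plus the per-priority lists make picking the best runnable task O(1). A minimal sketch of how such a lookup works (illustrative only; sketch_pick_next() is a hypothetical helper, the real lookup is done inside schedule()):

    /* Sketch: find the best runnable task in a non-empty prio_array. */
    static struct task_struct *sketch_pick_next(struct prio_array *array)
    {
            int idx = sched_find_first_bit(array->bitmap);  /* lowest set bit == best prio */
            struct list_head *queue = array->queue + idx;

            return list_entry(queue->next, struct task_struct, run_list);
    }

The extra delimiter bit in the bitmap acts as a sentinel, so the bitmap search always has a guaranteed hit even when no task is queued.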
207 | 207 | ||
208 | /* | 208 | /* |
209 | * This is the main, per-CPU runqueue data structure. | 209 | * This is the main, per-CPU runqueue data structure. |
210 | * | 210 | * |
211 | * Locking rule: in places that need to lock multiple runqueues | 211 | * Locking rule: in places that need to lock multiple runqueues |
212 | * (such as the load-balancing or the thread-migration code), lock | 212 | * (such as the load-balancing or the thread-migration code), lock |
213 | * acquisitions must be ordered by ascending runqueue address. | 213 | * acquisitions must be ordered by ascending runqueue address. |
214 | */ | 214 | */ |
215 | struct rq { | 215 | struct rq { |
216 | spinlock_t lock; | 216 | spinlock_t lock; |
217 | 217 | ||
218 | /* | 218 | /* |
219 | * nr_running and cpu_load should be in the same cacheline because | 219 | * nr_running and cpu_load should be in the same cacheline because |
220 | * remote CPUs use both these fields when doing load calculation. | 220 | * remote CPUs use both these fields when doing load calculation. |
221 | */ | 221 | */ |
222 | unsigned long nr_running; | 222 | unsigned long nr_running; |
223 | unsigned long raw_weighted_load; | 223 | unsigned long raw_weighted_load; |
224 | #ifdef CONFIG_SMP | 224 | #ifdef CONFIG_SMP |
225 | unsigned long cpu_load[3]; | 225 | unsigned long cpu_load[3]; |
226 | #endif | 226 | #endif |
227 | unsigned long long nr_switches; | 227 | unsigned long long nr_switches; |
228 | 228 | ||
229 | /* | 229 | /* |
230 | * This is part of a global counter where only the total sum | 230 | * This is part of a global counter where only the total sum |
231 | * over all CPUs matters. A task can increase this counter on | 231 | * over all CPUs matters. A task can increase this counter on |
232 | * one CPU and if it got migrated afterwards it may decrease | 232 | * one CPU and if it got migrated afterwards it may decrease |
233 | * it on another CPU. Always updated under the runqueue lock: | 233 | * it on another CPU. Always updated under the runqueue lock: |
234 | */ | 234 | */ |
235 | unsigned long nr_uninterruptible; | 235 | unsigned long nr_uninterruptible; |
236 | 236 | ||
237 | unsigned long expired_timestamp; | 237 | unsigned long expired_timestamp; |
238 | /* Cached timestamp set by update_cpu_clock() */ | 238 | /* Cached timestamp set by update_cpu_clock() */ |
239 | unsigned long long most_recent_timestamp; | 239 | unsigned long long most_recent_timestamp; |
240 | struct task_struct *curr, *idle; | 240 | struct task_struct *curr, *idle; |
241 | unsigned long next_balance; | 241 | unsigned long next_balance; |
242 | struct mm_struct *prev_mm; | 242 | struct mm_struct *prev_mm; |
243 | struct prio_array *active, *expired, arrays[2]; | 243 | struct prio_array *active, *expired, arrays[2]; |
244 | int best_expired_prio; | 244 | int best_expired_prio; |
245 | atomic_t nr_iowait; | 245 | atomic_t nr_iowait; |
246 | 246 | ||
247 | #ifdef CONFIG_SMP | 247 | #ifdef CONFIG_SMP |
248 | struct sched_domain *sd; | 248 | struct sched_domain *sd; |
249 | 249 | ||
250 | /* For active balancing */ | 250 | /* For active balancing */ |
251 | int active_balance; | 251 | int active_balance; |
252 | int push_cpu; | 252 | int push_cpu; |
253 | int cpu; /* cpu of this runqueue */ | 253 | int cpu; /* cpu of this runqueue */ |
254 | 254 | ||
255 | struct task_struct *migration_thread; | 255 | struct task_struct *migration_thread; |
256 | struct list_head migration_queue; | 256 | struct list_head migration_queue; |
257 | #endif | 257 | #endif |
258 | 258 | ||
259 | #ifdef CONFIG_SCHEDSTATS | 259 | #ifdef CONFIG_SCHEDSTATS |
260 | /* latency stats */ | 260 | /* latency stats */ |
261 | struct sched_info rq_sched_info; | 261 | struct sched_info rq_sched_info; |
262 | 262 | ||
263 | /* sys_sched_yield() stats */ | 263 | /* sys_sched_yield() stats */ |
264 | unsigned long yld_exp_empty; | 264 | unsigned long yld_exp_empty; |
265 | unsigned long yld_act_empty; | 265 | unsigned long yld_act_empty; |
266 | unsigned long yld_both_empty; | 266 | unsigned long yld_both_empty; |
267 | unsigned long yld_cnt; | 267 | unsigned long yld_cnt; |
268 | 268 | ||
269 | /* schedule() stats */ | 269 | /* schedule() stats */ |
270 | unsigned long sched_switch; | 270 | unsigned long sched_switch; |
271 | unsigned long sched_cnt; | 271 | unsigned long sched_cnt; |
272 | unsigned long sched_goidle; | 272 | unsigned long sched_goidle; |
273 | 273 | ||
274 | /* try_to_wake_up() stats */ | 274 | /* try_to_wake_up() stats */ |
275 | unsigned long ttwu_cnt; | 275 | unsigned long ttwu_cnt; |
276 | unsigned long ttwu_local; | 276 | unsigned long ttwu_local; |
277 | #endif | 277 | #endif |
278 | struct lock_class_key rq_lock_key; | 278 | struct lock_class_key rq_lock_key; |
279 | }; | 279 | }; |
280 | 280 | ||
281 | static DEFINE_PER_CPU(struct rq, runqueues); | 281 | static DEFINE_PER_CPU(struct rq, runqueues); |
282 | 282 | ||
283 | static inline int cpu_of(struct rq *rq) | 283 | static inline int cpu_of(struct rq *rq) |
284 | { | 284 | { |
285 | #ifdef CONFIG_SMP | 285 | #ifdef CONFIG_SMP |
286 | return rq->cpu; | 286 | return rq->cpu; |
287 | #else | 287 | #else |
288 | return 0; | 288 | return 0; |
289 | #endif | 289 | #endif |
290 | } | 290 | } |
291 | 291 | ||
292 | /* | 292 | /* |
293 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. | 293 | * The domain tree (rq->sd) is protected by RCU's quiescent state transition. |
294 | * See detach_destroy_domains: synchronize_sched for details. | 294 | * See detach_destroy_domains: synchronize_sched for details. |
295 | * | 295 | * |
296 | * The domain tree of any CPU may only be accessed from within | 296 | * The domain tree of any CPU may only be accessed from within |
297 | * preempt-disabled sections. | 297 | * preempt-disabled sections. |
298 | */ | 298 | */ |
299 | #define for_each_domain(cpu, __sd) \ | 299 | #define for_each_domain(cpu, __sd) \ |
300 | for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) | 300 | for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) |
301 | 301 | ||
302 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) | 302 | #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) |
303 | #define this_rq() (&__get_cpu_var(runqueues)) | 303 | #define this_rq() (&__get_cpu_var(runqueues)) |
304 | #define task_rq(p) cpu_rq(task_cpu(p)) | 304 | #define task_rq(p) cpu_rq(task_cpu(p)) |
305 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 305 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
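A hedged usage sketch of for_each_domain() (sketch_domain_depth() is a hypothetical helper, not part of this file), showing the preempt-disabled walk the comment above demands:

    /* Count how many scheduling domains stack above a CPU. */
    static int sketch_domain_depth(int cpu)
    {
            struct sched_domain *sd;
            int depth = 0;

            preempt_disable();              /* protects rq->sd per the rule above */
            for_each_domain(cpu, sd)
                    depth++;
            preempt_enable();

            return depth;
    }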
306 | 306 | ||
307 | #ifndef prepare_arch_switch | 307 | #ifndef prepare_arch_switch |
308 | # define prepare_arch_switch(next) do { } while (0) | 308 | # define prepare_arch_switch(next) do { } while (0) |
309 | #endif | 309 | #endif |
310 | #ifndef finish_arch_switch | 310 | #ifndef finish_arch_switch |
311 | # define finish_arch_switch(prev) do { } while (0) | 311 | # define finish_arch_switch(prev) do { } while (0) |
312 | #endif | 312 | #endif |
313 | 313 | ||
314 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | 314 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW |
315 | static inline int task_running(struct rq *rq, struct task_struct *p) | 315 | static inline int task_running(struct rq *rq, struct task_struct *p) |
316 | { | 316 | { |
317 | return rq->curr == p; | 317 | return rq->curr == p; |
318 | } | 318 | } |
319 | 319 | ||
320 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 320 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
321 | { | 321 | { |
322 | } | 322 | } |
323 | 323 | ||
324 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | 324 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
325 | { | 325 | { |
326 | #ifdef CONFIG_DEBUG_SPINLOCK | 326 | #ifdef CONFIG_DEBUG_SPINLOCK |
327 | /* this is a valid case when another task releases the spinlock */ | 327 | /* this is a valid case when another task releases the spinlock */ |
328 | rq->lock.owner = current; | 328 | rq->lock.owner = current; |
329 | #endif | 329 | #endif |
330 | /* | 330 | /* |
331 | * If we are tracking spinlock dependencies then we have to | 331 | * If we are tracking spinlock dependencies then we have to |
332 | * fix up the runqueue lock - which gets 'carried over' from | 332 | * fix up the runqueue lock - which gets 'carried over' from |
333 | * prev into current: | 333 | * prev into current: |
334 | */ | 334 | */ |
335 | spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); | 335 | spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); |
336 | 336 | ||
337 | spin_unlock_irq(&rq->lock); | 337 | spin_unlock_irq(&rq->lock); |
338 | } | 338 | } |
339 | 339 | ||
340 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ | 340 | #else /* __ARCH_WANT_UNLOCKED_CTXSW */ |
341 | static inline int task_running(struct rq *rq, struct task_struct *p) | 341 | static inline int task_running(struct rq *rq, struct task_struct *p) |
342 | { | 342 | { |
343 | #ifdef CONFIG_SMP | 343 | #ifdef CONFIG_SMP |
344 | return p->oncpu; | 344 | return p->oncpu; |
345 | #else | 345 | #else |
346 | return rq->curr == p; | 346 | return rq->curr == p; |
347 | #endif | 347 | #endif |
348 | } | 348 | } |
349 | 349 | ||
350 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 350 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
351 | { | 351 | { |
352 | #ifdef CONFIG_SMP | 352 | #ifdef CONFIG_SMP |
353 | /* | 353 | /* |
354 | * We can optimise this out completely for !SMP, because the | 354 | * We can optimise this out completely for !SMP, because the |
355 | * SMP rebalancing from interrupt is the only thing that cares | 355 | * SMP rebalancing from interrupt is the only thing that cares |
356 | * here. | 356 | * here. |
357 | */ | 357 | */ |
358 | next->oncpu = 1; | 358 | next->oncpu = 1; |
359 | #endif | 359 | #endif |
360 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 360 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
361 | spin_unlock_irq(&rq->lock); | 361 | spin_unlock_irq(&rq->lock); |
362 | #else | 362 | #else |
363 | spin_unlock(&rq->lock); | 363 | spin_unlock(&rq->lock); |
364 | #endif | 364 | #endif |
365 | } | 365 | } |
366 | 366 | ||
367 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | 367 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
368 | { | 368 | { |
369 | #ifdef CONFIG_SMP | 369 | #ifdef CONFIG_SMP |
370 | /* | 370 | /* |
371 | * After ->oncpu is cleared, the task can be moved to a different CPU. | 371 | * After ->oncpu is cleared, the task can be moved to a different CPU. |
372 | * We must ensure this doesn't happen until the switch is completely | 372 | * We must ensure this doesn't happen until the switch is completely |
373 | * finished. | 373 | * finished. |
374 | */ | 374 | */ |
375 | smp_wmb(); | 375 | smp_wmb(); |
376 | prev->oncpu = 0; | 376 | prev->oncpu = 0; |
377 | #endif | 377 | #endif |
378 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | 378 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW |
379 | local_irq_enable(); | 379 | local_irq_enable(); |
380 | #endif | 380 | #endif |
381 | } | 381 | } |
382 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 382 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
383 | 383 | ||
384 | /* | 384 | /* |
385 | * __task_rq_lock - lock the runqueue a given task resides on. | 385 | * __task_rq_lock - lock the runqueue a given task resides on. |
386 | * Must be called with interrupts disabled. | 386 | * Must be called with interrupts disabled. |
387 | */ | 387 | */ |
388 | static inline struct rq *__task_rq_lock(struct task_struct *p) | 388 | static inline struct rq *__task_rq_lock(struct task_struct *p) |
389 | __acquires(rq->lock) | 389 | __acquires(rq->lock) |
390 | { | 390 | { |
391 | struct rq *rq; | 391 | struct rq *rq; |
392 | 392 | ||
393 | repeat_lock_task: | 393 | repeat_lock_task: |
394 | rq = task_rq(p); | 394 | rq = task_rq(p); |
395 | spin_lock(&rq->lock); | 395 | spin_lock(&rq->lock); |
396 | if (unlikely(rq != task_rq(p))) { | 396 | if (unlikely(rq != task_rq(p))) { |
397 | spin_unlock(&rq->lock); | 397 | spin_unlock(&rq->lock); |
398 | goto repeat_lock_task; | 398 | goto repeat_lock_task; |
399 | } | 399 | } |
400 | return rq; | 400 | return rq; |
401 | } | 401 | } |
402 | 402 | ||
403 | /* | 403 | /* |
404 | * task_rq_lock - lock the runqueue a given task resides on and disable | 404 | * task_rq_lock - lock the runqueue a given task resides on and disable |
405 | * interrupts. Note the ordering: we can safely lookup the task_rq without | 405 | * interrupts. Note the ordering: we can safely lookup the task_rq without |
406 | * explicitly disabling preemption. | 406 | * explicitly disabling preemption. |
407 | */ | 407 | */ |
408 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | 408 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) |
409 | __acquires(rq->lock) | 409 | __acquires(rq->lock) |
410 | { | 410 | { |
411 | struct rq *rq; | 411 | struct rq *rq; |
412 | 412 | ||
413 | repeat_lock_task: | 413 | repeat_lock_task: |
414 | local_irq_save(*flags); | 414 | local_irq_save(*flags); |
415 | rq = task_rq(p); | 415 | rq = task_rq(p); |
416 | spin_lock(&rq->lock); | 416 | spin_lock(&rq->lock); |
417 | if (unlikely(rq != task_rq(p))) { | 417 | if (unlikely(rq != task_rq(p))) { |
418 | spin_unlock_irqrestore(&rq->lock, *flags); | 418 | spin_unlock_irqrestore(&rq->lock, *flags); |
419 | goto repeat_lock_task; | 419 | goto repeat_lock_task; |
420 | } | 420 | } |
421 | return rq; | 421 | return rq; |
422 | } | 422 | } |
423 | 423 | ||
424 | static inline void __task_rq_unlock(struct rq *rq) | 424 | static inline void __task_rq_unlock(struct rq *rq) |
425 | __releases(rq->lock) | 425 | __releases(rq->lock) |
426 | { | 426 | { |
427 | spin_unlock(&rq->lock); | 427 | spin_unlock(&rq->lock); |
428 | } | 428 | } |
429 | 429 | ||
430 | static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | 430 | static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) |
431 | __releases(rq->lock) | 431 | __releases(rq->lock) |
432 | { | 432 | { |
433 | spin_unlock_irqrestore(&rq->lock, *flags); | 433 | spin_unlock_irqrestore(&rq->lock, *flags); |
434 | } | 434 | } |
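A minimal caller sketch for the lock/retry pair above (sketch_task_rq_nr_running() is hypothetical and purely illustrative): take the task's runqueue lock, read a field, release it.

    static unsigned long sketch_task_rq_nr_running(struct task_struct *p)
    {
            unsigned long flags, nr;
            struct rq *rq;

            rq = task_rq_lock(p, &flags);   /* re-checks task_rq(p) under the lock */
            nr = rq->nr_running;
            task_rq_unlock(rq, &flags);

            return nr;
    }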
435 | 435 | ||
436 | #ifdef CONFIG_SCHEDSTATS | 436 | #ifdef CONFIG_SCHEDSTATS |
437 | /* | 437 | /* |
438 | * bump this up when changing the output format or the meaning of an existing | 438 | * bump this up when changing the output format or the meaning of an existing |
439 | * format, so that tools can adapt (or abort) | 439 | * format, so that tools can adapt (or abort) |
440 | */ | 440 | */ |
441 | #define SCHEDSTAT_VERSION 14 | 441 | #define SCHEDSTAT_VERSION 14 |
442 | 442 | ||
443 | static int show_schedstat(struct seq_file *seq, void *v) | 443 | static int show_schedstat(struct seq_file *seq, void *v) |
444 | { | 444 | { |
445 | int cpu; | 445 | int cpu; |
446 | 446 | ||
447 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); | 447 | seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); |
448 | seq_printf(seq, "timestamp %lu\n", jiffies); | 448 | seq_printf(seq, "timestamp %lu\n", jiffies); |
449 | for_each_online_cpu(cpu) { | 449 | for_each_online_cpu(cpu) { |
450 | struct rq *rq = cpu_rq(cpu); | 450 | struct rq *rq = cpu_rq(cpu); |
451 | #ifdef CONFIG_SMP | 451 | #ifdef CONFIG_SMP |
452 | struct sched_domain *sd; | 452 | struct sched_domain *sd; |
453 | int dcnt = 0; | 453 | int dcnt = 0; |
454 | #endif | 454 | #endif |
455 | 455 | ||
456 | /* runqueue-specific stats */ | 456 | /* runqueue-specific stats */ |
457 | seq_printf(seq, | 457 | seq_printf(seq, |
458 | "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", | 458 | "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", |
459 | cpu, rq->yld_both_empty, | 459 | cpu, rq->yld_both_empty, |
460 | rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, | 460 | rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, |
461 | rq->sched_switch, rq->sched_cnt, rq->sched_goidle, | 461 | rq->sched_switch, rq->sched_cnt, rq->sched_goidle, |
462 | rq->ttwu_cnt, rq->ttwu_local, | 462 | rq->ttwu_cnt, rq->ttwu_local, |
463 | rq->rq_sched_info.cpu_time, | 463 | rq->rq_sched_info.cpu_time, |
464 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); | 464 | rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); |
465 | 465 | ||
466 | seq_printf(seq, "\n"); | 466 | seq_printf(seq, "\n"); |
467 | 467 | ||
468 | #ifdef CONFIG_SMP | 468 | #ifdef CONFIG_SMP |
469 | /* domain-specific stats */ | 469 | /* domain-specific stats */ |
470 | preempt_disable(); | 470 | preempt_disable(); |
471 | for_each_domain(cpu, sd) { | 471 | for_each_domain(cpu, sd) { |
472 | enum idle_type itype; | 472 | enum idle_type itype; |
473 | char mask_str[NR_CPUS]; | 473 | char mask_str[NR_CPUS]; |
474 | 474 | ||
475 | cpumask_scnprintf(mask_str, NR_CPUS, sd->span); | 475 | cpumask_scnprintf(mask_str, NR_CPUS, sd->span); |
476 | seq_printf(seq, "domain%d %s", dcnt++, mask_str); | 476 | seq_printf(seq, "domain%d %s", dcnt++, mask_str); |
477 | for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; | 477 | for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; |
478 | itype++) { | 478 | itype++) { |
479 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " | 479 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " |
480 | "%lu", | 480 | "%lu", |
481 | sd->lb_cnt[itype], | 481 | sd->lb_cnt[itype], |
482 | sd->lb_balanced[itype], | 482 | sd->lb_balanced[itype], |
483 | sd->lb_failed[itype], | 483 | sd->lb_failed[itype], |
484 | sd->lb_imbalance[itype], | 484 | sd->lb_imbalance[itype], |
485 | sd->lb_gained[itype], | 485 | sd->lb_gained[itype], |
486 | sd->lb_hot_gained[itype], | 486 | sd->lb_hot_gained[itype], |
487 | sd->lb_nobusyq[itype], | 487 | sd->lb_nobusyq[itype], |
488 | sd->lb_nobusyg[itype]); | 488 | sd->lb_nobusyg[itype]); |
489 | } | 489 | } |
490 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" | 490 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" |
491 | " %lu %lu %lu\n", | 491 | " %lu %lu %lu\n", |
492 | sd->alb_cnt, sd->alb_failed, sd->alb_pushed, | 492 | sd->alb_cnt, sd->alb_failed, sd->alb_pushed, |
493 | sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, | 493 | sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, |
494 | sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, | 494 | sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, |
495 | sd->ttwu_wake_remote, sd->ttwu_move_affine, | 495 | sd->ttwu_wake_remote, sd->ttwu_move_affine, |
496 | sd->ttwu_move_balance); | 496 | sd->ttwu_move_balance); |
497 | } | 497 | } |
498 | preempt_enable(); | 498 | preempt_enable(); |
499 | #endif | 499 | #endif |
500 | } | 500 | } |
501 | return 0; | 501 | return 0; |
502 | } | 502 | } |
503 | 503 | ||
504 | static int schedstat_open(struct inode *inode, struct file *file) | 504 | static int schedstat_open(struct inode *inode, struct file *file) |
505 | { | 505 | { |
506 | unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); | 506 | unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); |
507 | char *buf = kmalloc(size, GFP_KERNEL); | 507 | char *buf = kmalloc(size, GFP_KERNEL); |
508 | struct seq_file *m; | 508 | struct seq_file *m; |
509 | int res; | 509 | int res; |
510 | 510 | ||
511 | if (!buf) | 511 | if (!buf) |
512 | return -ENOMEM; | 512 | return -ENOMEM; |
513 | res = single_open(file, show_schedstat, NULL); | 513 | res = single_open(file, show_schedstat, NULL); |
514 | if (!res) { | 514 | if (!res) { |
515 | m = file->private_data; | 515 | m = file->private_data; |
516 | m->buf = buf; | 516 | m->buf = buf; |
517 | m->size = size; | 517 | m->size = size; |
518 | } else | 518 | } else |
519 | kfree(buf); | 519 | kfree(buf); |
520 | return res; | 520 | return res; |
521 | } | 521 | } |
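The buffer sizing above is a simple heuristic: one page plus one extra page per 32 online CPUs. Assuming 4 KB pages, a 4-CPU box allocates PAGE_SIZE * (1 + 4/32) == 4 KB, while a 64-CPU box allocates PAGE_SIZE * (1 + 64/32) == 12 KB.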
522 | 522 | ||
523 | const struct file_operations proc_schedstat_operations = { | 523 | const struct file_operations proc_schedstat_operations = { |
524 | .open = schedstat_open, | 524 | .open = schedstat_open, |
525 | .read = seq_read, | 525 | .read = seq_read, |
526 | .llseek = seq_lseek, | 526 | .llseek = seq_lseek, |
527 | .release = single_release, | 527 | .release = single_release, |
528 | }; | 528 | }; |
529 | 529 | ||
530 | /* | 530 | /* |
531 | * Expects runqueue lock to be held for atomicity of update | 531 | * Expects runqueue lock to be held for atomicity of update |
532 | */ | 532 | */ |
533 | static inline void | 533 | static inline void |
534 | rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) | 534 | rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) |
535 | { | 535 | { |
536 | if (rq) { | 536 | if (rq) { |
537 | rq->rq_sched_info.run_delay += delta_jiffies; | 537 | rq->rq_sched_info.run_delay += delta_jiffies; |
538 | rq->rq_sched_info.pcnt++; | 538 | rq->rq_sched_info.pcnt++; |
539 | } | 539 | } |
540 | } | 540 | } |
541 | 541 | ||
542 | /* | 542 | /* |
543 | * Expects runqueue lock to be held for atomicity of update | 543 | * Expects runqueue lock to be held for atomicity of update |
544 | */ | 544 | */ |
545 | static inline void | 545 | static inline void |
546 | rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) | 546 | rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) |
547 | { | 547 | { |
548 | if (rq) | 548 | if (rq) |
549 | rq->rq_sched_info.cpu_time += delta_jiffies; | 549 | rq->rq_sched_info.cpu_time += delta_jiffies; |
550 | } | 550 | } |
551 | # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) | 551 | # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) |
552 | # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) | 552 | # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) |
553 | #else /* !CONFIG_SCHEDSTATS */ | 553 | #else /* !CONFIG_SCHEDSTATS */ |
554 | static inline void | 554 | static inline void |
555 | rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) | 555 | rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) |
556 | {} | 556 | {} |
557 | static inline void | 557 | static inline void |
558 | rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) | 558 | rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) |
559 | {} | 559 | {} |
560 | # define schedstat_inc(rq, field) do { } while (0) | 560 | # define schedstat_inc(rq, field) do { } while (0) |
561 | # define schedstat_add(rq, field, amt) do { } while (0) | 561 | # define schedstat_add(rq, field, amt) do { } while (0) |
562 | #endif | 562 | #endif |
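These wrappers let the rest of the file record statistics unconditionally; without CONFIG_SCHEDSTATS they compile away to nothing. Typical call sites elsewhere in this file look roughly like the following (illustrative, field names taken from show_schedstat() above):

    schedstat_inc(rq, yld_cnt);                        /* one more sys_sched_yield() on this rq */
    schedstat_add(sd, lb_imbalance[idle], imbalance);  /* accumulate a load-balance delta */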
563 | 563 | ||
564 | /* | 564 | /* |
565 | * this_rq_lock - lock this runqueue and disable interrupts. | 565 | * this_rq_lock - lock this runqueue and disable interrupts. |
566 | */ | 566 | */ |
567 | static inline struct rq *this_rq_lock(void) | 567 | static inline struct rq *this_rq_lock(void) |
568 | __acquires(rq->lock) | 568 | __acquires(rq->lock) |
569 | { | 569 | { |
570 | struct rq *rq; | 570 | struct rq *rq; |
571 | 571 | ||
572 | local_irq_disable(); | 572 | local_irq_disable(); |
573 | rq = this_rq(); | 573 | rq = this_rq(); |
574 | spin_lock(&rq->lock); | 574 | spin_lock(&rq->lock); |
575 | 575 | ||
576 | return rq; | 576 | return rq; |
577 | } | 577 | } |
578 | 578 | ||
579 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 579 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
580 | /* | 580 | /* |
581 | * Called when a process is dequeued from the active array and given | 581 | * Called when a process is dequeued from the active array and given |
582 | * the cpu. We should note that with the exception of interactive | 582 | * the cpu. We should note that with the exception of interactive |
583 | * tasks, the expired queue will become the active queue after the active | 583 | * tasks, the expired queue will become the active queue after the active |
584 | * queue is empty, without explicitly dequeuing and requeuing tasks in the | 584 | * queue is empty, without explicitly dequeuing and requeuing tasks in the |
585 | * expired queue. (Interactive tasks may be requeued directly to the | 585 | * expired queue. (Interactive tasks may be requeued directly to the |
586 | * active queue, thus delaying tasks in the expired queue from running; | 586 | * active queue, thus delaying tasks in the expired queue from running; |
587 | * see scheduler_tick()). | 587 | * see scheduler_tick()). |
588 | * | 588 | * |
589 | * This function is only called from sched_info_arrive(), rather than | 589 | * This function is only called from sched_info_arrive(), rather than |
590 | * dequeue_task(). Even though a task may be queued and dequeued multiple | 590 | * dequeue_task(). Even though a task may be queued and dequeued multiple |
591 | * times as it is shuffled about, we're really interested in knowing how | 591 | * times as it is shuffled about, we're really interested in knowing how |
592 | * long it was from the *first* time it was queued to the time that it | 592 | * long it was from the *first* time it was queued to the time that it |
593 | * finally hit a cpu. | 593 | * finally hit a cpu. |
594 | */ | 594 | */ |
595 | static inline void sched_info_dequeued(struct task_struct *t) | 595 | static inline void sched_info_dequeued(struct task_struct *t) |
596 | { | 596 | { |
597 | t->sched_info.last_queued = 0; | 597 | t->sched_info.last_queued = 0; |
598 | } | 598 | } |
599 | 599 | ||
600 | /* | 600 | /* |
601 | * Called when a task finally hits the cpu. We can now calculate how | 601 | * Called when a task finally hits the cpu. We can now calculate how |
602 | * long it was waiting to run. We also note when it began so that we | 602 | * long it was waiting to run. We also note when it began so that we |
603 | * can keep stats on how long its timeslice is. | 603 | * can keep stats on how long its timeslice is. |
604 | */ | 604 | */ |
605 | static void sched_info_arrive(struct task_struct *t) | 605 | static void sched_info_arrive(struct task_struct *t) |
606 | { | 606 | { |
607 | unsigned long now = jiffies, delta_jiffies = 0; | 607 | unsigned long now = jiffies, delta_jiffies = 0; |
608 | 608 | ||
609 | if (t->sched_info.last_queued) | 609 | if (t->sched_info.last_queued) |
610 | delta_jiffies = now - t->sched_info.last_queued; | 610 | delta_jiffies = now - t->sched_info.last_queued; |
611 | sched_info_dequeued(t); | 611 | sched_info_dequeued(t); |
612 | t->sched_info.run_delay += delta_jiffies; | 612 | t->sched_info.run_delay += delta_jiffies; |
613 | t->sched_info.last_arrival = now; | 613 | t->sched_info.last_arrival = now; |
614 | t->sched_info.pcnt++; | 614 | t->sched_info.pcnt++; |
615 | 615 | ||
616 | rq_sched_info_arrive(task_rq(t), delta_jiffies); | 616 | rq_sched_info_arrive(task_rq(t), delta_jiffies); |
617 | } | 617 | } |
618 | 618 | ||
619 | /* | 619 | /* |
620 | * Called when a process is queued into either the active or expired | 620 | * Called when a process is queued into either the active or expired |
621 | * array. The time is noted and later used to determine how long the task | 621 | * array. The time is noted and later used to determine how long the task |
622 | * had to wait to reach the cpu. Since the expired queue will | 622 | * had to wait to reach the cpu. Since the expired queue will |
623 | * become the active queue after active queue is empty, without dequeuing | 623 | * become the active queue after active queue is empty, without dequeuing |
624 | * and requeuing any tasks, we are interested in queuing to either. It | 624 | * and requeuing any tasks, we are interested in queuing to either. It |
625 | * is unusual but not impossible for tasks to be dequeued and immediately | 625 | * is unusual but not impossible for tasks to be dequeued and immediately |
626 | * requeued in the same or another array: this can happen in sched_yield(), | 626 | * requeued in the same or another array: this can happen in sched_yield(), |
627 | * set_user_nice(), and even load_balance() as it moves tasks from runqueue | 627 | * set_user_nice(), and even load_balance() as it moves tasks from runqueue |
628 | * to runqueue. | 628 | * to runqueue. |
629 | * | 629 | * |
630 | * This function is only called from enqueue_task(), but also only updates | 630 | * This function is only called from enqueue_task(), but also only updates |
631 | * the timestamp if it is not already set. It's assumed that | 631 | * the timestamp if it is not already set. It's assumed that |
632 | * sched_info_dequeued() will clear that stamp when appropriate. | 632 | * sched_info_dequeued() will clear that stamp when appropriate. |
633 | */ | 633 | */ |
634 | static inline void sched_info_queued(struct task_struct *t) | 634 | static inline void sched_info_queued(struct task_struct *t) |
635 | { | 635 | { |
636 | if (unlikely(sched_info_on())) | 636 | if (unlikely(sched_info_on())) |
637 | if (!t->sched_info.last_queued) | 637 | if (!t->sched_info.last_queued) |
638 | t->sched_info.last_queued = jiffies; | 638 | t->sched_info.last_queued = jiffies; |
639 | } | 639 | } |
640 | 640 | ||
641 | /* | 641 | /* |
642 | * Called when a process ceases being the active-running process, either | 642 | * Called when a process ceases being the active-running process, either |
643 | * voluntarily or involuntarily. Now we can calculate how long we ran. | 643 | * voluntarily or involuntarily. Now we can calculate how long we ran. |
644 | */ | 644 | */ |
645 | static inline void sched_info_depart(struct task_struct *t) | 645 | static inline void sched_info_depart(struct task_struct *t) |
646 | { | 646 | { |
647 | unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; | 647 | unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; |
648 | 648 | ||
649 | t->sched_info.cpu_time += delta_jiffies; | 649 | t->sched_info.cpu_time += delta_jiffies; |
650 | rq_sched_info_depart(task_rq(t), delta_jiffies); | 650 | rq_sched_info_depart(task_rq(t), delta_jiffies); |
651 | } | 651 | } |
652 | 652 | ||
653 | /* | 653 | /* |
654 | * Called when tasks are switched involuntarily due, typically, to expiring | 654 | * Called when tasks are switched involuntarily due, typically, to expiring |
655 | * their time slice. (This may also be called when switching to or from | 655 | * their time slice. (This may also be called when switching to or from |
656 | * the idle task.) We are only called when prev != next. | 656 | * the idle task.) We are only called when prev != next. |
657 | */ | 657 | */ |
658 | static inline void | 658 | static inline void |
659 | __sched_info_switch(struct task_struct *prev, struct task_struct *next) | 659 | __sched_info_switch(struct task_struct *prev, struct task_struct *next) |
660 | { | 660 | { |
661 | struct rq *rq = task_rq(prev); | 661 | struct rq *rq = task_rq(prev); |
662 | 662 | ||
663 | /* | 663 | /* |
664 | * prev now departs the cpu. It's not interesting to record | 664 | * prev now departs the cpu. It's not interesting to record |
665 | * stats about how efficient we were at scheduling the idle | 665 | * stats about how efficient we were at scheduling the idle |
666 | * process, however. | 666 | * process, however. |
667 | */ | 667 | */ |
668 | if (prev != rq->idle) | 668 | if (prev != rq->idle) |
669 | sched_info_depart(prev); | 669 | sched_info_depart(prev); |
670 | 670 | ||
671 | if (next != rq->idle) | 671 | if (next != rq->idle) |
672 | sched_info_arrive(next); | 672 | sched_info_arrive(next); |
673 | } | 673 | } |
674 | static inline void | 674 | static inline void |
675 | sched_info_switch(struct task_struct *prev, struct task_struct *next) | 675 | sched_info_switch(struct task_struct *prev, struct task_struct *next) |
676 | { | 676 | { |
677 | if (unlikely(sched_info_on())) | 677 | if (unlikely(sched_info_on())) |
678 | __sched_info_switch(prev, next); | 678 | __sched_info_switch(prev, next); |
679 | } | 679 | } |
680 | #else | 680 | #else |
681 | #define sched_info_queued(t) do { } while (0) | 681 | #define sched_info_queued(t) do { } while (0) |
682 | #define sched_info_switch(t, next) do { } while (0) | 682 | #define sched_info_switch(t, next) do { } while (0) |
683 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ | 683 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ |
684 | 684 | ||
685 | /* | 685 | /* |
686 | * Adding/removing a task to/from a priority array: | 686 | * Adding/removing a task to/from a priority array: |
687 | */ | 687 | */ |
688 | static void dequeue_task(struct task_struct *p, struct prio_array *array) | 688 | static void dequeue_task(struct task_struct *p, struct prio_array *array) |
689 | { | 689 | { |
690 | array->nr_active--; | 690 | array->nr_active--; |
691 | list_del(&p->run_list); | 691 | list_del(&p->run_list); |
692 | if (list_empty(array->queue + p->prio)) | 692 | if (list_empty(array->queue + p->prio)) |
693 | __clear_bit(p->prio, array->bitmap); | 693 | __clear_bit(p->prio, array->bitmap); |
694 | } | 694 | } |
695 | 695 | ||
696 | static void enqueue_task(struct task_struct *p, struct prio_array *array) | 696 | static void enqueue_task(struct task_struct *p, struct prio_array *array) |
697 | { | 697 | { |
698 | sched_info_queued(p); | 698 | sched_info_queued(p); |
699 | list_add_tail(&p->run_list, array->queue + p->prio); | 699 | list_add_tail(&p->run_list, array->queue + p->prio); |
700 | __set_bit(p->prio, array->bitmap); | 700 | __set_bit(p->prio, array->bitmap); |
701 | array->nr_active++; | 701 | array->nr_active++; |
702 | p->array = array; | 702 | p->array = array; |
703 | } | 703 | } |
704 | 704 | ||
705 | /* | 705 | /* |
706 | * Put task to the end of the run list without the overhead of dequeue | 706 | * Put task to the end of the run list without the overhead of dequeue |
707 | * followed by enqueue. | 707 | * followed by enqueue. |
708 | */ | 708 | */ |
709 | static void requeue_task(struct task_struct *p, struct prio_array *array) | 709 | static void requeue_task(struct task_struct *p, struct prio_array *array) |
710 | { | 710 | { |
711 | list_move_tail(&p->run_list, array->queue + p->prio); | 711 | list_move_tail(&p->run_list, array->queue + p->prio); |
712 | } | 712 | } |
713 | 713 | ||
714 | static inline void | 714 | static inline void |
715 | enqueue_task_head(struct task_struct *p, struct prio_array *array) | 715 | enqueue_task_head(struct task_struct *p, struct prio_array *array) |
716 | { | 716 | { |
717 | list_add(&p->run_list, array->queue + p->prio); | 717 | list_add(&p->run_list, array->queue + p->prio); |
718 | __set_bit(p->prio, array->bitmap); | 718 | __set_bit(p->prio, array->bitmap); |
719 | array->nr_active++; | 719 | array->nr_active++; |
720 | p->array = array; | 720 | p->array = array; |
721 | } | 721 | } |
722 | 722 | ||
723 | /* | 723 | /* |
724 | * __normal_prio - return the priority that is based on the static | 724 | * __normal_prio - return the priority that is based on the static |
725 | * priority but is modified by bonuses/penalties. | 725 | * priority but is modified by bonuses/penalties. |
726 | * | 726 | * |
727 | * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] | 727 | * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] |
728 | * into the -5 ... 0 ... +5 bonus/penalty range. | 728 | * into the -5 ... 0 ... +5 bonus/penalty range. |
729 | * | 729 | * |
730 | * We use 25% of the full 0...39 priority range so that: | 730 | * We use 25% of the full 0...39 priority range so that: |
731 | * | 731 | * |
732 | * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. | 732 | * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. |
733 | * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. | 733 | * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. |
734 | * | 734 | * |
735 | * Both properties are important to certain workloads. | 735 | * Both properties are important to certain workloads. |
736 | */ | 736 | */ |
737 | 737 | ||
738 | static inline int __normal_prio(struct task_struct *p) | 738 | static inline int __normal_prio(struct task_struct *p) |
739 | { | 739 | { |
740 | int bonus, prio; | 740 | int bonus, prio; |
741 | 741 | ||
742 | bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; | 742 | bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; |
743 | 743 | ||
744 | prio = p->static_prio - bonus; | 744 | prio = p->static_prio - bonus; |
745 | if (prio < MAX_RT_PRIO) | 745 | if (prio < MAX_RT_PRIO) |
746 | prio = MAX_RT_PRIO; | 746 | prio = MAX_RT_PRIO; |
747 | if (prio > MAX_PRIO-1) | 747 | if (prio > MAX_PRIO-1) |
748 | prio = MAX_PRIO-1; | 748 | prio = MAX_PRIO-1; |
749 | return prio; | 749 | return prio; |
750 | } | 750 | } |
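Worked example (nice 0, static_prio 120, HZ == 1000): a task whose sleep_avg has saturated at NS_MAX_SLEEP_AVG gets CURRENT_BONUS == 10, so the bonus is 10 - 5 == +5 and prio == 115; a task with sleep_avg == 0 gets a -5 bonus and prio == 125. The clamps keep the result inside [MAX_RT_PRIO, MAX_PRIO-1] == [100, 139].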
751 | 751 | ||
752 | /* | 752 | /* |
753 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | 753 | * To aid in avoiding the subversion of "niceness" due to uneven distribution |
754 | * of tasks with abnormal "nice" values across CPUs, the contribution that | 754 | * of tasks with abnormal "nice" values across CPUs, the contribution that |
755 | * each task makes to its run queue's load is weighted according to its | 755 | * each task makes to its run queue's load is weighted according to its |
756 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a | 756 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a |
757 | * scaled version of the new time slice allocation that they receive on time | 757 | * scaled version of the new time slice allocation that they receive on time |
758 | * slice expiry etc. | 758 | * slice expiry etc. |
759 | */ | 759 | */ |
760 | 760 | ||
761 | /* | 761 | /* |
762 | * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE | 762 | * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE |
763 | * If static_prio_timeslice() is ever changed to break this assumption then | 763 | * If static_prio_timeslice() is ever changed to break this assumption then |
764 | * this code will need modification | 764 | * this code will need modification |
765 | */ | 765 | */ |
766 | #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE | 766 | #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE |
767 | #define LOAD_WEIGHT(lp) \ | 767 | #define LOAD_WEIGHT(lp) \ |
768 | (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) | 768 | (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) |
769 | #define PRIO_TO_LOAD_WEIGHT(prio) \ | 769 | #define PRIO_TO_LOAD_WEIGHT(prio) \ |
770 | LOAD_WEIGHT(static_prio_timeslice(prio)) | 770 | LOAD_WEIGHT(static_prio_timeslice(prio)) |
771 | #define RTPRIO_TO_LOAD_WEIGHT(rp) \ | 771 | #define RTPRIO_TO_LOAD_WEIGHT(rp) \ |
772 | (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) | 772 | (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) |
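With the HZ == 1000 timeslices computed earlier, TIME_SLICE_NICE_ZERO == 100, so (ignoring integer-division rounding):

    PRIO_TO_LOAD_WEIGHT(120) == 100 * SCHED_LOAD_SCALE / 100 == SCHED_LOAD_SCALE        (nice 0)
    PRIO_TO_LOAD_WEIGHT(100) == 800 * SCHED_LOAD_SCALE / 100 == 8 * SCHED_LOAD_SCALE    (nice -20)
    PRIO_TO_LOAD_WEIGHT(139) ==   5 * SCHED_LOAD_SCALE / 100 ~= SCHED_LOAD_SCALE / 20   (nice +19)

so a nice -20 task contributes roughly 160x the runqueue load of a nice +19 task.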
773 | 773 | ||
774 | static void set_load_weight(struct task_struct *p) | 774 | static void set_load_weight(struct task_struct *p) |
775 | { | 775 | { |
776 | if (has_rt_policy(p)) { | 776 | if (has_rt_policy(p)) { |
777 | #ifdef CONFIG_SMP | 777 | #ifdef CONFIG_SMP |
778 | if (p == task_rq(p)->migration_thread) | 778 | if (p == task_rq(p)->migration_thread) |
779 | /* | 779 | /* |
780 | * The migration thread does the actual balancing. | 780 | * The migration thread does the actual balancing. |
781 | * Giving its load any weight will skew balancing | 781 | * Giving its load any weight will skew balancing |
782 | * adversely. | 782 | * adversely. |
783 | */ | 783 | */ |
784 | p->load_weight = 0; | 784 | p->load_weight = 0; |
785 | else | 785 | else |
786 | #endif | 786 | #endif |
787 | p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); | 787 | p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); |
788 | } else | 788 | } else |
789 | p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); | 789 | p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); |
790 | } | 790 | } |
791 | 791 | ||
792 | static inline void | 792 | static inline void |
793 | inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) | 793 | inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) |
794 | { | 794 | { |
795 | rq->raw_weighted_load += p->load_weight; | 795 | rq->raw_weighted_load += p->load_weight; |
796 | } | 796 | } |
797 | 797 | ||
798 | static inline void | 798 | static inline void |
799 | dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) | 799 | dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) |
800 | { | 800 | { |
801 | rq->raw_weighted_load -= p->load_weight; | 801 | rq->raw_weighted_load -= p->load_weight; |
802 | } | 802 | } |
803 | 803 | ||
804 | static inline void inc_nr_running(struct task_struct *p, struct rq *rq) | 804 | static inline void inc_nr_running(struct task_struct *p, struct rq *rq) |
805 | { | 805 | { |
806 | rq->nr_running++; | 806 | rq->nr_running++; |
807 | inc_raw_weighted_load(rq, p); | 807 | inc_raw_weighted_load(rq, p); |
808 | } | 808 | } |
809 | 809 | ||
810 | static inline void dec_nr_running(struct task_struct *p, struct rq *rq) | 810 | static inline void dec_nr_running(struct task_struct *p, struct rq *rq) |
811 | { | 811 | { |
812 | rq->nr_running--; | 812 | rq->nr_running--; |
813 | dec_raw_weighted_load(rq, p); | 813 | dec_raw_weighted_load(rq, p); |
814 | } | 814 | } |
815 | 815 | ||
816 | /* | 816 | /* |
817 | * Calculate the expected normal priority: i.e. priority | 817 | * Calculate the expected normal priority: i.e. priority |
818 | * without taking RT-inheritance into account. Might be | 818 | * without taking RT-inheritance into account. Might be |
819 | * boosted by interactivity modifiers. Changes upon fork, | 819 | * boosted by interactivity modifiers. Changes upon fork, |
820 | * setprio syscalls, and whenever the interactivity | 820 | * setprio syscalls, and whenever the interactivity |
821 | * estimator recalculates. | 821 | * estimator recalculates. |
822 | */ | 822 | */ |
823 | static inline int normal_prio(struct task_struct *p) | 823 | static inline int normal_prio(struct task_struct *p) |
824 | { | 824 | { |
825 | int prio; | 825 | int prio; |
826 | 826 | ||
827 | if (has_rt_policy(p)) | 827 | if (has_rt_policy(p)) |
828 | prio = MAX_RT_PRIO-1 - p->rt_priority; | 828 | prio = MAX_RT_PRIO-1 - p->rt_priority; |
829 | else | 829 | else |
830 | prio = __normal_prio(p); | 830 | prio = __normal_prio(p); |
831 | return prio; | 831 | return prio; |
832 | } | 832 | } |
833 | 833 | ||
834 | /* | 834 | /* |
835 | * Calculate the current priority, i.e. the priority | 835 | * Calculate the current priority, i.e. the priority |
836 | * taken into account by the scheduler. This value might | 836 | * taken into account by the scheduler. This value might |
837 | * be boosted by RT tasks, or might be boosted by | 837 | * be boosted by RT tasks, or might be boosted by |
838 | * interactivity modifiers. Will be RT if the task got | 838 | * interactivity modifiers. Will be RT if the task got |
839 | * RT-boosted. If not then it returns p->normal_prio. | 839 | * RT-boosted. If not then it returns p->normal_prio. |
840 | */ | 840 | */ |
841 | static int effective_prio(struct task_struct *p) | 841 | static int effective_prio(struct task_struct *p) |
842 | { | 842 | { |
843 | p->normal_prio = normal_prio(p); | 843 | p->normal_prio = normal_prio(p); |
844 | /* | 844 | /* |
845 | * If we are RT tasks or we were boosted to RT priority, | 845 | * If we are RT tasks or we were boosted to RT priority, |
846 | * keep the priority unchanged. Otherwise, update priority | 846 | * keep the priority unchanged. Otherwise, update priority |
847 | * to the normal priority: | 847 | * to the normal priority: |
848 | */ | 848 | */ |
849 | if (!rt_prio(p->prio)) | 849 | if (!rt_prio(p->prio)) |
850 | return p->normal_prio; | 850 | return p->normal_prio; |
851 | return p->prio; | 851 | return p->prio; |
852 | } | 852 | } |
853 | 853 | ||
854 | /* | 854 | /* |
855 | * __activate_task - move a task to the runqueue. | 855 | * __activate_task - move a task to the runqueue. |
856 | */ | 856 | */ |
857 | static void __activate_task(struct task_struct *p, struct rq *rq) | 857 | static void __activate_task(struct task_struct *p, struct rq *rq) |
858 | { | 858 | { |
859 | struct prio_array *target = rq->active; | 859 | struct prio_array *target = rq->active; |
860 | 860 | ||
861 | if (batch_task(p)) | 861 | if (batch_task(p)) |
862 | target = rq->expired; | 862 | target = rq->expired; |
863 | enqueue_task(p, target); | 863 | enqueue_task(p, target); |
864 | inc_nr_running(p, rq); | 864 | inc_nr_running(p, rq); |
865 | } | 865 | } |
866 | 866 | ||
867 | /* | 867 | /* |
868 | * __activate_idle_task - move idle task to the _front_ of runqueue. | 868 | * __activate_idle_task - move idle task to the _front_ of runqueue. |
869 | */ | 869 | */ |
870 | static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) | 870 | static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) |
871 | { | 871 | { |
872 | enqueue_task_head(p, rq->active); | 872 | enqueue_task_head(p, rq->active); |
873 | inc_nr_running(p, rq); | 873 | inc_nr_running(p, rq); |
874 | } | 874 | } |
875 | 875 | ||
876 | /* | 876 | /* |
877 | * Recalculate p->normal_prio and p->prio after having slept, | 877 | * Recalculate p->normal_prio and p->prio after having slept, |
878 | * updating the sleep-average too: | 878 | * updating the sleep-average too: |
879 | */ | 879 | */ |
880 | static int recalc_task_prio(struct task_struct *p, unsigned long long now) | 880 | static int recalc_task_prio(struct task_struct *p, unsigned long long now) |
881 | { | 881 | { |
882 | /* Caller must always ensure 'now >= p->timestamp' */ | 882 | /* Caller must always ensure 'now >= p->timestamp' */ |
883 | unsigned long sleep_time = now - p->timestamp; | 883 | unsigned long sleep_time = now - p->timestamp; |
884 | 884 | ||
885 | if (batch_task(p)) | 885 | if (batch_task(p)) |
886 | sleep_time = 0; | 886 | sleep_time = 0; |
887 | 887 | ||
888 | if (likely(sleep_time > 0)) { | 888 | if (likely(sleep_time > 0)) { |
889 | /* | 889 | /* |
890 | * This ceiling is set to the lowest priority that would allow | 890 | * This ceiling is set to the lowest priority that would allow |
891 | * a task to be reinserted into the active array on timeslice | 891 | * a task to be reinserted into the active array on timeslice |
892 | * completion. | 892 | * completion. |
893 | */ | 893 | */ |
894 | unsigned long ceiling = INTERACTIVE_SLEEP(p); | 894 | unsigned long ceiling = INTERACTIVE_SLEEP(p); |
895 | 895 | ||
896 | if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { | 896 | if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { |
897 | /* | 897 | /* |
898 | * Prevents user tasks from achieving best priority | 898 | * Prevents user tasks from achieving best priority |
899 | * with one single large enough sleep. | 899 | * with one single large enough sleep. |
900 | */ | 900 | */ |
901 | p->sleep_avg = ceiling; | 901 | p->sleep_avg = ceiling; |
902 | /* | 902 | /* |
903 | * Using INTERACTIVE_SLEEP() as a ceiling places a | 903 | * Using INTERACTIVE_SLEEP() as a ceiling places a |
904 | * nice(0) task 1ms sleep away from promotion, and | 904 | * nice(0) task 1ms sleep away from promotion, and |
905 | * gives it 700ms to round-robin with no chance of | 905 | * gives it 700ms to round-robin with no chance of |
906 | * being demoted. This is more than generous, so | 906 | * being demoted. This is more than generous, so |
907 | * mark this sleep as non-interactive to prevent the | 907 | * mark this sleep as non-interactive to prevent the |
908 | * on-runqueue bonus logic from intervening should | 908 | * on-runqueue bonus logic from intervening should |
909 | * this task not receive cpu immediately. | 909 | * this task not receive cpu immediately. |
910 | */ | 910 | */ |
911 | p->sleep_type = SLEEP_NONINTERACTIVE; | 911 | p->sleep_type = SLEEP_NONINTERACTIVE; |
912 | } else { | 912 | } else { |
913 | /* | 913 | /* |
914 | * Tasks waking from uninterruptible sleep are | 914 | * Tasks waking from uninterruptible sleep are |
915 | * limited in their sleep_avg rise as they | 915 | * limited in their sleep_avg rise as they |
916 | * are likely to be waiting on I/O | 916 | * are likely to be waiting on I/O |
917 | */ | 917 | */ |
918 | if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { | 918 | if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { |
919 | if (p->sleep_avg >= ceiling) | 919 | if (p->sleep_avg >= ceiling) |
920 | sleep_time = 0; | 920 | sleep_time = 0; |
921 | else if (p->sleep_avg + sleep_time >= | 921 | else if (p->sleep_avg + sleep_time >= |
922 | ceiling) { | 922 | ceiling) { |
923 | p->sleep_avg = ceiling; | 923 | p->sleep_avg = ceiling; |
924 | sleep_time = 0; | 924 | sleep_time = 0; |
925 | } | 925 | } |
926 | } | 926 | } |
927 | 927 | ||
928 | /* | 928 | /* |
929 | * This code gives a bonus to interactive tasks. | 929 | * This code gives a bonus to interactive tasks. |
930 | * | 930 | * |
931 | * The boost works by updating the 'average sleep time' | 931 | * The boost works by updating the 'average sleep time' |
932 | * value here, based on ->timestamp. The more time a | 932 | * value here, based on ->timestamp. The more time a |
933 | * task spends sleeping, the higher the average gets - | 933 | * task spends sleeping, the higher the average gets - |
934 | * and the higher the priority boost gets as well. | 934 | * and the higher the priority boost gets as well. |
935 | */ | 935 | */ |
936 | p->sleep_avg += sleep_time; | 936 | p->sleep_avg += sleep_time; |
937 | 937 | ||
938 | } | 938 | } |
939 | if (p->sleep_avg > NS_MAX_SLEEP_AVG) | 939 | if (p->sleep_avg > NS_MAX_SLEEP_AVG) |
940 | p->sleep_avg = NS_MAX_SLEEP_AVG; | 940 | p->sleep_avg = NS_MAX_SLEEP_AVG; |
941 | } | 941 | } |
942 | 942 | ||
943 | return effective_prio(p); | 943 | return effective_prio(p); |
944 | } | 944 | } |
945 | 945 | ||
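The interactivity bookkeeping in recalc_task_prio() boils down to arithmetic on sleep_avg: one enormous sleep is clamped to a ceiling, ordinary sleeps accumulate, and everything saturates at NS_MAX_SLEEP_AVG. A hedged user-space sketch of just that clamping (the ceiling and cap values below are made up; in the kernel they come from INTERACTIVE_SLEEP(p) and NS_MAX_SLEEP_AVG, and the uninterruptible-sleep special cases are omitted):

#include <stdio.h>

#define NS_MAX_SLEEP_AVG 1000000000ULL   /* assumption: 1 s cap, for illustration */

/* Sketch of the sleep_avg update: one huge sleep is clamped to the ceiling,
 * otherwise sleep time accumulates up to the hard cap. */
static unsigned long long update_sleep_avg(unsigned long long sleep_avg,
                                           unsigned long long sleep_time,
                                           unsigned long long ceiling)
{
        if (sleep_time > ceiling && sleep_avg < ceiling)
                return ceiling;                   /* clamp, no extra bonus */

        sleep_avg += sleep_time;
        if (sleep_avg > NS_MAX_SLEEP_AVG)
                sleep_avg = NS_MAX_SLEEP_AVG;
        return sleep_avg;
}

int main(void)
{
        /* a task that slept "forever" only reaches the ceiling ... */
        printf("%llu\n", update_sleep_avg(0, 5000000000ULL, 700000000ULL));
        /* ... while repeated short sleeps accumulate normally */
        printf("%llu\n", update_sleep_avg(600000000ULL, 50000000ULL, 700000000ULL));
        return 0;
}
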
946 | /* | 946 | /* |
947 | * activate_task - move a task to the runqueue and do priority recalculation | 947 | * activate_task - move a task to the runqueue and do priority recalculation |
948 | * | 948 | * |
949 | * Update all the scheduling statistics stuff. (sleep average | 949 | * Update all the scheduling statistics stuff. (sleep average |
950 | * calculation, priority modifiers, etc.) | 950 | * calculation, priority modifiers, etc.) |
951 | */ | 951 | */ |
952 | static void activate_task(struct task_struct *p, struct rq *rq, int local) | 952 | static void activate_task(struct task_struct *p, struct rq *rq, int local) |
953 | { | 953 | { |
954 | unsigned long long now; | 954 | unsigned long long now; |
955 | 955 | ||
956 | if (rt_task(p)) | 956 | if (rt_task(p)) |
957 | goto out; | 957 | goto out; |
958 | 958 | ||
959 | now = sched_clock(); | 959 | now = sched_clock(); |
960 | #ifdef CONFIG_SMP | 960 | #ifdef CONFIG_SMP |
961 | if (!local) { | 961 | if (!local) { |
962 | /* Compensate for drifting sched_clock */ | 962 | /* Compensate for drifting sched_clock */ |
963 | struct rq *this_rq = this_rq(); | 963 | struct rq *this_rq = this_rq(); |
964 | now = (now - this_rq->most_recent_timestamp) | 964 | now = (now - this_rq->most_recent_timestamp) |
965 | + rq->most_recent_timestamp; | 965 | + rq->most_recent_timestamp; |
966 | } | 966 | } |
967 | #endif | 967 | #endif |
968 | 968 | ||
969 | /* | 969 | /* |
970 | * Sleep time is in units of nanosecs, so shift by 20 to get a | 970 | * Sleep time is in units of nanosecs, so shift by 20 to get a |
971 | * milliseconds-range estimation of the amount of time that the task | 971 | * milliseconds-range estimation of the amount of time that the task |
972 | * spent sleeping: | 972 | * spent sleeping: |
973 | */ | 973 | */ |
974 | if (unlikely(prof_on == SLEEP_PROFILING)) { | 974 | if (unlikely(prof_on == SLEEP_PROFILING)) { |
975 | if (p->state == TASK_UNINTERRUPTIBLE) | 975 | if (p->state == TASK_UNINTERRUPTIBLE) |
976 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), | 976 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), |
977 | (now - p->timestamp) >> 20); | 977 | (now - p->timestamp) >> 20); |
978 | } | 978 | } |
979 | 979 | ||
980 | p->prio = recalc_task_prio(p, now); | 980 | p->prio = recalc_task_prio(p, now); |
981 | 981 | ||
982 | /* | 982 | /* |
983 | * This checks to make sure it's not an uninterruptible task | 983 | * This checks to make sure it's not an uninterruptible task |
984 | * that is now waking up. | 984 | * that is now waking up. |
985 | */ | 985 | */ |
986 | if (p->sleep_type == SLEEP_NORMAL) { | 986 | if (p->sleep_type == SLEEP_NORMAL) { |
987 | /* | 987 | /* |
988 | * Tasks which were woken up by interrupts (i.e. hw events) | 988 | * Tasks which were woken up by interrupts (i.e. hw events) |
989 | * are most likely of interactive nature. So we give them | 989 | * are most likely of interactive nature. So we give them |
990 | * the credit of extending their sleep time to the period | 990 | * the credit of extending their sleep time to the period |
991 | * of time they spend on the runqueue, waiting for execution | 991 | * of time they spend on the runqueue, waiting for execution |
992 | * on a CPU, first time around: | 992 | * on a CPU, first time around: |
993 | */ | 993 | */ |
994 | if (in_interrupt()) | 994 | if (in_interrupt()) |
995 | p->sleep_type = SLEEP_INTERRUPTED; | 995 | p->sleep_type = SLEEP_INTERRUPTED; |
996 | else { | 996 | else { |
997 | /* | 997 | /* |
998 | * Normal first-time wakeups get a credit too for | 998 | * Normal first-time wakeups get a credit too for |
999 | * on-runqueue time, but it will be weighted down: | 999 | * on-runqueue time, but it will be weighted down: |
1000 | */ | 1000 | */ |
1001 | p->sleep_type = SLEEP_INTERACTIVE; | 1001 | p->sleep_type = SLEEP_INTERACTIVE; |
1002 | } | 1002 | } |
1003 | } | 1003 | } |
1004 | p->timestamp = now; | 1004 | p->timestamp = now; |
1005 | out: | 1005 | out: |
1006 | __activate_task(p, rq); | 1006 | __activate_task(p, rq); |
1007 | } | 1007 | } |
1008 | 1008 | ||
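The cross-CPU timestamp compensation in activate_task() is a straight re-basing of one sched_clock() reading onto another runqueue's clock. A standalone sketch of the arithmetic with hypothetical nanosecond values:

#include <stdio.h>

/* Re-express a timestamp taken on this CPU's clock in terms of the remote
 * runqueue's clock, as activate_task() does for !local wakeups:
 * remote_now = (local_now - local_recent) + remote_recent. */
static unsigned long long rebase_clock(unsigned long long local_now,
                                       unsigned long long local_recent,
                                       unsigned long long remote_recent)
{
        return (local_now - local_recent) + remote_recent;
}

int main(void)
{
        /* hypothetical numbers: the two clocks drifted apart by 2 ms */
        unsigned long long local_now     = 10000000;  /* 10 ms on this CPU    */
        unsigned long long local_recent  =  9000000;  /*  9 ms snapshot here  */
        unsigned long long remote_recent = 11000000;  /* 11 ms snapshot there */

        printf("%llu\n", rebase_clock(local_now, local_recent, remote_recent));
        /* prints 12000000: 1 ms of elapsed time added onto the remote base */
        return 0;
}
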
1009 | /* | 1009 | /* |
1010 | * deactivate_task - remove a task from the runqueue. | 1010 | * deactivate_task - remove a task from the runqueue. |
1011 | */ | 1011 | */ |
1012 | static void deactivate_task(struct task_struct *p, struct rq *rq) | 1012 | static void deactivate_task(struct task_struct *p, struct rq *rq) |
1013 | { | 1013 | { |
1014 | dec_nr_running(p, rq); | 1014 | dec_nr_running(p, rq); |
1015 | dequeue_task(p, p->array); | 1015 | dequeue_task(p, p->array); |
1016 | p->array = NULL; | 1016 | p->array = NULL; |
1017 | } | 1017 | } |
1018 | 1018 | ||
1019 | /* | 1019 | /* |
1020 | * resched_task - mark a task 'to be rescheduled now'. | 1020 | * resched_task - mark a task 'to be rescheduled now'. |
1021 | * | 1021 | * |
1022 | * On UP this means the setting of the need_resched flag, on SMP it | 1022 | * On UP this means the setting of the need_resched flag, on SMP it |
1023 | * might also involve a cross-CPU call to trigger the scheduler on | 1023 | * might also involve a cross-CPU call to trigger the scheduler on |
1024 | * the target CPU. | 1024 | * the target CPU. |
1025 | */ | 1025 | */ |
1026 | #ifdef CONFIG_SMP | 1026 | #ifdef CONFIG_SMP |
1027 | 1027 | ||
1028 | #ifndef tsk_is_polling | 1028 | #ifndef tsk_is_polling |
1029 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) | 1029 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) |
1030 | #endif | 1030 | #endif |
1031 | 1031 | ||
1032 | static void resched_task(struct task_struct *p) | 1032 | static void resched_task(struct task_struct *p) |
1033 | { | 1033 | { |
1034 | int cpu; | 1034 | int cpu; |
1035 | 1035 | ||
1036 | assert_spin_locked(&task_rq(p)->lock); | 1036 | assert_spin_locked(&task_rq(p)->lock); |
1037 | 1037 | ||
1038 | if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) | 1038 | if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) |
1039 | return; | 1039 | return; |
1040 | 1040 | ||
1041 | set_tsk_thread_flag(p, TIF_NEED_RESCHED); | 1041 | set_tsk_thread_flag(p, TIF_NEED_RESCHED); |
1042 | 1042 | ||
1043 | cpu = task_cpu(p); | 1043 | cpu = task_cpu(p); |
1044 | if (cpu == smp_processor_id()) | 1044 | if (cpu == smp_processor_id()) |
1045 | return; | 1045 | return; |
1046 | 1046 | ||
1047 | /* NEED_RESCHED must be visible before we test polling */ | 1047 | /* NEED_RESCHED must be visible before we test polling */ |
1048 | smp_mb(); | 1048 | smp_mb(); |
1049 | if (!tsk_is_polling(p)) | 1049 | if (!tsk_is_polling(p)) |
1050 | smp_send_reschedule(cpu); | 1050 | smp_send_reschedule(cpu); |
1051 | } | 1051 | } |
1052 | #else | 1052 | #else |
1053 | static inline void resched_task(struct task_struct *p) | 1053 | static inline void resched_task(struct task_struct *p) |
1054 | { | 1054 | { |
1055 | assert_spin_locked(&task_rq(p)->lock); | 1055 | assert_spin_locked(&task_rq(p)->lock); |
1056 | set_tsk_need_resched(p); | 1056 | set_tsk_need_resched(p); |
1057 | } | 1057 | } |
1058 | #endif | 1058 | #endif |
1059 | 1059 | ||
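The SMP variant of resched_task() relies on an ordering guarantee: the NEED_RESCHED store must be visible before the polling flag is read, or the IPI could be skipped while the target CPU keeps idling. A rough user-space analogue using C11 atomics (the task struct, flags and send_ipi() below are stand-ins, not kernel APIs):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_task {
        atomic_bool need_resched;   /* stands in for TIF_NEED_RESCHED   */
        atomic_bool polling;        /* stands in for TIF_POLLING_NRFLAG */
};

static void send_ipi(int cpu)
{
        printf("IPI to cpu %d\n", cpu);   /* placeholder for smp_send_reschedule() */
}

static void resched_task_demo(struct fake_task *t, int cpu, int this_cpu)
{
        if (atomic_load(&t->need_resched))
                return;                          /* already marked */

        atomic_store(&t->need_resched, true);

        if (cpu == this_cpu)
                return;                          /* local CPU will notice by itself */

        /* The flag store must be visible before we read the polling flag,
         * otherwise we might skip the IPI while the remote CPU keeps idling. */
        atomic_thread_fence(memory_order_seq_cst);
        if (!atomic_load(&t->polling))
                send_ipi(cpu);
}

int main(void)
{
        struct fake_task t;

        atomic_init(&t.need_resched, false);
        atomic_init(&t.polling, false);
        resched_task_demo(&t, 1, 0);   /* remote, not polling: sends the IPI */
        return 0;
}
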
1060 | /** | 1060 | /** |
1061 | * task_curr - is this task currently executing on a CPU? | 1061 | * task_curr - is this task currently executing on a CPU? |
1062 | * @p: the task in question. | 1062 | * @p: the task in question. |
1063 | */ | 1063 | */ |
1064 | inline int task_curr(const struct task_struct *p) | 1064 | inline int task_curr(const struct task_struct *p) |
1065 | { | 1065 | { |
1066 | return cpu_curr(task_cpu(p)) == p; | 1066 | return cpu_curr(task_cpu(p)) == p; |
1067 | } | 1067 | } |
1068 | 1068 | ||
1069 | /* Used instead of source_load when we know the type == 0 */ | 1069 | /* Used instead of source_load when we know the type == 0 */ |
1070 | unsigned long weighted_cpuload(const int cpu) | 1070 | unsigned long weighted_cpuload(const int cpu) |
1071 | { | 1071 | { |
1072 | return cpu_rq(cpu)->raw_weighted_load; | 1072 | return cpu_rq(cpu)->raw_weighted_load; |
1073 | } | 1073 | } |
1074 | 1074 | ||
1075 | #ifdef CONFIG_SMP | 1075 | #ifdef CONFIG_SMP |
1076 | struct migration_req { | 1076 | struct migration_req { |
1077 | struct list_head list; | 1077 | struct list_head list; |
1078 | 1078 | ||
1079 | struct task_struct *task; | 1079 | struct task_struct *task; |
1080 | int dest_cpu; | 1080 | int dest_cpu; |
1081 | 1081 | ||
1082 | struct completion done; | 1082 | struct completion done; |
1083 | }; | 1083 | }; |
1084 | 1084 | ||
1085 | /* | 1085 | /* |
1086 | * The task's runqueue lock must be held. | 1086 | * The task's runqueue lock must be held. |
1087 | * Returns true if you have to wait for migration thread. | 1087 | * Returns true if you have to wait for migration thread. |
1088 | */ | 1088 | */ |
1089 | static int | 1089 | static int |
1090 | migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) | 1090 | migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) |
1091 | { | 1091 | { |
1092 | struct rq *rq = task_rq(p); | 1092 | struct rq *rq = task_rq(p); |
1093 | 1093 | ||
1094 | /* | 1094 | /* |
1095 | * If the task is not on a runqueue (and not running), then | 1095 | * If the task is not on a runqueue (and not running), then |
1096 | * it is sufficient to simply update the task's cpu field. | 1096 | * it is sufficient to simply update the task's cpu field. |
1097 | */ | 1097 | */ |
1098 | if (!p->array && !task_running(rq, p)) { | 1098 | if (!p->array && !task_running(rq, p)) { |
1099 | set_task_cpu(p, dest_cpu); | 1099 | set_task_cpu(p, dest_cpu); |
1100 | return 0; | 1100 | return 0; |
1101 | } | 1101 | } |
1102 | 1102 | ||
1103 | init_completion(&req->done); | 1103 | init_completion(&req->done); |
1104 | req->task = p; | 1104 | req->task = p; |
1105 | req->dest_cpu = dest_cpu; | 1105 | req->dest_cpu = dest_cpu; |
1106 | list_add(&req->list, &rq->migration_queue); | 1106 | list_add(&req->list, &rq->migration_queue); |
1107 | 1107 | ||
1108 | return 1; | 1108 | return 1; |
1109 | } | 1109 | } |
1110 | 1110 | ||
1111 | /* | 1111 | /* |
1112 | * wait_task_inactive - wait for a thread to unschedule. | 1112 | * wait_task_inactive - wait for a thread to unschedule. |
1113 | * | 1113 | * |
1114 | * The caller must ensure that the task *will* unschedule sometime soon, | 1114 | * The caller must ensure that the task *will* unschedule sometime soon, |
1115 | * else this function might spin for a *long* time. This function can't | 1115 | * else this function might spin for a *long* time. This function can't |
1116 | * be called with interrupts off, or it may introduce deadlock with | 1116 | * be called with interrupts off, or it may introduce deadlock with |
1117 | * smp_call_function() if an IPI is sent by the same process we are | 1117 | * smp_call_function() if an IPI is sent by the same process we are |
1118 | * waiting to become inactive. | 1118 | * waiting to become inactive. |
1119 | */ | 1119 | */ |
1120 | void wait_task_inactive(struct task_struct *p) | 1120 | void wait_task_inactive(struct task_struct *p) |
1121 | { | 1121 | { |
1122 | unsigned long flags; | 1122 | unsigned long flags; |
1123 | struct rq *rq; | 1123 | struct rq *rq; |
1124 | int preempted; | 1124 | int preempted; |
1125 | 1125 | ||
1126 | repeat: | 1126 | repeat: |
1127 | rq = task_rq_lock(p, &flags); | 1127 | rq = task_rq_lock(p, &flags); |
1128 | /* Must be off runqueue entirely, not preempted. */ | 1128 | /* Must be off runqueue entirely, not preempted. */ |
1129 | if (unlikely(p->array || task_running(rq, p))) { | 1129 | if (unlikely(p->array || task_running(rq, p))) { |
1130 | /* If it's preempted, we yield. It could be a while. */ | 1130 | /* If it's preempted, we yield. It could be a while. */ |
1131 | preempted = !task_running(rq, p); | 1131 | preempted = !task_running(rq, p); |
1132 | task_rq_unlock(rq, &flags); | 1132 | task_rq_unlock(rq, &flags); |
1133 | cpu_relax(); | 1133 | cpu_relax(); |
1134 | if (preempted) | 1134 | if (preempted) |
1135 | yield(); | 1135 | yield(); |
1136 | goto repeat; | 1136 | goto repeat; |
1137 | } | 1137 | } |
1138 | task_rq_unlock(rq, &flags); | 1138 | task_rq_unlock(rq, &flags); |
1139 | } | 1139 | } |
1140 | 1140 | ||
1141 | /*** | 1141 | /*** |
1142 | * kick_process - kick a running thread to enter/exit the kernel | 1142 | * kick_process - kick a running thread to enter/exit the kernel |
1143 | * @p: the to-be-kicked thread | 1143 | * @p: the to-be-kicked thread |
1144 | * | 1144 | * |
1145 | * Cause a process which is running on another CPU to enter | 1145 | * Cause a process which is running on another CPU to enter |
1146 | * kernel-mode, without any delay. (to get signals handled.) | 1146 | * kernel-mode, without any delay. (to get signals handled.) |
1147 | * | 1147 | * |
1148 | * NOTE: this function doesn't have to take the runqueue lock, | 1148 | * NOTE: this function doesn't have to take the runqueue lock, |
1149 | * because all it wants to ensure is that the remote task enters | 1149 | * because all it wants to ensure is that the remote task enters |
1150 | * the kernel. If the IPI races and the task has been migrated | 1150 | * the kernel. If the IPI races and the task has been migrated |
1151 | * to another CPU then no harm is done and the purpose has been | 1151 | * to another CPU then no harm is done and the purpose has been |
1152 | * achieved as well. | 1152 | * achieved as well. |
1153 | */ | 1153 | */ |
1154 | void kick_process(struct task_struct *p) | 1154 | void kick_process(struct task_struct *p) |
1155 | { | 1155 | { |
1156 | int cpu; | 1156 | int cpu; |
1157 | 1157 | ||
1158 | preempt_disable(); | 1158 | preempt_disable(); |
1159 | cpu = task_cpu(p); | 1159 | cpu = task_cpu(p); |
1160 | if ((cpu != smp_processor_id()) && task_curr(p)) | 1160 | if ((cpu != smp_processor_id()) && task_curr(p)) |
1161 | smp_send_reschedule(cpu); | 1161 | smp_send_reschedule(cpu); |
1162 | preempt_enable(); | 1162 | preempt_enable(); |
1163 | } | 1163 | } |
1164 | 1164 | ||
1165 | /* | 1165 | /* |
1166 | * Return a low guess at the load of a migration-source cpu weighted | 1166 | * Return a low guess at the load of a migration-source cpu weighted |
1167 | * according to the scheduling class and "nice" value. | 1167 | * according to the scheduling class and "nice" value. |
1168 | * | 1168 | * |
1169 | * We want to under-estimate the load of migration sources, to | 1169 | * We want to under-estimate the load of migration sources, to |
1170 | * balance conservatively. | 1170 | * balance conservatively. |
1171 | */ | 1171 | */ |
1172 | static inline unsigned long source_load(int cpu, int type) | 1172 | static inline unsigned long source_load(int cpu, int type) |
1173 | { | 1173 | { |
1174 | struct rq *rq = cpu_rq(cpu); | 1174 | struct rq *rq = cpu_rq(cpu); |
1175 | 1175 | ||
1176 | if (type == 0) | 1176 | if (type == 0) |
1177 | return rq->raw_weighted_load; | 1177 | return rq->raw_weighted_load; |
1178 | 1178 | ||
1179 | return min(rq->cpu_load[type-1], rq->raw_weighted_load); | 1179 | return min(rq->cpu_load[type-1], rq->raw_weighted_load); |
1180 | } | 1180 | } |
1181 | 1181 | ||
1182 | /* | 1182 | /* |
1183 | * Return a high guess at the load of a migration-target cpu weighted | 1183 | * Return a high guess at the load of a migration-target cpu weighted |
1184 | * according to the scheduling class and "nice" value. | 1184 | * according to the scheduling class and "nice" value. |
1185 | */ | 1185 | */ |
1186 | static inline unsigned long target_load(int cpu, int type) | 1186 | static inline unsigned long target_load(int cpu, int type) |
1187 | { | 1187 | { |
1188 | struct rq *rq = cpu_rq(cpu); | 1188 | struct rq *rq = cpu_rq(cpu); |
1189 | 1189 | ||
1190 | if (type == 0) | 1190 | if (type == 0) |
1191 | return rq->raw_weighted_load; | 1191 | return rq->raw_weighted_load; |
1192 | 1192 | ||
1193 | return max(rq->cpu_load[type-1], rq->raw_weighted_load); | 1193 | return max(rq->cpu_load[type-1], rq->raw_weighted_load); |
1194 | } | 1194 | } |
1195 | 1195 | ||
1196 | /* | 1196 | /* |
1197 | * Return the average load per task on the cpu's run queue | 1197 | * Return the average load per task on the cpu's run queue |
1198 | */ | 1198 | */ |
1199 | static inline unsigned long cpu_avg_load_per_task(int cpu) | 1199 | static inline unsigned long cpu_avg_load_per_task(int cpu) |
1200 | { | 1200 | { |
1201 | struct rq *rq = cpu_rq(cpu); | 1201 | struct rq *rq = cpu_rq(cpu); |
1202 | unsigned long n = rq->nr_running; | 1202 | unsigned long n = rq->nr_running; |
1203 | 1203 | ||
1204 | return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; | 1204 | return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; |
1205 | } | 1205 | } |
1206 | 1206 | ||
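The three load helpers above differ only in how they combine the decayed history with the instantaneous weighted load: min() for a migration source, max() for a migration target, and a plain division for the per-task average. A tiny sketch with assumed numbers:

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
        /* hypothetical per-cpu figures */
        unsigned long cpu_load_hist = 2048;   /* rq->cpu_load[type-1]      */
        unsigned long raw_weighted  = 3072;   /* rq->raw_weighted_load now */

        /* migration source: guess low, so we do not pull from a cpu that
         * only looks busy momentarily */
        printf("source_load = %lu\n", MIN(cpu_load_hist, raw_weighted)); /* 2048 */

        /* migration target: guess high, so we do not dump work onto a cpu
         * that only looks idle momentarily */
        printf("target_load = %lu\n", MAX(cpu_load_hist, raw_weighted)); /* 3072 */
        return 0;
}
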
1207 | /* | 1207 | /* |
1208 | * find_idlest_group finds and returns the least busy CPU group within the | 1208 | * find_idlest_group finds and returns the least busy CPU group within the |
1209 | * domain. | 1209 | * domain. |
1210 | */ | 1210 | */ |
1211 | static struct sched_group * | 1211 | static struct sched_group * |
1212 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | 1212 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) |
1213 | { | 1213 | { |
1214 | struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; | 1214 | struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; |
1215 | unsigned long min_load = ULONG_MAX, this_load = 0; | 1215 | unsigned long min_load = ULONG_MAX, this_load = 0; |
1216 | int load_idx = sd->forkexec_idx; | 1216 | int load_idx = sd->forkexec_idx; |
1217 | int imbalance = 100 + (sd->imbalance_pct-100)/2; | 1217 | int imbalance = 100 + (sd->imbalance_pct-100)/2; |
1218 | 1218 | ||
1219 | do { | 1219 | do { |
1220 | unsigned long load, avg_load; | 1220 | unsigned long load, avg_load; |
1221 | int local_group; | 1221 | int local_group; |
1222 | int i; | 1222 | int i; |
1223 | 1223 | ||
1224 | /* Skip over this group if it has no CPUs allowed */ | 1224 | /* Skip over this group if it has no CPUs allowed */ |
1225 | if (!cpus_intersects(group->cpumask, p->cpus_allowed)) | 1225 | if (!cpus_intersects(group->cpumask, p->cpus_allowed)) |
1226 | goto nextgroup; | 1226 | goto nextgroup; |
1227 | 1227 | ||
1228 | local_group = cpu_isset(this_cpu, group->cpumask); | 1228 | local_group = cpu_isset(this_cpu, group->cpumask); |
1229 | 1229 | ||
1230 | /* Tally up the load of all CPUs in the group */ | 1230 | /* Tally up the load of all CPUs in the group */ |
1231 | avg_load = 0; | 1231 | avg_load = 0; |
1232 | 1232 | ||
1233 | for_each_cpu_mask(i, group->cpumask) { | 1233 | for_each_cpu_mask(i, group->cpumask) { |
1234 | /* Bias balancing toward cpus of our domain */ | 1234 | /* Bias balancing toward cpus of our domain */ |
1235 | if (local_group) | 1235 | if (local_group) |
1236 | load = source_load(i, load_idx); | 1236 | load = source_load(i, load_idx); |
1237 | else | 1237 | else |
1238 | load = target_load(i, load_idx); | 1238 | load = target_load(i, load_idx); |
1239 | 1239 | ||
1240 | avg_load += load; | 1240 | avg_load += load; |
1241 | } | 1241 | } |
1242 | 1242 | ||
1243 | /* Adjust by relative CPU power of the group */ | 1243 | /* Adjust by relative CPU power of the group */ |
1244 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; | 1244 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; |
1245 | 1245 | ||
1246 | if (local_group) { | 1246 | if (local_group) { |
1247 | this_load = avg_load; | 1247 | this_load = avg_load; |
1248 | this = group; | 1248 | this = group; |
1249 | } else if (avg_load < min_load) { | 1249 | } else if (avg_load < min_load) { |
1250 | min_load = avg_load; | 1250 | min_load = avg_load; |
1251 | idlest = group; | 1251 | idlest = group; |
1252 | } | 1252 | } |
1253 | nextgroup: | 1253 | nextgroup: |
1254 | group = group->next; | 1254 | group = group->next; |
1255 | } while (group != sd->groups); | 1255 | } while (group != sd->groups); |
1256 | 1256 | ||
1257 | if (!idlest || 100*this_load < imbalance*min_load) | 1257 | if (!idlest || 100*this_load < imbalance*min_load) |
1258 | return NULL; | 1258 | return NULL; |
1259 | return idlest; | 1259 | return idlest; |
1260 | } | 1260 | } |
1261 | 1261 | ||
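The final test in find_idlest_group() deserves a worked example. With a hypothetical imbalance_pct of 125, the derived imbalance is 112, so a remote group is only chosen when it is roughly 12% less loaded than the local one:

#include <stdio.h>

/* Sketch of the final check in find_idlest_group():
 * stay local unless 100*this_load >= imbalance*min_load. */
static int worth_moving(unsigned long this_load, unsigned long min_load,
                        int imbalance_pct)
{
        int imbalance = 100 + (imbalance_pct - 100) / 2;   /* 125 -> 112 */

        return 100 * this_load >= (unsigned long)imbalance * min_load;
}

int main(void)
{
        printf("%d\n", worth_moving(1000, 900, 125));   /* 0: only ~10% lighter  */
        printf("%d\n", worth_moving(1000, 850, 125));   /* 1: ~15% lighter, move */
        return 0;
}
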
1262 | /* | 1262 | /* |
1263 | * find_idlest_cpu - find the idlest cpu among the cpus in group. | 1263 | * find_idlest_cpu - find the idlest cpu among the cpus in group. |
1264 | */ | 1264 | */ |
1265 | static int | 1265 | static int |
1266 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | 1266 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) |
1267 | { | 1267 | { |
1268 | cpumask_t tmp; | 1268 | cpumask_t tmp; |
1269 | unsigned long load, min_load = ULONG_MAX; | 1269 | unsigned long load, min_load = ULONG_MAX; |
1270 | int idlest = -1; | 1270 | int idlest = -1; |
1271 | int i; | 1271 | int i; |
1272 | 1272 | ||
1273 | /* Traverse only the allowed CPUs */ | 1273 | /* Traverse only the allowed CPUs */ |
1274 | cpus_and(tmp, group->cpumask, p->cpus_allowed); | 1274 | cpus_and(tmp, group->cpumask, p->cpus_allowed); |
1275 | 1275 | ||
1276 | for_each_cpu_mask(i, tmp) { | 1276 | for_each_cpu_mask(i, tmp) { |
1277 | load = weighted_cpuload(i); | 1277 | load = weighted_cpuload(i); |
1278 | 1278 | ||
1279 | if (load < min_load || (load == min_load && i == this_cpu)) { | 1279 | if (load < min_load || (load == min_load && i == this_cpu)) { |
1280 | min_load = load; | 1280 | min_load = load; |
1281 | idlest = i; | 1281 | idlest = i; |
1282 | } | 1282 | } |
1283 | } | 1283 | } |
1284 | 1284 | ||
1285 | return idlest; | 1285 | return idlest; |
1286 | } | 1286 | } |
1287 | 1287 | ||
1288 | /* | 1288 | /* |
1289 | * sched_balance_self: balance the current task (running on cpu) in domains | 1289 | * sched_balance_self: balance the current task (running on cpu) in domains |
1290 | * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and | 1290 | * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and |
1291 | * SD_BALANCE_EXEC. | 1291 | * SD_BALANCE_EXEC. |
1292 | * | 1292 | * |
1293 | * Balance, i.e. select the least loaded group. | 1293 | * Balance, i.e. select the least loaded group. |
1294 | * | 1294 | * |
1295 | * Returns the target CPU number, or the same CPU if no balancing is needed. | 1295 | * Returns the target CPU number, or the same CPU if no balancing is needed. |
1296 | * | 1296 | * |
1297 | * preempt must be disabled. | 1297 | * preempt must be disabled. |
1298 | */ | 1298 | */ |
1299 | static int sched_balance_self(int cpu, int flag) | 1299 | static int sched_balance_self(int cpu, int flag) |
1300 | { | 1300 | { |
1301 | struct task_struct *t = current; | 1301 | struct task_struct *t = current; |
1302 | struct sched_domain *tmp, *sd = NULL; | 1302 | struct sched_domain *tmp, *sd = NULL; |
1303 | 1303 | ||
1304 | for_each_domain(cpu, tmp) { | 1304 | for_each_domain(cpu, tmp) { |
1305 | /* | 1305 | /* |
1306 | * If power savings logic is enabled for a domain, stop there. | 1306 | * If power savings logic is enabled for a domain, stop there. |
1307 | */ | 1307 | */ |
1308 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | 1308 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) |
1309 | break; | 1309 | break; |
1310 | if (tmp->flags & flag) | 1310 | if (tmp->flags & flag) |
1311 | sd = tmp; | 1311 | sd = tmp; |
1312 | } | 1312 | } |
1313 | 1313 | ||
1314 | while (sd) { | 1314 | while (sd) { |
1315 | cpumask_t span; | 1315 | cpumask_t span; |
1316 | struct sched_group *group; | 1316 | struct sched_group *group; |
1317 | int new_cpu, weight; | 1317 | int new_cpu, weight; |
1318 | 1318 | ||
1319 | if (!(sd->flags & flag)) { | 1319 | if (!(sd->flags & flag)) { |
1320 | sd = sd->child; | 1320 | sd = sd->child; |
1321 | continue; | 1321 | continue; |
1322 | } | 1322 | } |
1323 | 1323 | ||
1324 | span = sd->span; | 1324 | span = sd->span; |
1325 | group = find_idlest_group(sd, t, cpu); | 1325 | group = find_idlest_group(sd, t, cpu); |
1326 | if (!group) { | 1326 | if (!group) { |
1327 | sd = sd->child; | 1327 | sd = sd->child; |
1328 | continue; | 1328 | continue; |
1329 | } | 1329 | } |
1330 | 1330 | ||
1331 | new_cpu = find_idlest_cpu(group, t, cpu); | 1331 | new_cpu = find_idlest_cpu(group, t, cpu); |
1332 | if (new_cpu == -1 || new_cpu == cpu) { | 1332 | if (new_cpu == -1 || new_cpu == cpu) { |
1333 | /* Now try balancing at a lower domain level of cpu */ | 1333 | /* Now try balancing at a lower domain level of cpu */ |
1334 | sd = sd->child; | 1334 | sd = sd->child; |
1335 | continue; | 1335 | continue; |
1336 | } | 1336 | } |
1337 | 1337 | ||
1338 | /* Now try balancing at a lower domain level of new_cpu */ | 1338 | /* Now try balancing at a lower domain level of new_cpu */ |
1339 | cpu = new_cpu; | 1339 | cpu = new_cpu; |
1340 | sd = NULL; | 1340 | sd = NULL; |
1341 | weight = cpus_weight(span); | 1341 | weight = cpus_weight(span); |
1342 | for_each_domain(cpu, tmp) { | 1342 | for_each_domain(cpu, tmp) { |
1343 | if (weight <= cpus_weight(tmp->span)) | 1343 | if (weight <= cpus_weight(tmp->span)) |
1344 | break; | 1344 | break; |
1345 | if (tmp->flags & flag) | 1345 | if (tmp->flags & flag) |
1346 | sd = tmp; | 1346 | sd = tmp; |
1347 | } | 1347 | } |
1348 | /* while loop will break here if sd == NULL */ | 1348 | /* while loop will break here if sd == NULL */ |
1349 | } | 1349 | } |
1350 | 1350 | ||
1351 | return cpu; | 1351 | return cpu; |
1352 | } | 1352 | } |
1353 | 1353 | ||
1354 | #endif /* CONFIG_SMP */ | 1354 | #endif /* CONFIG_SMP */ |
1355 | 1355 | ||
1356 | /* | 1356 | /* |
1357 | * wake_idle() will wake a task on an idle cpu if task->cpu is | 1357 | * wake_idle() will wake a task on an idle cpu if task->cpu is |
1358 | * not idle and an idle cpu is available. The span of cpus to | 1358 | * not idle and an idle cpu is available. The span of cpus to |
1359 | * search starts with cpus closest then further out as needed, | 1359 | * search starts with cpus closest then further out as needed, |
1360 | * so we always favor a closer, idle cpu. | 1360 | * so we always favor a closer, idle cpu. |
1361 | * | 1361 | * |
1362 | * Returns the CPU we should wake onto. | 1362 | * Returns the CPU we should wake onto. |
1363 | */ | 1363 | */ |
1364 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) | 1364 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) |
1365 | static int wake_idle(int cpu, struct task_struct *p) | 1365 | static int wake_idle(int cpu, struct task_struct *p) |
1366 | { | 1366 | { |
1367 | cpumask_t tmp; | 1367 | cpumask_t tmp; |
1368 | struct sched_domain *sd; | 1368 | struct sched_domain *sd; |
1369 | int i; | 1369 | int i; |
1370 | 1370 | ||
1371 | if (idle_cpu(cpu)) | 1371 | if (idle_cpu(cpu)) |
1372 | return cpu; | 1372 | return cpu; |
1373 | 1373 | ||
1374 | for_each_domain(cpu, sd) { | 1374 | for_each_domain(cpu, sd) { |
1375 | if (sd->flags & SD_WAKE_IDLE) { | 1375 | if (sd->flags & SD_WAKE_IDLE) { |
1376 | cpus_and(tmp, sd->span, p->cpus_allowed); | 1376 | cpus_and(tmp, sd->span, p->cpus_allowed); |
1377 | for_each_cpu_mask(i, tmp) { | 1377 | for_each_cpu_mask(i, tmp) { |
1378 | if (idle_cpu(i)) | 1378 | if (idle_cpu(i)) |
1379 | return i; | 1379 | return i; |
1380 | } | 1380 | } |
1381 | } | 1381 | } |
1382 | else | 1382 | else |
1383 | break; | 1383 | break; |
1384 | } | 1384 | } |
1385 | return cpu; | 1385 | return cpu; |
1386 | } | 1386 | } |
1387 | #else | 1387 | #else |
1388 | static inline int wake_idle(int cpu, struct task_struct *p) | 1388 | static inline int wake_idle(int cpu, struct task_struct *p) |
1389 | { | 1389 | { |
1390 | return cpu; | 1390 | return cpu; |
1391 | } | 1391 | } |
1392 | #endif | 1392 | #endif |
1393 | 1393 | ||
1394 | /*** | 1394 | /*** |
1395 | * try_to_wake_up - wake up a thread | 1395 | * try_to_wake_up - wake up a thread |
1396 | * @p: the to-be-woken-up thread | 1396 | * @p: the to-be-woken-up thread |
1397 | * @state: the mask of task states that can be woken | 1397 | * @state: the mask of task states that can be woken |
1398 | * @sync: do a synchronous wakeup? | 1398 | * @sync: do a synchronous wakeup? |
1399 | * | 1399 | * |
1400 | * Put it on the run-queue if it's not already there. The "current" | 1400 | * Put it on the run-queue if it's not already there. The "current" |
1401 | * thread is always on the run-queue (except when the actual | 1401 | * thread is always on the run-queue (except when the actual |
1402 | * re-schedule is in progress), and as such you're allowed to do | 1402 | * re-schedule is in progress), and as such you're allowed to do |
1403 | * the simpler "current->state = TASK_RUNNING" to mark yourself | 1403 | * the simpler "current->state = TASK_RUNNING" to mark yourself |
1404 | * runnable without the overhead of this. | 1404 | * runnable without the overhead of this. |
1405 | * | 1405 | * |
1406 | * returns failure only if the task is already active. | 1406 | * returns failure only if the task is already active. |
1407 | */ | 1407 | */ |
1408 | static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | 1408 | static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) |
1409 | { | 1409 | { |
1410 | int cpu, this_cpu, success = 0; | 1410 | int cpu, this_cpu, success = 0; |
1411 | unsigned long flags; | 1411 | unsigned long flags; |
1412 | long old_state; | 1412 | long old_state; |
1413 | struct rq *rq; | 1413 | struct rq *rq; |
1414 | #ifdef CONFIG_SMP | 1414 | #ifdef CONFIG_SMP |
1415 | struct sched_domain *sd, *this_sd = NULL; | 1415 | struct sched_domain *sd, *this_sd = NULL; |
1416 | unsigned long load, this_load; | 1416 | unsigned long load, this_load; |
1417 | int new_cpu; | 1417 | int new_cpu; |
1418 | #endif | 1418 | #endif |
1419 | 1419 | ||
1420 | rq = task_rq_lock(p, &flags); | 1420 | rq = task_rq_lock(p, &flags); |
1421 | old_state = p->state; | 1421 | old_state = p->state; |
1422 | if (!(old_state & state)) | 1422 | if (!(old_state & state)) |
1423 | goto out; | 1423 | goto out; |
1424 | 1424 | ||
1425 | if (p->array) | 1425 | if (p->array) |
1426 | goto out_running; | 1426 | goto out_running; |
1427 | 1427 | ||
1428 | cpu = task_cpu(p); | 1428 | cpu = task_cpu(p); |
1429 | this_cpu = smp_processor_id(); | 1429 | this_cpu = smp_processor_id(); |
1430 | 1430 | ||
1431 | #ifdef CONFIG_SMP | 1431 | #ifdef CONFIG_SMP |
1432 | if (unlikely(task_running(rq, p))) | 1432 | if (unlikely(task_running(rq, p))) |
1433 | goto out_activate; | 1433 | goto out_activate; |
1434 | 1434 | ||
1435 | new_cpu = cpu; | 1435 | new_cpu = cpu; |
1436 | 1436 | ||
1437 | schedstat_inc(rq, ttwu_cnt); | 1437 | schedstat_inc(rq, ttwu_cnt); |
1438 | if (cpu == this_cpu) { | 1438 | if (cpu == this_cpu) { |
1439 | schedstat_inc(rq, ttwu_local); | 1439 | schedstat_inc(rq, ttwu_local); |
1440 | goto out_set_cpu; | 1440 | goto out_set_cpu; |
1441 | } | 1441 | } |
1442 | 1442 | ||
1443 | for_each_domain(this_cpu, sd) { | 1443 | for_each_domain(this_cpu, sd) { |
1444 | if (cpu_isset(cpu, sd->span)) { | 1444 | if (cpu_isset(cpu, sd->span)) { |
1445 | schedstat_inc(sd, ttwu_wake_remote); | 1445 | schedstat_inc(sd, ttwu_wake_remote); |
1446 | this_sd = sd; | 1446 | this_sd = sd; |
1447 | break; | 1447 | break; |
1448 | } | 1448 | } |
1449 | } | 1449 | } |
1450 | 1450 | ||
1451 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | 1451 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) |
1452 | goto out_set_cpu; | 1452 | goto out_set_cpu; |
1453 | 1453 | ||
1454 | /* | 1454 | /* |
1455 | * Check for affine wakeup and passive balancing possibilities. | 1455 | * Check for affine wakeup and passive balancing possibilities. |
1456 | */ | 1456 | */ |
1457 | if (this_sd) { | 1457 | if (this_sd) { |
1458 | int idx = this_sd->wake_idx; | 1458 | int idx = this_sd->wake_idx; |
1459 | unsigned int imbalance; | 1459 | unsigned int imbalance; |
1460 | 1460 | ||
1461 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; | 1461 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; |
1462 | 1462 | ||
1463 | load = source_load(cpu, idx); | 1463 | load = source_load(cpu, idx); |
1464 | this_load = target_load(this_cpu, idx); | 1464 | this_load = target_load(this_cpu, idx); |
1465 | 1465 | ||
1466 | new_cpu = this_cpu; /* Wake to this CPU if we can */ | 1466 | new_cpu = this_cpu; /* Wake to this CPU if we can */ |
1467 | 1467 | ||
1468 | if (this_sd->flags & SD_WAKE_AFFINE) { | 1468 | if (this_sd->flags & SD_WAKE_AFFINE) { |
1469 | unsigned long tl = this_load; | 1469 | unsigned long tl = this_load; |
1470 | unsigned long tl_per_task; | 1470 | unsigned long tl_per_task; |
1471 | 1471 | ||
1472 | tl_per_task = cpu_avg_load_per_task(this_cpu); | 1472 | tl_per_task = cpu_avg_load_per_task(this_cpu); |
1473 | 1473 | ||
1474 | /* | 1474 | /* |
1475 | * If sync wakeup then subtract the (maximum possible) | 1475 | * If sync wakeup then subtract the (maximum possible) |
1476 | * effect of the currently running task from the load | 1476 | * effect of the currently running task from the load |
1477 | * of the current CPU: | 1477 | * of the current CPU: |
1478 | */ | 1478 | */ |
1479 | if (sync) | 1479 | if (sync) |
1480 | tl -= current->load_weight; | 1480 | tl -= current->load_weight; |
1481 | 1481 | ||
1482 | if ((tl <= load && | 1482 | if ((tl <= load && |
1483 | tl + target_load(cpu, idx) <= tl_per_task) || | 1483 | tl + target_load(cpu, idx) <= tl_per_task) || |
1484 | 100*(tl + p->load_weight) <= imbalance*load) { | 1484 | 100*(tl + p->load_weight) <= imbalance*load) { |
1485 | /* | 1485 | /* |
1486 | * This domain has SD_WAKE_AFFINE and | 1486 | * This domain has SD_WAKE_AFFINE and |
1487 | * p is cache cold in this domain, and | 1487 | * p is cache cold in this domain, and |
1488 | * there is no bad imbalance. | 1488 | * there is no bad imbalance. |
1489 | */ | 1489 | */ |
1490 | schedstat_inc(this_sd, ttwu_move_affine); | 1490 | schedstat_inc(this_sd, ttwu_move_affine); |
1491 | goto out_set_cpu; | 1491 | goto out_set_cpu; |
1492 | } | 1492 | } |
1493 | } | 1493 | } |
1494 | 1494 | ||
1495 | /* | 1495 | /* |
1496 | * Start passive balancing when half the imbalance_pct | 1496 | * Start passive balancing when half the imbalance_pct |
1497 | * limit is reached. | 1497 | * limit is reached. |
1498 | */ | 1498 | */ |
1499 | if (this_sd->flags & SD_WAKE_BALANCE) { | 1499 | if (this_sd->flags & SD_WAKE_BALANCE) { |
1500 | if (imbalance*this_load <= 100*load) { | 1500 | if (imbalance*this_load <= 100*load) { |
1501 | schedstat_inc(this_sd, ttwu_move_balance); | 1501 | schedstat_inc(this_sd, ttwu_move_balance); |
1502 | goto out_set_cpu; | 1502 | goto out_set_cpu; |
1503 | } | 1503 | } |
1504 | } | 1504 | } |
1505 | } | 1505 | } |
1506 | 1506 | ||
1507 | new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ | 1507 | new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ |
1508 | out_set_cpu: | 1508 | out_set_cpu: |
1509 | new_cpu = wake_idle(new_cpu, p); | 1509 | new_cpu = wake_idle(new_cpu, p); |
1510 | if (new_cpu != cpu) { | 1510 | if (new_cpu != cpu) { |
1511 | set_task_cpu(p, new_cpu); | 1511 | set_task_cpu(p, new_cpu); |
1512 | task_rq_unlock(rq, &flags); | 1512 | task_rq_unlock(rq, &flags); |
1513 | /* might preempt at this point */ | 1513 | /* might preempt at this point */ |
1514 | rq = task_rq_lock(p, &flags); | 1514 | rq = task_rq_lock(p, &flags); |
1515 | old_state = p->state; | 1515 | old_state = p->state; |
1516 | if (!(old_state & state)) | 1516 | if (!(old_state & state)) |
1517 | goto out; | 1517 | goto out; |
1518 | if (p->array) | 1518 | if (p->array) |
1519 | goto out_running; | 1519 | goto out_running; |
1520 | 1520 | ||
1521 | this_cpu = smp_processor_id(); | 1521 | this_cpu = smp_processor_id(); |
1522 | cpu = task_cpu(p); | 1522 | cpu = task_cpu(p); |
1523 | } | 1523 | } |
1524 | 1524 | ||
1525 | out_activate: | 1525 | out_activate: |
1526 | #endif /* CONFIG_SMP */ | 1526 | #endif /* CONFIG_SMP */ |
1527 | if (old_state == TASK_UNINTERRUPTIBLE) { | 1527 | if (old_state == TASK_UNINTERRUPTIBLE) { |
1528 | rq->nr_uninterruptible--; | 1528 | rq->nr_uninterruptible--; |
1529 | /* | 1529 | /* |
1530 | * Tasks on involuntary sleep don't earn | 1530 | * Tasks on involuntary sleep don't earn |
1531 | * sleep_avg beyond just interactive state. | 1531 | * sleep_avg beyond just interactive state. |
1532 | */ | 1532 | */ |
1533 | p->sleep_type = SLEEP_NONINTERACTIVE; | 1533 | p->sleep_type = SLEEP_NONINTERACTIVE; |
1534 | } else | 1534 | } else |
1535 | 1535 | ||
1536 | /* | 1536 | /* |
1537 | * Tasks that have marked their sleep as noninteractive get | 1537 | * Tasks that have marked their sleep as noninteractive get |
1538 | * woken up with their sleep average not weighted in an | 1538 | * woken up with their sleep average not weighted in an |
1539 | * interactive way. | 1539 | * interactive way. |
1540 | */ | 1540 | */ |
1541 | if (old_state & TASK_NONINTERACTIVE) | 1541 | if (old_state & TASK_NONINTERACTIVE) |
1542 | p->sleep_type = SLEEP_NONINTERACTIVE; | 1542 | p->sleep_type = SLEEP_NONINTERACTIVE; |
1543 | 1543 | ||
1544 | 1544 | ||
1545 | activate_task(p, rq, cpu == this_cpu); | 1545 | activate_task(p, rq, cpu == this_cpu); |
1546 | /* | 1546 | /* |
1547 | * Sync wakeups (i.e. those types of wakeups where the waker | 1547 | * Sync wakeups (i.e. those types of wakeups where the waker |
1548 | * has indicated that it will leave the CPU in short order) | 1548 | * has indicated that it will leave the CPU in short order) |
1549 | * don't trigger a preemption, if the woken up task will run on | 1549 | * don't trigger a preemption, if the woken up task will run on |
1550 | * this cpu. (in this case the 'I will reschedule' promise of | 1550 | * this cpu. (in this case the 'I will reschedule' promise of |
1551 | * the waker guarantees that the freshly woken up task is going | 1551 | * the waker guarantees that the freshly woken up task is going |
1552 | * to be considered on this CPU.) | 1552 | * to be considered on this CPU.) |
1553 | */ | 1553 | */ |
1554 | if (!sync || cpu != this_cpu) { | 1554 | if (!sync || cpu != this_cpu) { |
1555 | if (TASK_PREEMPTS_CURR(p, rq)) | 1555 | if (TASK_PREEMPTS_CURR(p, rq)) |
1556 | resched_task(rq->curr); | 1556 | resched_task(rq->curr); |
1557 | } | 1557 | } |
1558 | success = 1; | 1558 | success = 1; |
1559 | 1559 | ||
1560 | out_running: | 1560 | out_running: |
1561 | p->state = TASK_RUNNING; | 1561 | p->state = TASK_RUNNING; |
1562 | out: | 1562 | out: |
1563 | task_rq_unlock(rq, &flags); | 1563 | task_rq_unlock(rq, &flags); |
1564 | 1564 | ||
1565 | return success; | 1565 | return success; |
1566 | } | 1566 | } |
1567 | 1567 | ||
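The SD_WAKE_AFFINE test inside try_to_wake_up() is the densest part of the function, but it is still plain arithmetic. A sketch with made-up loads showing when the wakee is pulled onto the waking CPU (variable names follow the kernel code, the numbers are invented, and the sync-wakeup adjustment is left out):

#include <stdio.h>

/* Sketch of the SD_WAKE_AFFINE condition: pull the task to this_cpu if doing
 * so keeps both CPUs under the average load per task, or if the wakee's CPU
 * is loaded enough to beat the imbalance threshold. */
static int wake_affine(unsigned long tl,            /* target_load(this_cpu)   */
                       unsigned long load,          /* source_load(task's cpu) */
                       unsigned long remote_tl,     /* target_load(task's cpu) */
                       unsigned long tl_per_task,   /* avg load per task here  */
                       unsigned long task_weight,
                       unsigned int imbalance)
{
        if ((tl <= load && tl + remote_tl <= tl_per_task) ||
            100 * (tl + task_weight) <= imbalance * load)
                return 1;
        return 0;
}

int main(void)
{
        /* nearly idle waker, busy wakee cpu: affine wakeup wins */
        printf("%d\n", wake_affine(0, 2048, 2048, 2048, 1024, 112));
        /* both sides loaded: leave the task where it is */
        printf("%d\n", wake_affine(4096, 2048, 2048, 1024, 1024, 112));
        return 0;
}
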
1568 | int fastcall wake_up_process(struct task_struct *p) | 1568 | int fastcall wake_up_process(struct task_struct *p) |
1569 | { | 1569 | { |
1570 | return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | | 1570 | return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | |
1571 | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); | 1571 | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); |
1572 | } | 1572 | } |
1573 | EXPORT_SYMBOL(wake_up_process); | 1573 | EXPORT_SYMBOL(wake_up_process); |
1574 | 1574 | ||
1575 | int fastcall wake_up_state(struct task_struct *p, unsigned int state) | 1575 | int fastcall wake_up_state(struct task_struct *p, unsigned int state) |
1576 | { | 1576 | { |
1577 | return try_to_wake_up(p, state, 0); | 1577 | return try_to_wake_up(p, state, 0); |
1578 | } | 1578 | } |
1579 | 1579 | ||
1580 | static void task_running_tick(struct rq *rq, struct task_struct *p); | 1580 | static void task_running_tick(struct rq *rq, struct task_struct *p); |
1581 | /* | 1581 | /* |
1582 | * Perform scheduler related setup for a newly forked process p. | 1582 | * Perform scheduler related setup for a newly forked process p. |
1583 | * p is forked by current. | 1583 | * p is forked by current. |
1584 | */ | 1584 | */ |
1585 | void fastcall sched_fork(struct task_struct *p, int clone_flags) | 1585 | void fastcall sched_fork(struct task_struct *p, int clone_flags) |
1586 | { | 1586 | { |
1587 | int cpu = get_cpu(); | 1587 | int cpu = get_cpu(); |
1588 | 1588 | ||
1589 | #ifdef CONFIG_SMP | 1589 | #ifdef CONFIG_SMP |
1590 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); | 1590 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); |
1591 | #endif | 1591 | #endif |
1592 | set_task_cpu(p, cpu); | 1592 | set_task_cpu(p, cpu); |
1593 | 1593 | ||
1594 | /* | 1594 | /* |
1595 | * We mark the process as running here, but have not actually | 1595 | * We mark the process as running here, but have not actually |
1596 | * inserted it onto the runqueue yet. This guarantees that | 1596 | * inserted it onto the runqueue yet. This guarantees that |
1597 | * nobody will actually run it, and a signal or other external | 1597 | * nobody will actually run it, and a signal or other external |
1598 | * event cannot wake it up and insert it on the runqueue either. | 1598 | * event cannot wake it up and insert it on the runqueue either. |
1599 | */ | 1599 | */ |
1600 | p->state = TASK_RUNNING; | 1600 | p->state = TASK_RUNNING; |
1601 | 1601 | ||
1602 | /* | 1602 | /* |
1603 | * Make sure we do not leak PI boosting priority to the child: | 1603 | * Make sure we do not leak PI boosting priority to the child: |
1604 | */ | 1604 | */ |
1605 | p->prio = current->normal_prio; | 1605 | p->prio = current->normal_prio; |
1606 | 1606 | ||
1607 | INIT_LIST_HEAD(&p->run_list); | 1607 | INIT_LIST_HEAD(&p->run_list); |
1608 | p->array = NULL; | 1608 | p->array = NULL; |
1609 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 1609 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
1610 | if (unlikely(sched_info_on())) | 1610 | if (unlikely(sched_info_on())) |
1611 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 1611 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
1612 | #endif | 1612 | #endif |
1613 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 1613 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
1614 | p->oncpu = 0; | 1614 | p->oncpu = 0; |
1615 | #endif | 1615 | #endif |
1616 | #ifdef CONFIG_PREEMPT | 1616 | #ifdef CONFIG_PREEMPT |
1617 | /* Want to start with kernel preemption disabled. */ | 1617 | /* Want to start with kernel preemption disabled. */ |
1618 | task_thread_info(p)->preempt_count = 1; | 1618 | task_thread_info(p)->preempt_count = 1; |
1619 | #endif | 1619 | #endif |
1620 | /* | 1620 | /* |
1621 | * Share the timeslice between parent and child, thus the | 1621 | * Share the timeslice between parent and child, thus the |
1622 | * total amount of pending timeslices in the system doesn't change, | 1622 | * total amount of pending timeslices in the system doesn't change, |
1623 | * resulting in more scheduling fairness. | 1623 | * resulting in more scheduling fairness. |
1624 | */ | 1624 | */ |
1625 | local_irq_disable(); | 1625 | local_irq_disable(); |
1626 | p->time_slice = (current->time_slice + 1) >> 1; | 1626 | p->time_slice = (current->time_slice + 1) >> 1; |
1627 | /* | 1627 | /* |
1628 | * The remainder of the first timeslice might be recovered by | 1628 | * The remainder of the first timeslice might be recovered by |
1629 | * the parent if the child exits early enough. | 1629 | * the parent if the child exits early enough. |
1630 | */ | 1630 | */ |
1631 | p->first_time_slice = 1; | 1631 | p->first_time_slice = 1; |
1632 | current->time_slice >>= 1; | 1632 | current->time_slice >>= 1; |
1633 | p->timestamp = sched_clock(); | 1633 | p->timestamp = sched_clock(); |
1634 | if (unlikely(!current->time_slice)) { | 1634 | if (unlikely(!current->time_slice)) { |
1635 | /* | 1635 | /* |
1636 | * This case is rare, it happens when the parent has only | 1636 | * This case is rare, it happens when the parent has only |
1637 | * a single jiffy left from its timeslice. Taking the | 1637 | * a single jiffy left from its timeslice. Taking the |
1638 | * runqueue lock is not a problem. | 1638 | * runqueue lock is not a problem. |
1639 | */ | 1639 | */ |
1640 | current->time_slice = 1; | 1640 | current->time_slice = 1; |
1641 | task_running_tick(cpu_rq(cpu), current); | 1641 | task_running_tick(cpu_rq(cpu), current); |
1642 | } | 1642 | } |
1643 | local_irq_enable(); | 1643 | local_irq_enable(); |
1644 | put_cpu(); | 1644 | put_cpu(); |
1645 | } | 1645 | } |
1646 | 1646 | ||
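The timeslice split in sched_fork() conserves the total: the child takes the rounded-up half and the parent keeps the rounded-down half, so fork() can never manufacture CPU time. A quick check of the arithmetic:

#include <stdio.h>

int main(void)
{
        /* hypothetical remaining timeslice of the parent, in ticks */
        unsigned int parent = 7;

        unsigned int child = (parent + 1) >> 1;   /* child gets 4   */
        parent >>= 1;                             /* parent keeps 3 */

        /* 4 + 3 == 7: no timeslices are created or destroyed by fork() */
        printf("child=%u parent=%u total=%u\n", child, parent, child + parent);
        return 0;
}
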
1647 | /* | 1647 | /* |
1648 | * wake_up_new_task - wake up a newly created task for the first time. | 1648 | * wake_up_new_task - wake up a newly created task for the first time. |
1649 | * | 1649 | * |
1650 | * This function will do some initial scheduler statistics housekeeping | 1650 | * This function will do some initial scheduler statistics housekeeping |
1651 | * that must be done for every newly created context, then puts the task | 1651 | * that must be done for every newly created context, then puts the task |
1652 | * on the runqueue and wakes it. | 1652 | * on the runqueue and wakes it. |
1653 | */ | 1653 | */ |
1654 | void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | 1654 | void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) |
1655 | { | 1655 | { |
1656 | struct rq *rq, *this_rq; | 1656 | struct rq *rq, *this_rq; |
1657 | unsigned long flags; | 1657 | unsigned long flags; |
1658 | int this_cpu, cpu; | 1658 | int this_cpu, cpu; |
1659 | 1659 | ||
1660 | rq = task_rq_lock(p, &flags); | 1660 | rq = task_rq_lock(p, &flags); |
1661 | BUG_ON(p->state != TASK_RUNNING); | 1661 | BUG_ON(p->state != TASK_RUNNING); |
1662 | this_cpu = smp_processor_id(); | 1662 | this_cpu = smp_processor_id(); |
1663 | cpu = task_cpu(p); | 1663 | cpu = task_cpu(p); |
1664 | 1664 | ||
1665 | /* | 1665 | /* |
1666 | * We decrease the sleep average of forking parents | 1666 | * We decrease the sleep average of forking parents |
1667 | * and children as well, to keep max-interactive tasks | 1667 | * and children as well, to keep max-interactive tasks |
1668 | * from forking tasks that are max-interactive. The parent | 1668 | * from forking tasks that are max-interactive. The parent |
1669 | * (current) is done further down, under its lock. | 1669 | * (current) is done further down, under its lock. |
1670 | */ | 1670 | */ |
1671 | p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * | 1671 | p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * |
1672 | CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); | 1672 | CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); |
1673 | 1673 | ||
1674 | p->prio = effective_prio(p); | 1674 | p->prio = effective_prio(p); |
1675 | 1675 | ||
1676 | if (likely(cpu == this_cpu)) { | 1676 | if (likely(cpu == this_cpu)) { |
1677 | if (!(clone_flags & CLONE_VM)) { | 1677 | if (!(clone_flags & CLONE_VM)) { |
1678 | /* | 1678 | /* |
1679 | * The VM isn't cloned, so we're in a good position to | 1679 | * The VM isn't cloned, so we're in a good position to |
1680 | * do child-runs-first in anticipation of an exec. This | 1680 | * do child-runs-first in anticipation of an exec. This |
1681 | * usually avoids a lot of COW overhead. | 1681 | * usually avoids a lot of COW overhead. |
1682 | */ | 1682 | */ |
1683 | if (unlikely(!current->array)) | 1683 | if (unlikely(!current->array)) |
1684 | __activate_task(p, rq); | 1684 | __activate_task(p, rq); |
1685 | else { | 1685 | else { |
1686 | p->prio = current->prio; | 1686 | p->prio = current->prio; |
1687 | p->normal_prio = current->normal_prio; | 1687 | p->normal_prio = current->normal_prio; |
1688 | list_add_tail(&p->run_list, ¤t->run_list); | 1688 | list_add_tail(&p->run_list, ¤t->run_list); |
1689 | p->array = current->array; | 1689 | p->array = current->array; |
1690 | p->array->nr_active++; | 1690 | p->array->nr_active++; |
1691 | inc_nr_running(p, rq); | 1691 | inc_nr_running(p, rq); |
1692 | } | 1692 | } |
1693 | set_need_resched(); | 1693 | set_need_resched(); |
1694 | } else | 1694 | } else |
1695 | /* Run child last */ | 1695 | /* Run child last */ |
1696 | __activate_task(p, rq); | 1696 | __activate_task(p, rq); |
1697 | /* | 1697 | /* |
1698 | * We skip the following code due to cpu == this_cpu | 1698 | * We skip the following code due to cpu == this_cpu |
1699 | * | 1699 | * |
1700 | * task_rq_unlock(rq, &flags); | 1700 | * task_rq_unlock(rq, &flags); |
1701 | * this_rq = task_rq_lock(current, &flags); | 1701 | * this_rq = task_rq_lock(current, &flags); |
1702 | */ | 1702 | */ |
1703 | this_rq = rq; | 1703 | this_rq = rq; |
1704 | } else { | 1704 | } else { |
1705 | this_rq = cpu_rq(this_cpu); | 1705 | this_rq = cpu_rq(this_cpu); |
1706 | 1706 | ||
1707 | /* | 1707 | /* |
1708 | * Not the local CPU - must adjust timestamp. This should | 1708 | * Not the local CPU - must adjust timestamp. This should |
1709 | * get optimised away in the !CONFIG_SMP case. | 1709 | * get optimised away in the !CONFIG_SMP case. |
1710 | */ | 1710 | */ |
1711 | p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) | 1711 | p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) |
1712 | + rq->most_recent_timestamp; | 1712 | + rq->most_recent_timestamp; |
1713 | __activate_task(p, rq); | 1713 | __activate_task(p, rq); |
1714 | if (TASK_PREEMPTS_CURR(p, rq)) | 1714 | if (TASK_PREEMPTS_CURR(p, rq)) |
1715 | resched_task(rq->curr); | 1715 | resched_task(rq->curr); |
1716 | 1716 | ||
1717 | /* | 1717 | /* |
1718 | * Parent and child are on different CPUs, now get the | 1718 | * Parent and child are on different CPUs, now get the |
1719 | * parent runqueue to update the parent's ->sleep_avg: | 1719 | * parent runqueue to update the parent's ->sleep_avg: |
1720 | */ | 1720 | */ |
1721 | task_rq_unlock(rq, &flags); | 1721 | task_rq_unlock(rq, &flags); |
1722 | this_rq = task_rq_lock(current, &flags); | 1722 | this_rq = task_rq_lock(current, &flags); |
1723 | } | 1723 | } |
1724 | current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * | 1724 | current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * |
1725 | PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); | 1725 | PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); |
1726 | task_rq_unlock(this_rq, &flags); | 1726 | task_rq_unlock(this_rq, &flags); |
1727 | } | 1727 | } |
1728 | 1728 | ||
1729 | /* | 1729 | /* |
1730 | * Potentially available exiting-child timeslices are | 1730 | * Potentially available exiting-child timeslices are |
1731 | * retrieved here - this way the parent does not get | 1731 | * retrieved here - this way the parent does not get |
1732 | * penalized for creating too many threads. | 1732 | * penalized for creating too many threads. |
1733 | * | 1733 | * |
1734 | * (this cannot be used to 'generate' timeslices | 1734 | * (this cannot be used to 'generate' timeslices |
1735 | * artificially, because any timeslice recovered here | 1735 | * artificially, because any timeslice recovered here |
1736 | * was given away by the parent in the first place.) | 1736 | * was given away by the parent in the first place.) |
1737 | */ | 1737 | */ |
1738 | void fastcall sched_exit(struct task_struct *p) | 1738 | void fastcall sched_exit(struct task_struct *p) |
1739 | { | 1739 | { |
1740 | unsigned long flags; | 1740 | unsigned long flags; |
1741 | struct rq *rq; | 1741 | struct rq *rq; |
1742 | 1742 | ||
1743 | /* | 1743 | /* |
1744 | * If the child was a (relative-) CPU hog then decrease | 1744 | * If the child was a (relative-) CPU hog then decrease |
1745 | * the sleep_avg of the parent as well. | 1745 | * the sleep_avg of the parent as well. |
1746 | */ | 1746 | */ |
1747 | rq = task_rq_lock(p->parent, &flags); | 1747 | rq = task_rq_lock(p->parent, &flags); |
1748 | if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { | 1748 | if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { |
1749 | p->parent->time_slice += p->time_slice; | 1749 | p->parent->time_slice += p->time_slice; |
1750 | if (unlikely(p->parent->time_slice > task_timeslice(p))) | 1750 | if (unlikely(p->parent->time_slice > task_timeslice(p))) |
1751 | p->parent->time_slice = task_timeslice(p); | 1751 | p->parent->time_slice = task_timeslice(p); |
1752 | } | 1752 | } |
1753 | if (p->sleep_avg < p->parent->sleep_avg) | 1753 | if (p->sleep_avg < p->parent->sleep_avg) |
1754 | p->parent->sleep_avg = p->parent->sleep_avg / | 1754 | p->parent->sleep_avg = p->parent->sleep_avg / |
1755 | (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / | 1755 | (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / |
1756 | (EXIT_WEIGHT + 1); | 1756 | (EXIT_WEIGHT + 1); |
1757 | task_rq_unlock(rq, &flags); | 1757 | task_rq_unlock(rq, &flags); |
1758 | } | 1758 | } |
1759 | 1759 | ||
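The parent sleep_avg adjustment in sched_exit() is a weighted average. Assuming the EXIT_WEIGHT of 3 used by this scheduler generation (worth double-checking against the header), the parent keeps 3/4 of its own average and inherits 1/4 of the CPU-hog child's:

#include <stdio.h>

#define EXIT_WEIGHT 3   /* assumption: value used by this kernel series */

int main(void)
{
        unsigned long parent_avg = 800000;   /* hypothetical ns figures */
        unsigned long child_avg  = 100000;   /* the child was a CPU hog */

        if (child_avg < parent_avg)
                parent_avg = parent_avg / (EXIT_WEIGHT + 1) * EXIT_WEIGHT +
                             child_avg / (EXIT_WEIGHT + 1);

        /* 800000*3/4 + 100000/4 = 600000 + 25000 = 625000 */
        printf("%lu\n", parent_avg);
        return 0;
}
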
1760 | /** | 1760 | /** |
1761 | * prepare_task_switch - prepare to switch tasks | 1761 | * prepare_task_switch - prepare to switch tasks |
1762 | * @rq: the runqueue preparing to switch | 1762 | * @rq: the runqueue preparing to switch |
1763 | * @next: the task we are going to switch to. | 1763 | * @next: the task we are going to switch to. |
1764 | * | 1764 | * |
1765 | * This is called with the rq lock held and interrupts off. It must | 1765 | * This is called with the rq lock held and interrupts off. It must |
1766 | * be paired with a subsequent finish_task_switch after the context | 1766 | * be paired with a subsequent finish_task_switch after the context |
1767 | * switch. | 1767 | * switch. |
1768 | * | 1768 | * |
1769 | * prepare_task_switch sets up locking and calls architecture specific | 1769 | * prepare_task_switch sets up locking and calls architecture specific |
1770 | * hooks. | 1770 | * hooks. |
1771 | */ | 1771 | */ |
1772 | static inline void prepare_task_switch(struct rq *rq, struct task_struct *next) | 1772 | static inline void prepare_task_switch(struct rq *rq, struct task_struct *next) |
1773 | { | 1773 | { |
1774 | prepare_lock_switch(rq, next); | 1774 | prepare_lock_switch(rq, next); |
1775 | prepare_arch_switch(next); | 1775 | prepare_arch_switch(next); |
1776 | } | 1776 | } |
1777 | 1777 | ||
1778 | /** | 1778 | /** |
1779 | * finish_task_switch - clean up after a task-switch | 1779 | * finish_task_switch - clean up after a task-switch |
1780 | * @rq: runqueue associated with task-switch | 1780 | * @rq: runqueue associated with task-switch |
1781 | * @prev: the thread we just switched away from. | 1781 | * @prev: the thread we just switched away from. |
1782 | * | 1782 | * |
1783 | * finish_task_switch must be called after the context switch, paired | 1783 | * finish_task_switch must be called after the context switch, paired |
1784 | * with a prepare_task_switch call before the context switch. | 1784 | * with a prepare_task_switch call before the context switch. |
1785 | * finish_task_switch will reconcile locking set up by prepare_task_switch, | 1785 | * finish_task_switch will reconcile locking set up by prepare_task_switch, |
1786 | * and do any other architecture-specific cleanup actions. | 1786 | * and do any other architecture-specific cleanup actions. |
1787 | * | 1787 | * |
1788 | * Note that we may have delayed dropping an mm in context_switch(). If | 1788 | * Note that we may have delayed dropping an mm in context_switch(). If |
1789 | * so, we finish that here outside of the runqueue lock. (Doing it | 1789 | * so, we finish that here outside of the runqueue lock. (Doing it |
1790 | * with the lock held can cause deadlocks; see schedule() for | 1790 | * with the lock held can cause deadlocks; see schedule() for |
1791 | * details.) | 1791 | * details.) |
1792 | */ | 1792 | */ |
1793 | static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) | 1793 | static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) |
1794 | __releases(rq->lock) | 1794 | __releases(rq->lock) |
1795 | { | 1795 | { |
1796 | struct mm_struct *mm = rq->prev_mm; | 1796 | struct mm_struct *mm = rq->prev_mm; |
1797 | long prev_state; | 1797 | long prev_state; |
1798 | 1798 | ||
1799 | rq->prev_mm = NULL; | 1799 | rq->prev_mm = NULL; |
1800 | 1800 | ||
1801 | /* | 1801 | /* |
1802 | * A task struct has one reference for its use as "current". | 1802 | * A task struct has one reference for its use as "current". |
1803 | * If a task dies, then it sets TASK_DEAD in tsk->state and calls | 1803 | * If a task dies, then it sets TASK_DEAD in tsk->state and calls |
1804 | * schedule one last time. The schedule call will never return, and | 1804 | * schedule one last time. The schedule call will never return, and |
1805 | * the scheduled task must drop that reference. | 1805 | * the scheduled task must drop that reference. |
1806 | * The test for TASK_DEAD must occur while the runqueue locks are | 1806 | * The test for TASK_DEAD must occur while the runqueue locks are |
1807 | * still held, otherwise prev could be scheduled on another cpu, die | 1807 | * still held, otherwise prev could be scheduled on another cpu, die |
1808 | * there before we look at prev->state, and then the reference would | 1808 | * there before we look at prev->state, and then the reference would |
1809 | * be dropped twice. | 1809 | * be dropped twice. |
1810 | * Manfred Spraul <manfred@colorfullife.com> | 1810 | * Manfred Spraul <manfred@colorfullife.com> |
1811 | */ | 1811 | */ |
1812 | prev_state = prev->state; | 1812 | prev_state = prev->state; |
1813 | finish_arch_switch(prev); | 1813 | finish_arch_switch(prev); |
1814 | finish_lock_switch(rq, prev); | 1814 | finish_lock_switch(rq, prev); |
1815 | if (mm) | 1815 | if (mm) |
1816 | mmdrop(mm); | 1816 | mmdrop(mm); |
1817 | if (unlikely(prev_state == TASK_DEAD)) { | 1817 | if (unlikely(prev_state == TASK_DEAD)) { |
1818 | /* | 1818 | /* |
1819 | * Remove function-return probe instances associated with this | 1819 | * Remove function-return probe instances associated with this |
1820 | * task and put them back on the free list. | 1820 | * task and put them back on the free list. |
1821 | */ | 1821 | */ |
1822 | kprobe_flush_task(prev); | 1822 | kprobe_flush_task(prev); |
1823 | put_task_struct(prev); | 1823 | put_task_struct(prev); |
1824 | } | 1824 | } |
1825 | } | 1825 | } |
1826 | 1826 | ||
1827 | /** | 1827 | /** |
1828 | * schedule_tail - first thing a freshly forked thread must call. | 1828 | * schedule_tail - first thing a freshly forked thread must call. |
1829 | * @prev: the thread we just switched away from. | 1829 | * @prev: the thread we just switched away from. |
1830 | */ | 1830 | */ |
1831 | asmlinkage void schedule_tail(struct task_struct *prev) | 1831 | asmlinkage void schedule_tail(struct task_struct *prev) |
1832 | __releases(rq->lock) | 1832 | __releases(rq->lock) |
1833 | { | 1833 | { |
1834 | struct rq *rq = this_rq(); | 1834 | struct rq *rq = this_rq(); |
1835 | 1835 | ||
1836 | finish_task_switch(rq, prev); | 1836 | finish_task_switch(rq, prev); |
1837 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | 1837 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW |
1838 | /* In this case, finish_task_switch does not reenable preemption */ | 1838 | /* In this case, finish_task_switch does not reenable preemption */ |
1839 | preempt_enable(); | 1839 | preempt_enable(); |
1840 | #endif | 1840 | #endif |
1841 | if (current->set_child_tid) | 1841 | if (current->set_child_tid) |
1842 | put_user(current->pid, current->set_child_tid); | 1842 | put_user(current->pid, current->set_child_tid); |
1843 | } | 1843 | } |
1844 | 1844 | ||
1845 | /* | 1845 | /* |
1846 | * context_switch - switch to the new MM and the new | 1846 | * context_switch - switch to the new MM and the new |
1847 | * thread's register state. | 1847 | * thread's register state. |
1848 | */ | 1848 | */ |
1849 | static inline struct task_struct * | 1849 | static inline struct task_struct * |
1850 | context_switch(struct rq *rq, struct task_struct *prev, | 1850 | context_switch(struct rq *rq, struct task_struct *prev, |
1851 | struct task_struct *next) | 1851 | struct task_struct *next) |
1852 | { | 1852 | { |
1853 | struct mm_struct *mm = next->mm; | 1853 | struct mm_struct *mm = next->mm; |
1854 | struct mm_struct *oldmm = prev->active_mm; | 1854 | struct mm_struct *oldmm = prev->active_mm; |
1855 | 1855 | ||
1856 | /* | 1856 | /* |
1857 | * For paravirt, this is coupled with an exit in switch_to to | 1857 | * For paravirt, this is coupled with an exit in switch_to to |
1858 | * combine the page table reload and the switch backend into | 1858 | * combine the page table reload and the switch backend into |
1859 | * one hypercall. | 1859 | * one hypercall. |
1860 | */ | 1860 | */ |
1861 | arch_enter_lazy_cpu_mode(); | 1861 | arch_enter_lazy_cpu_mode(); |
1862 | 1862 | ||
1863 | if (!mm) { | 1863 | if (!mm) { |
1864 | next->active_mm = oldmm; | 1864 | next->active_mm = oldmm; |
1865 | atomic_inc(&oldmm->mm_count); | 1865 | atomic_inc(&oldmm->mm_count); |
1866 | enter_lazy_tlb(oldmm, next); | 1866 | enter_lazy_tlb(oldmm, next); |
1867 | } else | 1867 | } else |
1868 | switch_mm(oldmm, mm, next); | 1868 | switch_mm(oldmm, mm, next); |
1869 | 1869 | ||
1870 | if (!prev->mm) { | 1870 | if (!prev->mm) { |
1871 | prev->active_mm = NULL; | 1871 | prev->active_mm = NULL; |
1872 | WARN_ON(rq->prev_mm); | 1872 | WARN_ON(rq->prev_mm); |
1873 | rq->prev_mm = oldmm; | 1873 | rq->prev_mm = oldmm; |
1874 | } | 1874 | } |
1875 | /* | 1875 | /* |
1876 | * Since the runqueue lock will be released by the next | 1876 | * Since the runqueue lock will be released by the next |
1877 | * task (which is an invalid locking op but in the case | 1877 | * task (which is an invalid locking op but in the case |
1878 | * of the scheduler it's an obvious special-case), we | 1878 | * of the scheduler it's an obvious special-case), we |
1879 | * do an early lockdep release here: | 1879 | * do an early lockdep release here: |
1880 | */ | 1880 | */ |
1881 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | 1881 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW |
1882 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | 1882 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); |
1883 | #endif | 1883 | #endif |
1884 | 1884 | ||
1885 | /* Here we just switch the register state and the stack. */ | 1885 | /* Here we just switch the register state and the stack. */ |
1886 | switch_to(prev, next, prev); | 1886 | switch_to(prev, next, prev); |
1887 | 1887 | ||
1888 | return prev; | 1888 | return prev; |
1889 | } | 1889 | } |
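
One detail worth calling out in context_switch(): a kernel thread has no mm of its own, so it borrows the previous task's address space and bumps mm_count; the matching mmdrop() happens later in finish_task_switch(). A simplified userspace sketch of that reference hand-off (struct and field names here are illustrative, not kernel types):

/* Sketch only: models the "lazy mm" borrow in context_switch(). */
#include <stdio.h>

struct mm_sketch {
	int mm_count;			/* stands in for the real atomic mm_count */
};

struct task_sketch {
	struct mm_sketch *mm;		/* NULL for kernel threads */
	struct mm_sketch *active_mm;	/* address space actually in use */
};

static struct mm_sketch *switch_mm_sketch(struct task_sketch *prev,
					  struct task_sketch *next)
{
	struct mm_sketch *oldmm = prev->active_mm;

	if (!next->mm) {			/* kernel thread: borrow oldmm */
		next->active_mm = oldmm;
		oldmm->mm_count++;		/* atomic_inc(&oldmm->mm_count) */
	} else {
		next->active_mm = next->mm;	/* a real switch_mm() */
	}
	if (!prev->mm)				/* prev was itself a kernel thread */
		prev->active_mm = NULL;		/* drop deferred to finish_task_switch() */
	return oldmm;
}

int main(void)
{
	struct mm_sketch user_mm = { 1 };
	struct task_sketch user = { &user_mm, &user_mm };
	struct task_sketch kthread = { NULL, NULL };

	switch_mm_sketch(&user, &kthread);
	printf("borrowed mm_count: %d\n", user_mm.mm_count);	/* 2 */
	return 0;
}
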
1890 | 1890 | ||
1891 | /* | 1891 | /* |
1892 | * nr_running, nr_uninterruptible and nr_context_switches: | 1892 | * nr_running, nr_uninterruptible and nr_context_switches: |
1893 | * | 1893 | * |
1894 | * externally visible scheduler statistics: current number of runnable | 1894 | * externally visible scheduler statistics: current number of runnable |
1895 | * threads, current number of uninterruptible-sleeping threads, total | 1895 | * threads, current number of uninterruptible-sleeping threads, total |
1896 | * number of context switches performed since bootup. | 1896 | * number of context switches performed since bootup. |
1897 | */ | 1897 | */ |
1898 | unsigned long nr_running(void) | 1898 | unsigned long nr_running(void) |
1899 | { | 1899 | { |
1900 | unsigned long i, sum = 0; | 1900 | unsigned long i, sum = 0; |
1901 | 1901 | ||
1902 | for_each_online_cpu(i) | 1902 | for_each_online_cpu(i) |
1903 | sum += cpu_rq(i)->nr_running; | 1903 | sum += cpu_rq(i)->nr_running; |
1904 | 1904 | ||
1905 | return sum; | 1905 | return sum; |
1906 | } | 1906 | } |
1907 | 1907 | ||
1908 | unsigned long nr_uninterruptible(void) | 1908 | unsigned long nr_uninterruptible(void) |
1909 | { | 1909 | { |
1910 | unsigned long i, sum = 0; | 1910 | unsigned long i, sum = 0; |
1911 | 1911 | ||
1912 | for_each_possible_cpu(i) | 1912 | for_each_possible_cpu(i) |
1913 | sum += cpu_rq(i)->nr_uninterruptible; | 1913 | sum += cpu_rq(i)->nr_uninterruptible; |
1914 | 1914 | ||
1915 | /* | 1915 | /* |
1916 | * Since we read the counters lockless, it might be slightly | 1916 | * Since we read the counters lockless, it might be slightly |
1917 | * inaccurate. Do not allow it to go below zero though: | 1917 | * inaccurate. Do not allow it to go below zero though: |
1918 | */ | 1918 | */ |
1919 | if (unlikely((long)sum < 0)) | 1919 | if (unlikely((long)sum < 0)) |
1920 | sum = 0; | 1920 | sum = 0; |
1921 | 1921 | ||
1922 | return sum; | 1922 | return sum; |
1923 | } | 1923 | } |
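
The clamp above exists because nr_uninterruptible is incremented and decremented on different runqueues without locking, so an individual counter (and a racy sum of them) can be transiently negative. A small sketch of that effect, with invented counter values:

/* Sketch only: why a lockless sum of per-runqueue counters is clamped at zero. */
#include <stdio.h>

int main(void)
{
	long per_rq[4] = { 3, -1, 0, -3 };	/* invented mid-update snapshots */
	unsigned long sum = 0;
	int i;

	for (i = 0; i < 4; i++)
		sum += per_rq[i];	/* wraps to a huge value if the total is negative */

	if ((long)sum < 0)		/* same test as nr_uninterruptible() */
		sum = 0;
	printf("reported uninterruptible tasks: %lu\n", sum);	/* 0, not ~ULONG_MAX */
	return 0;
}
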
1924 | 1924 | ||
1925 | unsigned long long nr_context_switches(void) | 1925 | unsigned long long nr_context_switches(void) |
1926 | { | 1926 | { |
1927 | int i; | 1927 | int i; |
1928 | unsigned long long sum = 0; | 1928 | unsigned long long sum = 0; |
1929 | 1929 | ||
1930 | for_each_possible_cpu(i) | 1930 | for_each_possible_cpu(i) |
1931 | sum += cpu_rq(i)->nr_switches; | 1931 | sum += cpu_rq(i)->nr_switches; |
1932 | 1932 | ||
1933 | return sum; | 1933 | return sum; |
1934 | } | 1934 | } |
1935 | 1935 | ||
1936 | unsigned long nr_iowait(void) | 1936 | unsigned long nr_iowait(void) |
1937 | { | 1937 | { |
1938 | unsigned long i, sum = 0; | 1938 | unsigned long i, sum = 0; |
1939 | 1939 | ||
1940 | for_each_possible_cpu(i) | 1940 | for_each_possible_cpu(i) |
1941 | sum += atomic_read(&cpu_rq(i)->nr_iowait); | 1941 | sum += atomic_read(&cpu_rq(i)->nr_iowait); |
1942 | 1942 | ||
1943 | return sum; | 1943 | return sum; |
1944 | } | 1944 | } |
1945 | 1945 | ||
1946 | unsigned long nr_active(void) | 1946 | unsigned long nr_active(void) |
1947 | { | 1947 | { |
1948 | unsigned long i, running = 0, uninterruptible = 0; | 1948 | unsigned long i, running = 0, uninterruptible = 0; |
1949 | 1949 | ||
1950 | for_each_online_cpu(i) { | 1950 | for_each_online_cpu(i) { |
1951 | running += cpu_rq(i)->nr_running; | 1951 | running += cpu_rq(i)->nr_running; |
1952 | uninterruptible += cpu_rq(i)->nr_uninterruptible; | 1952 | uninterruptible += cpu_rq(i)->nr_uninterruptible; |
1953 | } | 1953 | } |
1954 | 1954 | ||
1955 | if (unlikely((long)uninterruptible < 0)) | 1955 | if (unlikely((long)uninterruptible < 0)) |
1956 | uninterruptible = 0; | 1956 | uninterruptible = 0; |
1957 | 1957 | ||
1958 | return running + uninterruptible; | 1958 | return running + uninterruptible; |
1959 | } | 1959 | } |
1960 | 1960 | ||
1961 | #ifdef CONFIG_SMP | 1961 | #ifdef CONFIG_SMP |
1962 | 1962 | ||
1963 | /* | 1963 | /* |
1964 | * Is this task likely cache-hot: | 1964 | * Is this task likely cache-hot: |
1965 | */ | 1965 | */ |
1966 | static inline int | 1966 | static inline int |
1967 | task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) | 1967 | task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) |
1968 | { | 1968 | { |
1969 | return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; | 1969 | return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; |
1970 | } | 1970 | } |
1971 | 1971 | ||
1972 | /* | 1972 | /* |
1973 | * double_rq_lock - safely lock two runqueues | 1973 | * double_rq_lock - safely lock two runqueues |
1974 | * | 1974 | * |
1975 | * Note this does not disable interrupts like task_rq_lock; | 1975 | * Note this does not disable interrupts like task_rq_lock; |
1976 | * you need to do so manually before calling. | 1976 | * you need to do so manually before calling. |
1977 | */ | 1977 | */ |
1978 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) | 1978 | static void double_rq_lock(struct rq *rq1, struct rq *rq2) |
1979 | __acquires(rq1->lock) | 1979 | __acquires(rq1->lock) |
1980 | __acquires(rq2->lock) | 1980 | __acquires(rq2->lock) |
1981 | { | 1981 | { |
1982 | BUG_ON(!irqs_disabled()); | 1982 | BUG_ON(!irqs_disabled()); |
1983 | if (rq1 == rq2) { | 1983 | if (rq1 == rq2) { |
1984 | spin_lock(&rq1->lock); | 1984 | spin_lock(&rq1->lock); |
1985 | __acquire(rq2->lock); /* Fake it out ;) */ | 1985 | __acquire(rq2->lock); /* Fake it out ;) */ |
1986 | } else { | 1986 | } else { |
1987 | if (rq1 < rq2) { | 1987 | if (rq1 < rq2) { |
1988 | spin_lock(&rq1->lock); | 1988 | spin_lock(&rq1->lock); |
1989 | spin_lock(&rq2->lock); | 1989 | spin_lock(&rq2->lock); |
1990 | } else { | 1990 | } else { |
1991 | spin_lock(&rq2->lock); | 1991 | spin_lock(&rq2->lock); |
1992 | spin_lock(&rq1->lock); | 1992 | spin_lock(&rq1->lock); |
1993 | } | 1993 | } |
1994 | } | 1994 | } |
1995 | } | 1995 | } |
1996 | 1996 | ||
1997 | /* | 1997 | /* |
1998 | * double_rq_unlock - safely unlock two runqueues | 1998 | * double_rq_unlock - safely unlock two runqueues |
1999 | * | 1999 | * |
2000 | * Note this does not restore interrupts like task_rq_unlock; | 2000 | * Note this does not restore interrupts like task_rq_unlock; |
2001 | * you need to do so manually after calling. | 2001 | * you need to do so manually after calling. |
2002 | */ | 2002 | */ |
2003 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | 2003 | static void double_rq_unlock(struct rq *rq1, struct rq *rq2) |
2004 | __releases(rq1->lock) | 2004 | __releases(rq1->lock) |
2005 | __releases(rq2->lock) | 2005 | __releases(rq2->lock) |
2006 | { | 2006 | { |
2007 | spin_unlock(&rq1->lock); | 2007 | spin_unlock(&rq1->lock); |
2008 | if (rq1 != rq2) | 2008 | if (rq1 != rq2) |
2009 | spin_unlock(&rq2->lock); | 2009 | spin_unlock(&rq2->lock); |
2010 | else | 2010 | else |
2011 | __release(rq2->lock); | 2011 | __release(rq2->lock); |
2012 | } | 2012 | } |
2013 | 2013 | ||
2014 | /* | 2014 | /* |
2015 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | 2015 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. |
2016 | */ | 2016 | */ |
2017 | static void double_lock_balance(struct rq *this_rq, struct rq *busiest) | 2017 | static void double_lock_balance(struct rq *this_rq, struct rq *busiest) |
2018 | __releases(this_rq->lock) | 2018 | __releases(this_rq->lock) |
2019 | __acquires(busiest->lock) | 2019 | __acquires(busiest->lock) |
2020 | __acquires(this_rq->lock) | 2020 | __acquires(this_rq->lock) |
2021 | { | 2021 | { |
2022 | if (unlikely(!irqs_disabled())) { | 2022 | if (unlikely(!irqs_disabled())) { |
2023 | /* printk() doesn't work well under rq->lock */ | 2023 | /* printk() doesn't work well under rq->lock */ |
2024 | spin_unlock(&this_rq->lock); | 2024 | spin_unlock(&this_rq->lock); |
2025 | BUG_ON(1); | 2025 | BUG_ON(1); |
2026 | } | 2026 | } |
2027 | if (unlikely(!spin_trylock(&busiest->lock))) { | 2027 | if (unlikely(!spin_trylock(&busiest->lock))) { |
2028 | if (busiest < this_rq) { | 2028 | if (busiest < this_rq) { |
2029 | spin_unlock(&this_rq->lock); | 2029 | spin_unlock(&this_rq->lock); |
2030 | spin_lock(&busiest->lock); | 2030 | spin_lock(&busiest->lock); |
2031 | spin_lock(&this_rq->lock); | 2031 | spin_lock(&this_rq->lock); |
2032 | } else | 2032 | } else |
2033 | spin_lock(&busiest->lock); | 2033 | spin_lock(&busiest->lock); |
2034 | } | 2034 | } |
2035 | } | 2035 | } |
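
Both double_rq_lock() and the trylock fallback in double_lock_balance() rely on the same rule: when two runqueue locks must be held, take the one at the lower address first, so two CPUs locking the same pair can never end up waiting on each other. A userspace sketch of that ordering, with pthread mutexes standing in for the runqueue spinlocks:

/* Sketch only: address-ordered locking of two queues, as in double_rq_lock(). */
#include <pthread.h>
#include <stdio.h>

struct rq_sketch {
	pthread_mutex_t lock;
};

static void double_lock_sketch(struct rq_sketch *a, struct rq_sketch *b)
{
	if (a == b) {
		pthread_mutex_lock(&a->lock);		/* same queue: one lock only */
	} else if (a < b) {				/* lower address first */
		pthread_mutex_lock(&a->lock);
		pthread_mutex_lock(&b->lock);
	} else {
		pthread_mutex_lock(&b->lock);
		pthread_mutex_lock(&a->lock);
	}
}

static void double_unlock_sketch(struct rq_sketch *a, struct rq_sketch *b)
{
	pthread_mutex_unlock(&a->lock);
	if (a != b)
		pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	struct rq_sketch rq0, rq1;

	pthread_mutex_init(&rq0.lock, NULL);
	pthread_mutex_init(&rq1.lock, NULL);

	double_lock_sketch(&rq1, &rq0);	/* argument order does not matter */
	double_unlock_sketch(&rq1, &rq0);
	puts("locked and unlocked both queues without risking deadlock");
	return 0;
}

(Build with -lpthread; the kernel gets the same guarantee from the spin_lock ordering above plus the unlock-and-retake path when this_rq is already held.)
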
2036 | 2036 | ||
2037 | /* | 2037 | /* |
2038 | * If dest_cpu is allowed for this process, migrate the task to it. | 2038 | * If dest_cpu is allowed for this process, migrate the task to it. |
2039 | * This is accomplished by forcing the cpu_allowed mask to only | 2039 | * This is accomplished by forcing the cpu_allowed mask to only |
2040 | * allow dest_cpu, which will force the task onto dest_cpu. Then | 2040 | * allow dest_cpu, which will force the task onto dest_cpu. Then |
2041 | * the cpu_allowed mask is restored. | 2041 | * the cpu_allowed mask is restored. |
2042 | */ | 2042 | */ |
2043 | static void sched_migrate_task(struct task_struct *p, int dest_cpu) | 2043 | static void sched_migrate_task(struct task_struct *p, int dest_cpu) |
2044 | { | 2044 | { |
2045 | struct migration_req req; | 2045 | struct migration_req req; |
2046 | unsigned long flags; | 2046 | unsigned long flags; |
2047 | struct rq *rq; | 2047 | struct rq *rq; |
2048 | 2048 | ||
2049 | rq = task_rq_lock(p, &flags); | 2049 | rq = task_rq_lock(p, &flags); |
2050 | if (!cpu_isset(dest_cpu, p->cpus_allowed) | 2050 | if (!cpu_isset(dest_cpu, p->cpus_allowed) |
2051 | || unlikely(cpu_is_offline(dest_cpu))) | 2051 | || unlikely(cpu_is_offline(dest_cpu))) |
2052 | goto out; | 2052 | goto out; |
2053 | 2053 | ||
2054 | /* force the process onto the specified CPU */ | 2054 | /* force the process onto the specified CPU */ |
2055 | if (migrate_task(p, dest_cpu, &req)) { | 2055 | if (migrate_task(p, dest_cpu, &req)) { |
2056 | /* Need to wait for migration thread (might exit: take ref). */ | 2056 | /* Need to wait for migration thread (might exit: take ref). */ |
2057 | struct task_struct *mt = rq->migration_thread; | 2057 | struct task_struct *mt = rq->migration_thread; |
2058 | 2058 | ||
2059 | get_task_struct(mt); | 2059 | get_task_struct(mt); |
2060 | task_rq_unlock(rq, &flags); | 2060 | task_rq_unlock(rq, &flags); |
2061 | wake_up_process(mt); | 2061 | wake_up_process(mt); |
2062 | put_task_struct(mt); | 2062 | put_task_struct(mt); |
2063 | wait_for_completion(&req.done); | 2063 | wait_for_completion(&req.done); |
2064 | 2064 | ||
2065 | return; | 2065 | return; |
2066 | } | 2066 | } |
2067 | out: | 2067 | out: |
2068 | task_rq_unlock(rq, &flags); | 2068 | task_rq_unlock(rq, &flags); |
2069 | } | 2069 | } |
2070 | 2070 | ||
2071 | /* | 2071 | /* |
2072 | * sched_exec - execve() is a valuable balancing opportunity, because at | 2072 | * sched_exec - execve() is a valuable balancing opportunity, because at |
2073 | * this point the task has the smallest effective memory and cache footprint. | 2073 | * this point the task has the smallest effective memory and cache footprint. |
2074 | */ | 2074 | */ |
2075 | void sched_exec(void) | 2075 | void sched_exec(void) |
2076 | { | 2076 | { |
2077 | int new_cpu, this_cpu = get_cpu(); | 2077 | int new_cpu, this_cpu = get_cpu(); |
2078 | new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); | 2078 | new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); |
2079 | put_cpu(); | 2079 | put_cpu(); |
2080 | if (new_cpu != this_cpu) | 2080 | if (new_cpu != this_cpu) |
2081 | sched_migrate_task(current, new_cpu); | 2081 | sched_migrate_task(current, new_cpu); |
2082 | } | 2082 | } |
2083 | 2083 | ||
2084 | /* | 2084 | /* |
2085 | * pull_task - move a task from a remote runqueue to the local runqueue. | 2085 | * pull_task - move a task from a remote runqueue to the local runqueue. |
2086 | * Both runqueues must be locked. | 2086 | * Both runqueues must be locked. |
2087 | */ | 2087 | */ |
2088 | static void pull_task(struct rq *src_rq, struct prio_array *src_array, | 2088 | static void pull_task(struct rq *src_rq, struct prio_array *src_array, |
2089 | struct task_struct *p, struct rq *this_rq, | 2089 | struct task_struct *p, struct rq *this_rq, |
2090 | struct prio_array *this_array, int this_cpu) | 2090 | struct prio_array *this_array, int this_cpu) |
2091 | { | 2091 | { |
2092 | dequeue_task(p, src_array); | 2092 | dequeue_task(p, src_array); |
2093 | dec_nr_running(p, src_rq); | 2093 | dec_nr_running(p, src_rq); |
2094 | set_task_cpu(p, this_cpu); | 2094 | set_task_cpu(p, this_cpu); |
2095 | inc_nr_running(p, this_rq); | 2095 | inc_nr_running(p, this_rq); |
2096 | enqueue_task(p, this_array); | 2096 | enqueue_task(p, this_array); |
2097 | p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) | 2097 | p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) |
2098 | + this_rq->most_recent_timestamp; | 2098 | + this_rq->most_recent_timestamp; |
2099 | /* | 2099 | /* |
2100 | * Note that idle threads have a prio of MAX_PRIO, so that this test | 2100 | * Note that idle threads have a prio of MAX_PRIO, so that this test |
2101 | * is always true for them. | 2101 | * is always true for them. |
2102 | */ | 2102 | */ |
2103 | if (TASK_PREEMPTS_CURR(p, this_rq)) | 2103 | if (TASK_PREEMPTS_CURR(p, this_rq)) |
2104 | resched_task(this_rq->curr); | 2104 | resched_task(this_rq->curr); |
2105 | } | 2105 | } |
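
The timestamp adjustment in pull_task() deserves a note: the two runqueues' clocks are not directly comparable, so the task's timestamp is rebased by keeping its offset from the source queue's most_recent_timestamp and re-anchoring it on the destination's. A tiny sketch with invented clock values:

/* Sketch only: rebasing a task timestamp between two per-runqueue clocks. */
#include <stdio.h>

int main(void)
{
	unsigned long long src_now = 5000, dst_now = 9200;	/* invented clock readings */
	unsigned long long p_timestamp = 4700;			/* 300 units before src_now */

	/* same arithmetic as pull_task(): preserve the "300 units ago" offset */
	p_timestamp = (p_timestamp - src_now) + dst_now;

	printf("timestamp on destination clock: %llu\n", p_timestamp);	/* 8900 */
	return 0;
}
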
2106 | 2106 | ||
2107 | /* | 2107 | /* |
2108 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? | 2108 | * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? |
2109 | */ | 2109 | */ |
2110 | static | 2110 | static |
2111 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | 2111 | int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, |
2112 | struct sched_domain *sd, enum idle_type idle, | 2112 | struct sched_domain *sd, enum idle_type idle, |
2113 | int *all_pinned) | 2113 | int *all_pinned) |
2114 | { | 2114 | { |
2115 | /* | 2115 | /* |
2116 | * We do not migrate tasks that are: | 2116 | * We do not migrate tasks that are: |
2117 | * 1) running (obviously), or | 2117 | * 1) running (obviously), or |
2118 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | 2118 | * 2) cannot be migrated to this CPU due to cpus_allowed, or |
2119 | * 3) are cache-hot on their current CPU. | 2119 | * 3) are cache-hot on their current CPU. |
2120 | */ | 2120 | */ |
2121 | if (!cpu_isset(this_cpu, p->cpus_allowed)) | 2121 | if (!cpu_isset(this_cpu, p->cpus_allowed)) |
2122 | return 0; | 2122 | return 0; |
2123 | *all_pinned = 0; | 2123 | *all_pinned = 0; |
2124 | 2124 | ||
2125 | if (task_running(rq, p)) | 2125 | if (task_running(rq, p)) |
2126 | return 0; | 2126 | return 0; |
2127 | 2127 | ||
2128 | /* | 2128 | /* |
2129 | * Aggressive migration if: | 2129 | * Aggressive migration if: |
2130 | * 1) task is cache cold, or | 2130 | * 1) task is cache cold, or |
2131 | * 2) too many balance attempts have failed. | 2131 | * 2) too many balance attempts have failed. |
2132 | */ | 2132 | */ |
2133 | 2133 | ||
2134 | if (sd->nr_balance_failed > sd->cache_nice_tries) { | 2134 | if (sd->nr_balance_failed > sd->cache_nice_tries) { |
2135 | #ifdef CONFIG_SCHEDSTATS | 2135 | #ifdef CONFIG_SCHEDSTATS |
2136 | if (task_hot(p, rq->most_recent_timestamp, sd)) | 2136 | if (task_hot(p, rq->most_recent_timestamp, sd)) |
2137 | schedstat_inc(sd, lb_hot_gained[idle]); | 2137 | schedstat_inc(sd, lb_hot_gained[idle]); |
2138 | #endif | 2138 | #endif |
2139 | return 1; | 2139 | return 1; |
2140 | } | 2140 | } |
2141 | 2141 | ||
2142 | if (task_hot(p, rq->most_recent_timestamp, sd)) | 2142 | if (task_hot(p, rq->most_recent_timestamp, sd)) |
2143 | return 0; | 2143 | return 0; |
2144 | return 1; | 2144 | return 1; |
2145 | } | 2145 | } |
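
Condensed, the filter above says: never take a pinned or currently running task, and only take a cache-hot task once enough balance attempts have failed that aggressive migration is justified. A compact sketch of that decision (all inputs invented):

/* Sketch only: the migration filter applied by can_migrate_task(). */
#include <stdio.h>

static int can_migrate_sketch(int allowed_here, int running, int cache_hot,
			      int nr_balance_failed, int cache_nice_tries)
{
	if (!allowed_here || running)
		return 0;			/* rules 1 and 2: never movable */
	if (nr_balance_failed > cache_nice_tries)
		return 1;			/* aggressive: ignore cache heat */
	return !cache_hot;			/* otherwise only cache-cold tasks */
}

int main(void)
{
	printf("hot task, few failed balances:  %d\n",
	       can_migrate_sketch(1, 0, 1, 1, 2));	/* 0: left where it is */
	printf("hot task, many failed balances: %d\n",
	       can_migrate_sketch(1, 0, 1, 3, 2));	/* 1: pulled anyway */
	return 0;
}
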
2146 | 2146 | ||
2147 | #define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) | 2147 | #define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) |
2148 | 2148 | ||
2149 | /* | 2149 | /* |
2150 | * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted | 2150 | * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted |
2151 | * load from busiest to this_rq, as part of a balancing operation within | 2151 | * load from busiest to this_rq, as part of a balancing operation within |
2152 | * "domain". Returns the number of tasks moved. | 2152 | * "domain". Returns the number of tasks moved. |
2153 | * | 2153 | * |
2154 | * Called with both runqueues locked. | 2154 | * Called with both runqueues locked. |
2155 | */ | 2155 | */ |
2156 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | 2156 | static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, |
2157 | unsigned long max_nr_move, unsigned long max_load_move, | 2157 | unsigned long max_nr_move, unsigned long max_load_move, |
2158 | struct sched_domain *sd, enum idle_type idle, | 2158 | struct sched_domain *sd, enum idle_type idle, |
2159 | int *all_pinned) | 2159 | int *all_pinned) |
2160 | { | 2160 | { |
2161 | int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, | 2161 | int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, |
2162 | best_prio_seen, skip_for_load; | 2162 | best_prio_seen, skip_for_load; |
2163 | struct prio_array *array, *dst_array; | 2163 | struct prio_array *array, *dst_array; |
2164 | struct list_head *head, *curr; | 2164 | struct list_head *head, *curr; |
2165 | struct task_struct *tmp; | 2165 | struct task_struct *tmp; |
2166 | long rem_load_move; | 2166 | long rem_load_move; |
2167 | 2167 | ||
2168 | if (max_nr_move == 0 || max_load_move == 0) | 2168 | if (max_nr_move == 0 || max_load_move == 0) |
2169 | goto out; | 2169 | goto out; |
2170 | 2170 | ||
2171 | rem_load_move = max_load_move; | 2171 | rem_load_move = max_load_move; |
2172 | pinned = 1; | 2172 | pinned = 1; |
2173 | this_best_prio = rq_best_prio(this_rq); | 2173 | this_best_prio = rq_best_prio(this_rq); |
2174 | best_prio = rq_best_prio(busiest); | 2174 | best_prio = rq_best_prio(busiest); |
2175 | /* | 2175 | /* |
2176 | * Enable handling of the case where there is more than one task | 2176 | * Enable handling of the case where there is more than one task |
2177 | * with the best priority. If the current running task is one | 2177 | * with the best priority. If the current running task is one |
2178 | * of those with prio==best_prio we know it won't be moved | 2178 | * of those with prio==best_prio we know it won't be moved |
2179 | * and therefore it's safe to override the skip (based on load) of | 2179 | * and therefore it's safe to override the skip (based on load) of |
2180 | * any task we find with that prio. | 2180 | * any task we find with that prio. |
2181 | */ | 2181 | */ |
2182 | best_prio_seen = best_prio == busiest->curr->prio; | 2182 | best_prio_seen = best_prio == busiest->curr->prio; |
2183 | 2183 | ||
2184 | /* | 2184 | /* |
2185 | * We first consider expired tasks. Those will likely not be | 2185 | * We first consider expired tasks. Those will likely not be |
2186 | * executed in the near future, and they are most likely to | 2186 | * executed in the near future, and they are most likely to |
2187 | * be cache-cold, thus switching CPUs has the least effect | 2187 | * be cache-cold, thus switching CPUs has the least effect |
2188 | * on them. | 2188 | * on them. |
2189 | */ | 2189 | */ |
2190 | if (busiest->expired->nr_active) { | 2190 | if (busiest->expired->nr_active) { |
2191 | array = busiest->expired; | 2191 | array = busiest->expired; |
2192 | dst_array = this_rq->expired; | 2192 | dst_array = this_rq->expired; |
2193 | } else { | 2193 | } else { |
2194 | array = busiest->active; | 2194 | array = busiest->active; |
2195 | dst_array = this_rq->active; | 2195 | dst_array = this_rq->active; |
2196 | } | 2196 | } |
2197 | 2197 | ||
2198 | new_array: | 2198 | new_array: |
2199 | /* Start searching at priority 0: */ | 2199 | /* Start searching at priority 0: */ |
2200 | idx = 0; | 2200 | idx = 0; |
2201 | skip_bitmap: | 2201 | skip_bitmap: |
2202 | if (!idx) | 2202 | if (!idx) |
2203 | idx = sched_find_first_bit(array->bitmap); | 2203 | idx = sched_find_first_bit(array->bitmap); |
2204 | else | 2204 | else |
2205 | idx = find_next_bit(array->bitmap, MAX_PRIO, idx); | 2205 | idx = find_next_bit(array->bitmap, MAX_PRIO, idx); |
2206 | if (idx >= MAX_PRIO) { | 2206 | if (idx >= MAX_PRIO) { |
2207 | if (array == busiest->expired && busiest->active->nr_active) { | 2207 | if (array == busiest->expired && busiest->active->nr_active) { |
2208 | array = busiest->active; | 2208 | array = busiest->active; |
2209 | dst_array = this_rq->active; | 2209 | dst_array = this_rq->active; |
2210 | goto new_array; | 2210 | goto new_array; |
2211 | } | 2211 | } |
2212 | goto out; | 2212 | goto out; |
2213 | } | 2213 | } |
2214 | 2214 | ||
2215 | head = array->queue + idx; | 2215 | head = array->queue + idx; |
2216 | curr = head->prev; | 2216 | curr = head->prev; |
2217 | skip_queue: | 2217 | skip_queue: |
2218 | tmp = list_entry(curr, struct task_struct, run_list); | 2218 | tmp = list_entry(curr, struct task_struct, run_list); |
2219 | 2219 | ||
2220 | curr = curr->prev; | 2220 | curr = curr->prev; |
2221 | 2221 | ||
2222 | /* | 2222 | /* |
2223 | * To help distribute high priority tasks across CPUs we don't | 2223 | * To help distribute high priority tasks across CPUs we don't |
2224 | * skip a task if it will be the highest priority task (i.e. smallest | 2224 | * skip a task if it will be the highest priority task (i.e. smallest |
2225 | * prio value) on its new queue regardless of its load weight | 2225 | * prio value) on its new queue regardless of its load weight |
2226 | */ | 2226 | */ |
2227 | skip_for_load = tmp->load_weight > rem_load_move; | 2227 | skip_for_load = tmp->load_weight > rem_load_move; |
2228 | if (skip_for_load && idx < this_best_prio) | 2228 | if (skip_for_load && idx < this_best_prio) |
2229 | skip_for_load = !best_prio_seen && idx == best_prio; | 2229 | skip_for_load = !best_prio_seen && idx == best_prio; |
2230 | if (skip_for_load || | 2230 | if (skip_for_load || |
2231 | !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { | 2231 | !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { |
2232 | 2232 | ||
2233 | best_prio_seen |= idx == best_prio; | 2233 | best_prio_seen |= idx == best_prio; |
2234 | if (curr != head) | 2234 | if (curr != head) |
2235 | goto skip_queue; | 2235 | goto skip_queue; |
2236 | idx++; | 2236 | idx++; |
2237 | goto skip_bitmap; | 2237 | goto skip_bitmap; |
2238 | } | 2238 | } |
2239 | 2239 | ||
2240 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); | 2240 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); |
2241 | pulled++; | 2241 | pulled++; |
2242 | rem_load_move -= tmp->load_weight; | 2242 | rem_load_move -= tmp->load_weight; |
2243 | 2243 | ||
2244 | /* | 2244 | /* |
2245 | * We only want to steal up to the prescribed number of tasks | 2245 | * We only want to steal up to the prescribed number of tasks |
2246 | * and the prescribed amount of weighted load. | 2246 | * and the prescribed amount of weighted load. |
2247 | */ | 2247 | */ |
2248 | if (pulled < max_nr_move && rem_load_move > 0) { | 2248 | if (pulled < max_nr_move && rem_load_move > 0) { |
2249 | if (idx < this_best_prio) | 2249 | if (idx < this_best_prio) |
2250 | this_best_prio = idx; | 2250 | this_best_prio = idx; |
2251 | if (curr != head) | 2251 | if (curr != head) |
2252 | goto skip_queue; | 2252 | goto skip_queue; |
2253 | idx++; | 2253 | idx++; |
2254 | goto skip_bitmap; | 2254 | goto skip_bitmap; |
2255 | } | 2255 | } |
2256 | out: | 2256 | out: |
2257 | /* | 2257 | /* |
2258 | * Right now, this is the only place pull_task() is called, | 2258 | * Right now, this is the only place pull_task() is called, |
2259 | * so we can safely collect pull_task() stats here rather than | 2259 | * so we can safely collect pull_task() stats here rather than |
2260 | * inside pull_task(). | 2260 | * inside pull_task(). |
2261 | */ | 2261 | */ |
2262 | schedstat_add(sd, lb_gained[idle], pulled); | 2262 | schedstat_add(sd, lb_gained[idle], pulled); |
2263 | 2263 | ||
2264 | if (all_pinned) | 2264 | if (all_pinned) |
2265 | *all_pinned = pinned; | 2265 | *all_pinned = pinned; |
2266 | return pulled; | 2266 | return pulled; |
2267 | } | 2267 | } |
2268 | 2268 | ||
2269 | /* | 2269 | /* |
2270 | * find_busiest_group finds and returns the busiest CPU group within the | 2270 | * find_busiest_group finds and returns the busiest CPU group within the |
2271 | * domain. It calculates and returns the amount of weighted load which | 2271 | * domain. It calculates and returns the amount of weighted load which |
2272 | * should be moved to restore balance via the imbalance parameter. | 2272 | * should be moved to restore balance via the imbalance parameter. |
2273 | */ | 2273 | */ |
2274 | static struct sched_group * | 2274 | static struct sched_group * |
2275 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 2275 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
2276 | unsigned long *imbalance, enum idle_type idle, int *sd_idle, | 2276 | unsigned long *imbalance, enum idle_type idle, int *sd_idle, |
2277 | cpumask_t *cpus, int *balance) | 2277 | cpumask_t *cpus, int *balance) |
2278 | { | 2278 | { |
2279 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 2279 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
2280 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 2280 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
2281 | unsigned long max_pull; | 2281 | unsigned long max_pull; |
2282 | unsigned long busiest_load_per_task, busiest_nr_running; | 2282 | unsigned long busiest_load_per_task, busiest_nr_running; |
2283 | unsigned long this_load_per_task, this_nr_running; | 2283 | unsigned long this_load_per_task, this_nr_running; |
2284 | int load_idx; | 2284 | int load_idx; |
2285 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 2285 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
2286 | int power_savings_balance = 1; | 2286 | int power_savings_balance = 1; |
2287 | unsigned long leader_nr_running = 0, min_load_per_task = 0; | 2287 | unsigned long leader_nr_running = 0, min_load_per_task = 0; |
2288 | unsigned long min_nr_running = ULONG_MAX; | 2288 | unsigned long min_nr_running = ULONG_MAX; |
2289 | struct sched_group *group_min = NULL, *group_leader = NULL; | 2289 | struct sched_group *group_min = NULL, *group_leader = NULL; |
2290 | #endif | 2290 | #endif |
2291 | 2291 | ||
2292 | max_load = this_load = total_load = total_pwr = 0; | 2292 | max_load = this_load = total_load = total_pwr = 0; |
2293 | busiest_load_per_task = busiest_nr_running = 0; | 2293 | busiest_load_per_task = busiest_nr_running = 0; |
2294 | this_load_per_task = this_nr_running = 0; | 2294 | this_load_per_task = this_nr_running = 0; |
2295 | if (idle == NOT_IDLE) | 2295 | if (idle == NOT_IDLE) |
2296 | load_idx = sd->busy_idx; | 2296 | load_idx = sd->busy_idx; |
2297 | else if (idle == NEWLY_IDLE) | 2297 | else if (idle == NEWLY_IDLE) |
2298 | load_idx = sd->newidle_idx; | 2298 | load_idx = sd->newidle_idx; |
2299 | else | 2299 | else |
2300 | load_idx = sd->idle_idx; | 2300 | load_idx = sd->idle_idx; |
2301 | 2301 | ||
2302 | do { | 2302 | do { |
2303 | unsigned long load, group_capacity; | 2303 | unsigned long load, group_capacity; |
2304 | int local_group; | 2304 | int local_group; |
2305 | int i; | 2305 | int i; |
2306 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | 2306 | unsigned int balance_cpu = -1, first_idle_cpu = 0; |
2307 | unsigned long sum_nr_running, sum_weighted_load; | 2307 | unsigned long sum_nr_running, sum_weighted_load; |
2308 | 2308 | ||
2309 | local_group = cpu_isset(this_cpu, group->cpumask); | 2309 | local_group = cpu_isset(this_cpu, group->cpumask); |
2310 | 2310 | ||
2311 | if (local_group) | 2311 | if (local_group) |
2312 | balance_cpu = first_cpu(group->cpumask); | 2312 | balance_cpu = first_cpu(group->cpumask); |
2313 | 2313 | ||
2314 | /* Tally up the load of all CPUs in the group */ | 2314 | /* Tally up the load of all CPUs in the group */ |
2315 | sum_weighted_load = sum_nr_running = avg_load = 0; | 2315 | sum_weighted_load = sum_nr_running = avg_load = 0; |
2316 | 2316 | ||
2317 | for_each_cpu_mask(i, group->cpumask) { | 2317 | for_each_cpu_mask(i, group->cpumask) { |
2318 | struct rq *rq; | 2318 | struct rq *rq; |
2319 | 2319 | ||
2320 | if (!cpu_isset(i, *cpus)) | 2320 | if (!cpu_isset(i, *cpus)) |
2321 | continue; | 2321 | continue; |
2322 | 2322 | ||
2323 | rq = cpu_rq(i); | 2323 | rq = cpu_rq(i); |
2324 | 2324 | ||
2325 | if (*sd_idle && !idle_cpu(i)) | 2325 | if (*sd_idle && !idle_cpu(i)) |
2326 | *sd_idle = 0; | 2326 | *sd_idle = 0; |
2327 | 2327 | ||
2328 | /* Bias balancing toward cpus of our domain */ | 2328 | /* Bias balancing toward cpus of our domain */ |
2329 | if (local_group) { | 2329 | if (local_group) { |
2330 | if (idle_cpu(i) && !first_idle_cpu) { | 2330 | if (idle_cpu(i) && !first_idle_cpu) { |
2331 | first_idle_cpu = 1; | 2331 | first_idle_cpu = 1; |
2332 | balance_cpu = i; | 2332 | balance_cpu = i; |
2333 | } | 2333 | } |
2334 | 2334 | ||
2335 | load = target_load(i, load_idx); | 2335 | load = target_load(i, load_idx); |
2336 | } else | 2336 | } else |
2337 | load = source_load(i, load_idx); | 2337 | load = source_load(i, load_idx); |
2338 | 2338 | ||
2339 | avg_load += load; | 2339 | avg_load += load; |
2340 | sum_nr_running += rq->nr_running; | 2340 | sum_nr_running += rq->nr_running; |
2341 | sum_weighted_load += rq->raw_weighted_load; | 2341 | sum_weighted_load += rq->raw_weighted_load; |
2342 | } | 2342 | } |
2343 | 2343 | ||
2344 | /* | 2344 | /* |
2345 | * First idle cpu or the first cpu(busiest) in this sched group | 2345 | * First idle cpu or the first cpu(busiest) in this sched group |
2346 | * is eligible for doing load balancing at this domain and | 2346 | * is eligible for doing load balancing at this domain and |
2347 | * the domains above it. | 2347 | * the domains above it. |
2348 | */ | 2348 | */ |
2349 | if (local_group && balance_cpu != this_cpu && balance) { | 2349 | if (local_group && balance_cpu != this_cpu && balance) { |
2350 | *balance = 0; | 2350 | *balance = 0; |
2351 | goto ret; | 2351 | goto ret; |
2352 | } | 2352 | } |
2353 | 2353 | ||
2354 | total_load += avg_load; | 2354 | total_load += avg_load; |
2355 | total_pwr += group->cpu_power; | 2355 | total_pwr += group->cpu_power; |
2356 | 2356 | ||
2357 | /* Adjust by relative CPU power of the group */ | 2357 | /* Adjust by relative CPU power of the group */ |
2358 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; | 2358 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; |
2359 | 2359 | ||
2360 | group_capacity = group->cpu_power / SCHED_LOAD_SCALE; | 2360 | group_capacity = group->cpu_power / SCHED_LOAD_SCALE; |
2361 | 2361 | ||
2362 | if (local_group) { | 2362 | if (local_group) { |
2363 | this_load = avg_load; | 2363 | this_load = avg_load; |
2364 | this = group; | 2364 | this = group; |
2365 | this_nr_running = sum_nr_running; | 2365 | this_nr_running = sum_nr_running; |
2366 | this_load_per_task = sum_weighted_load; | 2366 | this_load_per_task = sum_weighted_load; |
2367 | } else if (avg_load > max_load && | 2367 | } else if (avg_load > max_load && |
2368 | sum_nr_running > group_capacity) { | 2368 | sum_nr_running > group_capacity) { |
2369 | max_load = avg_load; | 2369 | max_load = avg_load; |
2370 | busiest = group; | 2370 | busiest = group; |
2371 | busiest_nr_running = sum_nr_running; | 2371 | busiest_nr_running = sum_nr_running; |
2372 | busiest_load_per_task = sum_weighted_load; | 2372 | busiest_load_per_task = sum_weighted_load; |
2373 | } | 2373 | } |
2374 | 2374 | ||
2375 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 2375 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
2376 | /* | 2376 | /* |
2377 | * Busy processors will not participate in power savings | 2377 | * Busy processors will not participate in power savings |
2378 | * balance. | 2378 | * balance. |
2379 | */ | 2379 | */ |
2380 | if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | 2380 | if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) |
2381 | goto group_next; | 2381 | goto group_next; |
2382 | 2382 | ||
2383 | /* | 2383 | /* |
2384 | * If the local group is idle or completely loaded | 2384 | * If the local group is idle or completely loaded |
2385 | * there is no need to do power savings balance at this domain | 2385 | * there is no need to do power savings balance at this domain |
2386 | */ | 2386 | */ |
2387 | if (local_group && (this_nr_running >= group_capacity || | 2387 | if (local_group && (this_nr_running >= group_capacity || |
2388 | !this_nr_running)) | 2388 | !this_nr_running)) |
2389 | power_savings_balance = 0; | 2389 | power_savings_balance = 0; |
2390 | 2390 | ||
2391 | /* | 2391 | /* |
2392 | * If a group is already running at full capacity or idle, | 2392 | * If a group is already running at full capacity or idle, |
2393 | * don't include that group in power savings calculations | 2393 | * don't include that group in power savings calculations |
2394 | */ | 2394 | */ |
2395 | if (!power_savings_balance || sum_nr_running >= group_capacity | 2395 | if (!power_savings_balance || sum_nr_running >= group_capacity |
2396 | || !sum_nr_running) | 2396 | || !sum_nr_running) |
2397 | goto group_next; | 2397 | goto group_next; |
2398 | 2398 | ||
2399 | /* | 2399 | /* |
2400 | * Calculate the group which has the least non-idle load. | 2400 | * Calculate the group which has the least non-idle load. |
2401 | * This is the group from which we need to pick up the load | 2401 | * This is the group from which we need to pick up the load |
2402 | * for saving power | 2402 | * for saving power |
2403 | */ | 2403 | */ |
2404 | if ((sum_nr_running < min_nr_running) || | 2404 | if ((sum_nr_running < min_nr_running) || |
2405 | (sum_nr_running == min_nr_running && | 2405 | (sum_nr_running == min_nr_running && |
2406 | first_cpu(group->cpumask) < | 2406 | first_cpu(group->cpumask) < |
2407 | first_cpu(group_min->cpumask))) { | 2407 | first_cpu(group_min->cpumask))) { |
2408 | group_min = group; | 2408 | group_min = group; |
2409 | min_nr_running = sum_nr_running; | 2409 | min_nr_running = sum_nr_running; |
2410 | min_load_per_task = sum_weighted_load / | 2410 | min_load_per_task = sum_weighted_load / |
2411 | sum_nr_running; | 2411 | sum_nr_running; |
2412 | } | 2412 | } |
2413 | 2413 | ||
2414 | /* | 2414 | /* |
2415 | * Calculate the group which is nearly at its | 2415 | * Calculate the group which is nearly at its |
2416 | * capacity but still has some space to pick up some load | 2416 | * capacity but still has some space to pick up some load |
2417 | * from another group and save more power | 2417 | * from another group and save more power |
2418 | */ | 2418 | */ |
2419 | if (sum_nr_running <= group_capacity - 1) { | 2419 | if (sum_nr_running <= group_capacity - 1) { |
2420 | if (sum_nr_running > leader_nr_running || | 2420 | if (sum_nr_running > leader_nr_running || |
2421 | (sum_nr_running == leader_nr_running && | 2421 | (sum_nr_running == leader_nr_running && |
2422 | first_cpu(group->cpumask) > | 2422 | first_cpu(group->cpumask) > |
2423 | first_cpu(group_leader->cpumask))) { | 2423 | first_cpu(group_leader->cpumask))) { |
2424 | group_leader = group; | 2424 | group_leader = group; |
2425 | leader_nr_running = sum_nr_running; | 2425 | leader_nr_running = sum_nr_running; |
2426 | } | 2426 | } |
2427 | } | 2427 | } |
2428 | group_next: | 2428 | group_next: |
2429 | #endif | 2429 | #endif |
2430 | group = group->next; | 2430 | group = group->next; |
2431 | } while (group != sd->groups); | 2431 | } while (group != sd->groups); |
2432 | 2432 | ||
2433 | if (!busiest || this_load >= max_load || busiest_nr_running == 0) | 2433 | if (!busiest || this_load >= max_load || busiest_nr_running == 0) |
2434 | goto out_balanced; | 2434 | goto out_balanced; |
2435 | 2435 | ||
2436 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; | 2436 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; |
2437 | 2437 | ||
2438 | if (this_load >= avg_load || | 2438 | if (this_load >= avg_load || |
2439 | 100*max_load <= sd->imbalance_pct*this_load) | 2439 | 100*max_load <= sd->imbalance_pct*this_load) |
2440 | goto out_balanced; | 2440 | goto out_balanced; |
2441 | 2441 | ||
2442 | busiest_load_per_task /= busiest_nr_running; | 2442 | busiest_load_per_task /= busiest_nr_running; |
2443 | /* | 2443 | /* |
2444 | * We're trying to get all the cpus to the average_load, so we don't | 2444 | * We're trying to get all the cpus to the average_load, so we don't |
2445 | * want to push ourselves above the average load, nor do we wish to | 2445 | * want to push ourselves above the average load, nor do we wish to |
2446 | * reduce the max loaded cpu below the average load, as either of these | 2446 | * reduce the max loaded cpu below the average load, as either of these |
2447 | * actions would just result in more rebalancing later, and ping-pong | 2447 | * actions would just result in more rebalancing later, and ping-pong |
2448 | * tasks around. Thus we look for the minimum possible imbalance. | 2448 | * tasks around. Thus we look for the minimum possible imbalance. |
2449 | * Negative imbalances (*we* are more loaded than anyone else) will | 2449 | * Negative imbalances (*we* are more loaded than anyone else) will |
2450 | * be counted as no imbalance for these purposes -- we can't fix that | 2450 | * be counted as no imbalance for these purposes -- we can't fix that |
2451 | * by pulling tasks to us. Be careful of negative numbers as they'll | 2451 | * by pulling tasks to us. Be careful of negative numbers as they'll |
2452 | * appear as very large values with unsigned longs. | 2452 | * appear as very large values with unsigned longs. |
2453 | */ | 2453 | */ |
2454 | if (max_load <= busiest_load_per_task) | 2454 | if (max_load <= busiest_load_per_task) |
2455 | goto out_balanced; | 2455 | goto out_balanced; |
2456 | 2456 | ||
2457 | /* | 2457 | /* |
2458 | * In the presence of smp nice balancing, certain scenarios can have | 2458 | * In the presence of smp nice balancing, certain scenarios can have |
2459 | * max load less than avg load (as we skip the groups at or below | 2459 | * max load less than avg load (as we skip the groups at or below |
2460 | * their cpu_power while calculating max_load). | 2460 | * their cpu_power while calculating max_load). |
2461 | */ | 2461 | */ |
2462 | if (max_load < avg_load) { | 2462 | if (max_load < avg_load) { |
2463 | *imbalance = 0; | 2463 | *imbalance = 0; |
2464 | goto small_imbalance; | 2464 | goto small_imbalance; |
2465 | } | 2465 | } |
2466 | 2466 | ||
2467 | /* Don't want to pull so many tasks that a group would go idle */ | 2467 | /* Don't want to pull so many tasks that a group would go idle */ |
2468 | max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); | 2468 | max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); |
2469 | 2469 | ||
2470 | /* How much load to actually move to equalise the imbalance */ | 2470 | /* How much load to actually move to equalise the imbalance */ |
2471 | *imbalance = min(max_pull * busiest->cpu_power, | 2471 | *imbalance = min(max_pull * busiest->cpu_power, |
2472 | (avg_load - this_load) * this->cpu_power) | 2472 | (avg_load - this_load) * this->cpu_power) |
2473 | / SCHED_LOAD_SCALE; | 2473 | / SCHED_LOAD_SCALE; |
2474 | 2474 | ||
2475 | /* | 2475 | /* |
2476 | * if *imbalance is less than the average load per runnable task | 2476 | * if *imbalance is less than the average load per runnable task |
2477 | * there is no guarantee that any tasks will be moved so we'll have | 2477 | * there is no guarantee that any tasks will be moved so we'll have |
2478 | * a think about bumping its value to force at least one task to be | 2478 | * a think about bumping its value to force at least one task to be |
2479 | * moved | 2479 | * moved |
2480 | */ | 2480 | */ |
2481 | if (*imbalance < busiest_load_per_task) { | 2481 | if (*imbalance < busiest_load_per_task) { |
2482 | unsigned long tmp, pwr_now, pwr_move; | 2482 | unsigned long tmp, pwr_now, pwr_move; |
2483 | unsigned int imbn; | 2483 | unsigned int imbn; |
2484 | 2484 | ||
2485 | small_imbalance: | 2485 | small_imbalance: |
2486 | pwr_move = pwr_now = 0; | 2486 | pwr_move = pwr_now = 0; |
2487 | imbn = 2; | 2487 | imbn = 2; |
2488 | if (this_nr_running) { | 2488 | if (this_nr_running) { |
2489 | this_load_per_task /= this_nr_running; | 2489 | this_load_per_task /= this_nr_running; |
2490 | if (busiest_load_per_task > this_load_per_task) | 2490 | if (busiest_load_per_task > this_load_per_task) |
2491 | imbn = 1; | 2491 | imbn = 1; |
2492 | } else | 2492 | } else |
2493 | this_load_per_task = SCHED_LOAD_SCALE; | 2493 | this_load_per_task = SCHED_LOAD_SCALE; |
2494 | 2494 | ||
2495 | if (max_load - this_load >= busiest_load_per_task * imbn) { | 2495 | if (max_load - this_load >= busiest_load_per_task * imbn) { |
2496 | *imbalance = busiest_load_per_task; | 2496 | *imbalance = busiest_load_per_task; |
2497 | return busiest; | 2497 | return busiest; |
2498 | } | 2498 | } |
2499 | 2499 | ||
2500 | /* | 2500 | /* |
2501 | * OK, we don't have enough imbalance to justify moving tasks, | 2501 | * OK, we don't have enough imbalance to justify moving tasks, |
2502 | * however we may be able to increase total CPU power used by | 2502 | * however we may be able to increase total CPU power used by |
2503 | * moving them. | 2503 | * moving them. |
2504 | */ | 2504 | */ |
2505 | 2505 | ||
2506 | pwr_now += busiest->cpu_power * | 2506 | pwr_now += busiest->cpu_power * |
2507 | min(busiest_load_per_task, max_load); | 2507 | min(busiest_load_per_task, max_load); |
2508 | pwr_now += this->cpu_power * | 2508 | pwr_now += this->cpu_power * |
2509 | min(this_load_per_task, this_load); | 2509 | min(this_load_per_task, this_load); |
2510 | pwr_now /= SCHED_LOAD_SCALE; | 2510 | pwr_now /= SCHED_LOAD_SCALE; |
2511 | 2511 | ||
2512 | /* Amount of load we'd subtract */ | 2512 | /* Amount of load we'd subtract */ |
2513 | tmp = busiest_load_per_task * SCHED_LOAD_SCALE / | 2513 | tmp = busiest_load_per_task * SCHED_LOAD_SCALE / |
2514 | busiest->cpu_power; | 2514 | busiest->cpu_power; |
2515 | if (max_load > tmp) | 2515 | if (max_load > tmp) |
2516 | pwr_move += busiest->cpu_power * | 2516 | pwr_move += busiest->cpu_power * |
2517 | min(busiest_load_per_task, max_load - tmp); | 2517 | min(busiest_load_per_task, max_load - tmp); |
2518 | 2518 | ||
2519 | /* Amount of load we'd add */ | 2519 | /* Amount of load we'd add */ |
2520 | if (max_load * busiest->cpu_power < | 2520 | if (max_load * busiest->cpu_power < |
2521 | busiest_load_per_task * SCHED_LOAD_SCALE) | 2521 | busiest_load_per_task * SCHED_LOAD_SCALE) |
2522 | tmp = max_load * busiest->cpu_power / this->cpu_power; | 2522 | tmp = max_load * busiest->cpu_power / this->cpu_power; |
2523 | else | 2523 | else |
2524 | tmp = busiest_load_per_task * SCHED_LOAD_SCALE / | 2524 | tmp = busiest_load_per_task * SCHED_LOAD_SCALE / |
2525 | this->cpu_power; | 2525 | this->cpu_power; |
2526 | pwr_move += this->cpu_power * | 2526 | pwr_move += this->cpu_power * |
2527 | min(this_load_per_task, this_load + tmp); | 2527 | min(this_load_per_task, this_load + tmp); |
2528 | pwr_move /= SCHED_LOAD_SCALE; | 2528 | pwr_move /= SCHED_LOAD_SCALE; |
2529 | 2529 | ||
2530 | /* Move if we gain throughput */ | 2530 | /* Move if we gain throughput */ |
2531 | if (pwr_move <= pwr_now) | 2531 | if (pwr_move <= pwr_now) |
2532 | goto out_balanced; | 2532 | goto out_balanced; |
2533 | 2533 | ||
2534 | *imbalance = busiest_load_per_task; | 2534 | *imbalance = busiest_load_per_task; |
2535 | } | 2535 | } |
2536 | 2536 | ||
2537 | return busiest; | 2537 | return busiest; |
2538 | 2538 | ||
2539 | out_balanced: | 2539 | out_balanced: |
2540 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 2540 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
2541 | if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | 2541 | if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) |
2542 | goto ret; | 2542 | goto ret; |
2543 | 2543 | ||
2544 | if (this == group_leader && group_leader != group_min) { | 2544 | if (this == group_leader && group_leader != group_min) { |
2545 | *imbalance = min_load_per_task; | 2545 | *imbalance = min_load_per_task; |
2546 | return group_min; | 2546 | return group_min; |
2547 | } | 2547 | } |
2548 | #endif | 2548 | #endif |
2549 | ret: | 2549 | ret: |
2550 | *imbalance = 0; | 2550 | *imbalance = 0; |
2551 | return NULL; | 2551 | return NULL; |
2552 | } | 2552 | } |
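
The central imbalance computation above caps the amount to move twice: never pull more than would bring the busiest group down to the average (or below one task's worth of load), and never more than would lift the local group above the average. A worked sketch with invented loads, taking SCHED_LOAD_SCALE as 128 purely for illustration:

/* Sketch only: the main imbalance formula from find_busiest_group(). */
#include <stdio.h>

#define SCALE 128	/* stands in for SCHED_LOAD_SCALE; illustrative value */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long max_load = 900, this_load = 300, avg_load = 600;	/* invented */
	unsigned long busiest_load_per_task = 200;
	unsigned long busiest_power = SCALE, this_power = SCALE;

	/* don't pull so many tasks that the busiest group would go idle */
	unsigned long max_pull = min_ul(max_load - avg_load,
					max_load - busiest_load_per_task);

	/* how much weighted load to actually move to equalise the imbalance */
	unsigned long imbalance = min_ul(max_pull * busiest_power,
					 (avg_load - this_load) * this_power) / SCALE;

	printf("weighted load to move: %lu\n", imbalance);	/* 300 */
	return 0;
}
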
2553 | 2553 | ||
2554 | /* | 2554 | /* |
2555 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 2555 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
2556 | */ | 2556 | */ |
2557 | static struct rq * | 2557 | static struct rq * |
2558 | find_busiest_queue(struct sched_group *group, enum idle_type idle, | 2558 | find_busiest_queue(struct sched_group *group, enum idle_type idle, |
2559 | unsigned long imbalance, cpumask_t *cpus) | 2559 | unsigned long imbalance, cpumask_t *cpus) |
2560 | { | 2560 | { |
2561 | struct rq *busiest = NULL, *rq; | 2561 | struct rq *busiest = NULL, *rq; |
2562 | unsigned long max_load = 0; | 2562 | unsigned long max_load = 0; |
2563 | int i; | 2563 | int i; |
2564 | 2564 | ||
2565 | for_each_cpu_mask(i, group->cpumask) { | 2565 | for_each_cpu_mask(i, group->cpumask) { |
2566 | 2566 | ||
2567 | if (!cpu_isset(i, *cpus)) | 2567 | if (!cpu_isset(i, *cpus)) |
2568 | continue; | 2568 | continue; |
2569 | 2569 | ||
2570 | rq = cpu_rq(i); | 2570 | rq = cpu_rq(i); |
2571 | 2571 | ||
2572 | if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) | 2572 | if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) |
2573 | continue; | 2573 | continue; |
2574 | 2574 | ||
2575 | if (rq->raw_weighted_load > max_load) { | 2575 | if (rq->raw_weighted_load > max_load) { |
2576 | max_load = rq->raw_weighted_load; | 2576 | max_load = rq->raw_weighted_load; |
2577 | busiest = rq; | 2577 | busiest = rq; |
2578 | } | 2578 | } |
2579 | } | 2579 | } |
2580 | 2580 | ||
2581 | return busiest; | 2581 | return busiest; |
2582 | } | 2582 | } |
2583 | 2583 | ||
2584 | /* | 2584 | /* |
2585 | * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but | 2585 | * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but |
2586 | * it does not matter much so long as it is large enough. | 2586 | * it does not matter much so long as it is large enough. |
2587 | */ | 2587 | */ |
2588 | #define MAX_PINNED_INTERVAL 512 | 2588 | #define MAX_PINNED_INTERVAL 512 |
2589 | 2589 | ||
2590 | static inline unsigned long minus_1_or_zero(unsigned long n) | 2590 | static inline unsigned long minus_1_or_zero(unsigned long n) |
2591 | { | 2591 | { |
2592 | return n > 0 ? n - 1 : 0; | 2592 | return n > 0 ? n - 1 : 0; |
2593 | } | 2593 | } |
2594 | 2594 | ||
2595 | /* | 2595 | /* |
2596 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2596 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
2597 | * tasks if there is an imbalance. | 2597 | * tasks if there is an imbalance. |
2598 | */ | 2598 | */ |
2599 | static int load_balance(int this_cpu, struct rq *this_rq, | 2599 | static int load_balance(int this_cpu, struct rq *this_rq, |
2600 | struct sched_domain *sd, enum idle_type idle, | 2600 | struct sched_domain *sd, enum idle_type idle, |
2601 | int *balance) | 2601 | int *balance) |
2602 | { | 2602 | { |
2603 | int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | 2603 | int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; |
2604 | struct sched_group *group; | 2604 | struct sched_group *group; |
2605 | unsigned long imbalance; | 2605 | unsigned long imbalance; |
2606 | struct rq *busiest; | 2606 | struct rq *busiest; |
2607 | cpumask_t cpus = CPU_MASK_ALL; | 2607 | cpumask_t cpus = CPU_MASK_ALL; |
2608 | unsigned long flags; | 2608 | unsigned long flags; |
2609 | 2609 | ||
2610 | /* | 2610 | /* |
2611 | * When power savings policy is enabled for the parent domain, idle | 2611 | * When power savings policy is enabled for the parent domain, idle |
2612 | * sibling can pick up load irrespective of busy siblings. In this case, | 2612 | * sibling can pick up load irrespective of busy siblings. In this case, |
2613 | * let the state of idle sibling percolate up as IDLE, instead of | 2613 | * let the state of idle sibling percolate up as IDLE, instead of |
2614 | * portraying it as NOT_IDLE. | 2614 | * portraying it as NOT_IDLE. |
2615 | */ | 2615 | */ |
2616 | if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && | 2616 | if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && |
2617 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2617 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
2618 | sd_idle = 1; | 2618 | sd_idle = 1; |
2619 | 2619 | ||
2620 | schedstat_inc(sd, lb_cnt[idle]); | 2620 | schedstat_inc(sd, lb_cnt[idle]); |
2621 | 2621 | ||
2622 | redo: | 2622 | redo: |
2623 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 2623 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, |
2624 | &cpus, balance); | 2624 | &cpus, balance); |
2625 | 2625 | ||
2626 | if (*balance == 0) | 2626 | if (*balance == 0) |
2627 | goto out_balanced; | 2627 | goto out_balanced; |
2628 | 2628 | ||
2629 | if (!group) { | 2629 | if (!group) { |
2630 | schedstat_inc(sd, lb_nobusyg[idle]); | 2630 | schedstat_inc(sd, lb_nobusyg[idle]); |
2631 | goto out_balanced; | 2631 | goto out_balanced; |
2632 | } | 2632 | } |
2633 | 2633 | ||
2634 | busiest = find_busiest_queue(group, idle, imbalance, &cpus); | 2634 | busiest = find_busiest_queue(group, idle, imbalance, &cpus); |
2635 | if (!busiest) { | 2635 | if (!busiest) { |
2636 | schedstat_inc(sd, lb_nobusyq[idle]); | 2636 | schedstat_inc(sd, lb_nobusyq[idle]); |
2637 | goto out_balanced; | 2637 | goto out_balanced; |
2638 | } | 2638 | } |
2639 | 2639 | ||
2640 | BUG_ON(busiest == this_rq); | 2640 | BUG_ON(busiest == this_rq); |
2641 | 2641 | ||
2642 | schedstat_add(sd, lb_imbalance[idle], imbalance); | 2642 | schedstat_add(sd, lb_imbalance[idle], imbalance); |
2643 | 2643 | ||
2644 | nr_moved = 0; | 2644 | nr_moved = 0; |
2645 | if (busiest->nr_running > 1) { | 2645 | if (busiest->nr_running > 1) { |
2646 | /* | 2646 | /* |
2647 | * Attempt to move tasks. If find_busiest_group has found | 2647 | * Attempt to move tasks. If find_busiest_group has found |
2648 | * an imbalance but busiest->nr_running <= 1, the group is | 2648 | * an imbalance but busiest->nr_running <= 1, the group is |
2649 | * still unbalanced. nr_moved simply stays zero, so it is | 2649 | * still unbalanced. nr_moved simply stays zero, so it is |
2650 | * correctly treated as an imbalance. | 2650 | * correctly treated as an imbalance. |
2651 | */ | 2651 | */ |
2652 | local_irq_save(flags); | 2652 | local_irq_save(flags); |
2653 | double_rq_lock(this_rq, busiest); | 2653 | double_rq_lock(this_rq, busiest); |
2654 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2654 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
2655 | minus_1_or_zero(busiest->nr_running), | 2655 | minus_1_or_zero(busiest->nr_running), |
2656 | imbalance, sd, idle, &all_pinned); | 2656 | imbalance, sd, idle, &all_pinned); |
2657 | double_rq_unlock(this_rq, busiest); | 2657 | double_rq_unlock(this_rq, busiest); |
2658 | local_irq_restore(flags); | 2658 | local_irq_restore(flags); |
2659 | 2659 | ||
2660 | /* All tasks on this runqueue were pinned by CPU affinity */ | 2660 | /* All tasks on this runqueue were pinned by CPU affinity */ |
2661 | if (unlikely(all_pinned)) { | 2661 | if (unlikely(all_pinned)) { |
2662 | cpu_clear(cpu_of(busiest), cpus); | 2662 | cpu_clear(cpu_of(busiest), cpus); |
2663 | if (!cpus_empty(cpus)) | 2663 | if (!cpus_empty(cpus)) |
2664 | goto redo; | 2664 | goto redo; |
2665 | goto out_balanced; | 2665 | goto out_balanced; |
2666 | } | 2666 | } |
2667 | } | 2667 | } |
2668 | 2668 | ||
2669 | if (!nr_moved) { | 2669 | if (!nr_moved) { |
2670 | schedstat_inc(sd, lb_failed[idle]); | 2670 | schedstat_inc(sd, lb_failed[idle]); |
2671 | sd->nr_balance_failed++; | 2671 | sd->nr_balance_failed++; |
2672 | 2672 | ||
2673 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { | 2673 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { |
2674 | 2674 | ||
2675 | spin_lock_irqsave(&busiest->lock, flags); | 2675 | spin_lock_irqsave(&busiest->lock, flags); |
2676 | 2676 | ||
2677 | /* don't kick the migration_thread, if the curr | 2677 | /* don't kick the migration_thread, if the curr |
2678 | * task on busiest cpu can't be moved to this_cpu | 2678 | * task on busiest cpu can't be moved to this_cpu |
2679 | */ | 2679 | */ |
2680 | if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { | 2680 | if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { |
2681 | spin_unlock_irqrestore(&busiest->lock, flags); | 2681 | spin_unlock_irqrestore(&busiest->lock, flags); |
2682 | all_pinned = 1; | 2682 | all_pinned = 1; |
2683 | goto out_one_pinned; | 2683 | goto out_one_pinned; |
2684 | } | 2684 | } |
2685 | 2685 | ||
2686 | if (!busiest->active_balance) { | 2686 | if (!busiest->active_balance) { |
2687 | busiest->active_balance = 1; | 2687 | busiest->active_balance = 1; |
2688 | busiest->push_cpu = this_cpu; | 2688 | busiest->push_cpu = this_cpu; |
2689 | active_balance = 1; | 2689 | active_balance = 1; |
2690 | } | 2690 | } |
2691 | spin_unlock_irqrestore(&busiest->lock, flags); | 2691 | spin_unlock_irqrestore(&busiest->lock, flags); |
2692 | if (active_balance) | 2692 | if (active_balance) |
2693 | wake_up_process(busiest->migration_thread); | 2693 | wake_up_process(busiest->migration_thread); |
2694 | 2694 | ||
2695 | /* | 2695 | /* |
2696 | * We've kicked active balancing, reset the failure | 2696 | * We've kicked active balancing, reset the failure |
2697 | * counter. | 2697 | * counter. |
2698 | */ | 2698 | */ |
2699 | sd->nr_balance_failed = sd->cache_nice_tries+1; | 2699 | sd->nr_balance_failed = sd->cache_nice_tries+1; |
2700 | } | 2700 | } |
2701 | } else | 2701 | } else |
2702 | sd->nr_balance_failed = 0; | 2702 | sd->nr_balance_failed = 0; |
2703 | 2703 | ||
2704 | if (likely(!active_balance)) { | 2704 | if (likely(!active_balance)) { |
2705 | /* We were unbalanced, so reset the balancing interval */ | 2705 | /* We were unbalanced, so reset the balancing interval */ |
2706 | sd->balance_interval = sd->min_interval; | 2706 | sd->balance_interval = sd->min_interval; |
2707 | } else { | 2707 | } else { |
2708 | /* | 2708 | /* |
2709 | * If we've begun active balancing, start to back off. This | 2709 | * If we've begun active balancing, start to back off. This |
2710 | * case may not be covered by the all_pinned logic if there | 2710 | * case may not be covered by the all_pinned logic if there |
2711 | * is only 1 task on the busy runqueue (because we don't call | 2711 | * is only 1 task on the busy runqueue (because we don't call |
2712 | * move_tasks). | 2712 | * move_tasks). |
2713 | */ | 2713 | */ |
2714 | if (sd->balance_interval < sd->max_interval) | 2714 | if (sd->balance_interval < sd->max_interval) |
2715 | sd->balance_interval *= 2; | 2715 | sd->balance_interval *= 2; |
2716 | } | 2716 | } |
2717 | 2717 | ||
2718 | if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 2718 | if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
2719 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2719 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
2720 | return -1; | 2720 | return -1; |
2721 | return nr_moved; | 2721 | return nr_moved; |
2722 | 2722 | ||
2723 | out_balanced: | 2723 | out_balanced: |
2724 | schedstat_inc(sd, lb_balanced[idle]); | 2724 | schedstat_inc(sd, lb_balanced[idle]); |
2725 | 2725 | ||
2726 | sd->nr_balance_failed = 0; | 2726 | sd->nr_balance_failed = 0; |
2727 | 2727 | ||
2728 | out_one_pinned: | 2728 | out_one_pinned: |
2729 | /* tune up the balancing interval */ | 2729 | /* tune up the balancing interval */ |
2730 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || | 2730 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || |
2731 | (sd->balance_interval < sd->max_interval)) | 2731 | (sd->balance_interval < sd->max_interval)) |
2732 | sd->balance_interval *= 2; | 2732 | sd->balance_interval *= 2; |
2733 | 2733 | ||
2734 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 2734 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
2735 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2735 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
2736 | return -1; | 2736 | return -1; |
2737 | return 0; | 2737 | return 0; |
2738 | } | 2738 | } |
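
The interval handling at the tail of load_balance() above amounts to exponential back-off: a pass that moved tasks resets balance_interval to min_interval, while balanced or fully pinned passes double it up to a cap. A minimal standalone sketch of that policy, with invented min/max values and no kernel dependencies:

#include <stdio.h>

int main(void)
{
	unsigned long min_interval = 8, max_interval = 64;  /* assumed ms values */
	unsigned long balance_interval = min_interval;

	for (int pass = 0; pass < 6; pass++) {
		int nr_moved = 0;                    /* pretend nothing could be moved */

		if (nr_moved)
			balance_interval = min_interval; /* we were unbalanced: retry soon */
		else if (balance_interval < max_interval)
			balance_interval *= 2;           /* balanced or pinned: back off */

		printf("pass %d: next attempt in %lu ms\n", pass, balance_interval);
	}
	return 0;
}
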
2739 | 2739 | ||
2740 | /* | 2740 | /* |
2741 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2741 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
2742 | * tasks if there is an imbalance. | 2742 | * tasks if there is an imbalance. |
2743 | * | 2743 | * |
2744 | * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). | 2744 | * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). |
2745 | * this_rq is locked. | 2745 | * this_rq is locked. |
2746 | */ | 2746 | */ |
2747 | static int | 2747 | static int |
2748 | load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | 2748 | load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) |
2749 | { | 2749 | { |
2750 | struct sched_group *group; | 2750 | struct sched_group *group; |
2751 | struct rq *busiest = NULL; | 2751 | struct rq *busiest = NULL; |
2752 | unsigned long imbalance; | 2752 | unsigned long imbalance; |
2753 | int nr_moved = 0; | 2753 | int nr_moved = 0; |
2754 | int sd_idle = 0; | 2754 | int sd_idle = 0; |
2755 | cpumask_t cpus = CPU_MASK_ALL; | 2755 | cpumask_t cpus = CPU_MASK_ALL; |
2756 | 2756 | ||
2757 | /* | 2757 | /* |
2758 | * When power savings policy is enabled for the parent domain, idle | 2758 | * When power savings policy is enabled for the parent domain, idle |
2759 | * sibling can pick up load irrespective of busy siblings. In this case, | 2759 | * sibling can pick up load irrespective of busy siblings. In this case, |
2760 | * let the state of idle sibling percolate up as IDLE, instead of | 2760 | * let the state of idle sibling percolate up as IDLE, instead of |
2761 | * portraying it as NOT_IDLE. | 2761 | * portraying it as NOT_IDLE. |
2762 | */ | 2762 | */ |
2763 | if (sd->flags & SD_SHARE_CPUPOWER && | 2763 | if (sd->flags & SD_SHARE_CPUPOWER && |
2764 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2764 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
2765 | sd_idle = 1; | 2765 | sd_idle = 1; |
2766 | 2766 | ||
2767 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); | 2767 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); |
2768 | redo: | 2768 | redo: |
2769 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, | 2769 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, |
2770 | &sd_idle, &cpus, NULL); | 2770 | &sd_idle, &cpus, NULL); |
2771 | if (!group) { | 2771 | if (!group) { |
2772 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); | 2772 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); |
2773 | goto out_balanced; | 2773 | goto out_balanced; |
2774 | } | 2774 | } |
2775 | 2775 | ||
2776 | busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance, | 2776 | busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance, |
2777 | &cpus); | 2777 | &cpus); |
2778 | if (!busiest) { | 2778 | if (!busiest) { |
2779 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); | 2779 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); |
2780 | goto out_balanced; | 2780 | goto out_balanced; |
2781 | } | 2781 | } |
2782 | 2782 | ||
2783 | BUG_ON(busiest == this_rq); | 2783 | BUG_ON(busiest == this_rq); |
2784 | 2784 | ||
2785 | schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); | 2785 | schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); |
2786 | 2786 | ||
2787 | nr_moved = 0; | 2787 | nr_moved = 0; |
2788 | if (busiest->nr_running > 1) { | 2788 | if (busiest->nr_running > 1) { |
2789 | /* Attempt to move tasks */ | 2789 | /* Attempt to move tasks */ |
2790 | double_lock_balance(this_rq, busiest); | 2790 | double_lock_balance(this_rq, busiest); |
2791 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2791 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
2792 | minus_1_or_zero(busiest->nr_running), | 2792 | minus_1_or_zero(busiest->nr_running), |
2793 | imbalance, sd, NEWLY_IDLE, NULL); | 2793 | imbalance, sd, NEWLY_IDLE, NULL); |
2794 | spin_unlock(&busiest->lock); | 2794 | spin_unlock(&busiest->lock); |
2795 | 2795 | ||
2796 | if (!nr_moved) { | 2796 | if (!nr_moved) { |
2797 | cpu_clear(cpu_of(busiest), cpus); | 2797 | cpu_clear(cpu_of(busiest), cpus); |
2798 | if (!cpus_empty(cpus)) | 2798 | if (!cpus_empty(cpus)) |
2799 | goto redo; | 2799 | goto redo; |
2800 | } | 2800 | } |
2801 | } | 2801 | } |
2802 | 2802 | ||
2803 | if (!nr_moved) { | 2803 | if (!nr_moved) { |
2804 | schedstat_inc(sd, lb_failed[NEWLY_IDLE]); | 2804 | schedstat_inc(sd, lb_failed[NEWLY_IDLE]); |
2805 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 2805 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
2806 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2806 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
2807 | return -1; | 2807 | return -1; |
2808 | } else | 2808 | } else |
2809 | sd->nr_balance_failed = 0; | 2809 | sd->nr_balance_failed = 0; |
2810 | 2810 | ||
2811 | return nr_moved; | 2811 | return nr_moved; |
2812 | 2812 | ||
2813 | out_balanced: | 2813 | out_balanced: |
2814 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); | 2814 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); |
2815 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && | 2815 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
2816 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) | 2816 | !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) |
2817 | return -1; | 2817 | return -1; |
2818 | sd->nr_balance_failed = 0; | 2818 | sd->nr_balance_failed = 0; |
2819 | 2819 | ||
2820 | return 0; | 2820 | return 0; |
2821 | } | 2821 | } |
2822 | 2822 | ||
2823 | /* | 2823 | /* |
2824 | * idle_balance is called by schedule() if this_cpu is about to become | 2824 | * idle_balance is called by schedule() if this_cpu is about to become |
2825 | * idle. Attempts to pull tasks from other CPUs. | 2825 | * idle. Attempts to pull tasks from other CPUs. |
2826 | */ | 2826 | */ |
2827 | static void idle_balance(int this_cpu, struct rq *this_rq) | 2827 | static void idle_balance(int this_cpu, struct rq *this_rq) |
2828 | { | 2828 | { |
2829 | struct sched_domain *sd; | 2829 | struct sched_domain *sd; |
2830 | int pulled_task = 0; | 2830 | int pulled_task = 0; |
2831 | unsigned long next_balance = jiffies + 60 * HZ; | 2831 | unsigned long next_balance = jiffies + 60 * HZ; |
2832 | 2832 | ||
2833 | for_each_domain(this_cpu, sd) { | 2833 | for_each_domain(this_cpu, sd) { |
2834 | if (sd->flags & SD_BALANCE_NEWIDLE) { | 2834 | if (sd->flags & SD_BALANCE_NEWIDLE) { |
2835 | /* If we've pulled tasks over stop searching: */ | 2835 | /* If we've pulled tasks over stop searching: */ |
2836 | pulled_task = load_balance_newidle(this_cpu, | 2836 | pulled_task = load_balance_newidle(this_cpu, |
2837 | this_rq, sd); | 2837 | this_rq, sd); |
2838 | if (time_after(next_balance, | 2838 | if (time_after(next_balance, |
2839 | sd->last_balance + sd->balance_interval)) | 2839 | sd->last_balance + sd->balance_interval)) |
2840 | next_balance = sd->last_balance | 2840 | next_balance = sd->last_balance |
2841 | + sd->balance_interval; | 2841 | + sd->balance_interval; |
2842 | if (pulled_task) | 2842 | if (pulled_task) |
2843 | break; | 2843 | break; |
2844 | } | 2844 | } |
2845 | } | 2845 | } |
2846 | if (!pulled_task) | 2846 | if (!pulled_task) |
2847 | /* | 2847 | /* |
2848 | * We are going idle. next_balance may be set based on | 2848 | * We are going idle. next_balance may be set based on |
2849 | * a busy processor. So reset next_balance. | 2849 | * a busy processor. So reset next_balance. |
2850 | */ | 2850 | */ |
2851 | this_rq->next_balance = next_balance; | 2851 | this_rq->next_balance = next_balance; |
2852 | } | 2852 | } |
2853 | 2853 | ||
2854 | /* | 2854 | /* |
2855 | * active_load_balance is run by migration threads. It pushes running tasks | 2855 | * active_load_balance is run by migration threads. It pushes running tasks |
2856 | * off the busiest CPU onto idle CPUs. It requires at least 1 task to be | 2856 | * off the busiest CPU onto idle CPUs. It requires at least 1 task to be |
2857 | * running on each physical CPU where possible, and avoids physical / | 2857 | * running on each physical CPU where possible, and avoids physical / |
2858 | * logical imbalances. | 2858 | * logical imbalances. |
2859 | * | 2859 | * |
2860 | * Called with busiest_rq locked. | 2860 | * Called with busiest_rq locked. |
2861 | */ | 2861 | */ |
2862 | static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | 2862 | static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) |
2863 | { | 2863 | { |
2864 | int target_cpu = busiest_rq->push_cpu; | 2864 | int target_cpu = busiest_rq->push_cpu; |
2865 | struct sched_domain *sd; | 2865 | struct sched_domain *sd; |
2866 | struct rq *target_rq; | 2866 | struct rq *target_rq; |
2867 | 2867 | ||
2868 | /* Is there any task to move? */ | 2868 | /* Is there any task to move? */ |
2869 | if (busiest_rq->nr_running <= 1) | 2869 | if (busiest_rq->nr_running <= 1) |
2870 | return; | 2870 | return; |
2871 | 2871 | ||
2872 | target_rq = cpu_rq(target_cpu); | 2872 | target_rq = cpu_rq(target_cpu); |
2873 | 2873 | ||
2874 | /* | 2874 | /* |
2875 | * This condition is "impossible", if it occurs | 2875 | * This condition is "impossible", if it occurs |
2876 | * we need to fix it. Originally reported by | 2876 | * we need to fix it. Originally reported by |
2877 | * Bjorn Helgaas on a 128-cpu setup. | 2877 | * Bjorn Helgaas on a 128-cpu setup. |
2878 | */ | 2878 | */ |
2879 | BUG_ON(busiest_rq == target_rq); | 2879 | BUG_ON(busiest_rq == target_rq); |
2880 | 2880 | ||
2881 | /* move a task from busiest_rq to target_rq */ | 2881 | /* move a task from busiest_rq to target_rq */ |
2882 | double_lock_balance(busiest_rq, target_rq); | 2882 | double_lock_balance(busiest_rq, target_rq); |
2883 | 2883 | ||
2884 | /* Search for an sd spanning us and the target CPU. */ | 2884 | /* Search for an sd spanning us and the target CPU. */ |
2885 | for_each_domain(target_cpu, sd) { | 2885 | for_each_domain(target_cpu, sd) { |
2886 | if ((sd->flags & SD_LOAD_BALANCE) && | 2886 | if ((sd->flags & SD_LOAD_BALANCE) && |
2887 | cpu_isset(busiest_cpu, sd->span)) | 2887 | cpu_isset(busiest_cpu, sd->span)) |
2888 | break; | 2888 | break; |
2889 | } | 2889 | } |
2890 | 2890 | ||
2891 | if (likely(sd)) { | 2891 | if (likely(sd)) { |
2892 | schedstat_inc(sd, alb_cnt); | 2892 | schedstat_inc(sd, alb_cnt); |
2893 | 2893 | ||
2894 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, | 2894 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, |
2895 | RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, | 2895 | RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, |
2896 | NULL)) | 2896 | NULL)) |
2897 | schedstat_inc(sd, alb_pushed); | 2897 | schedstat_inc(sd, alb_pushed); |
2898 | else | 2898 | else |
2899 | schedstat_inc(sd, alb_failed); | 2899 | schedstat_inc(sd, alb_failed); |
2900 | } | 2900 | } |
2901 | spin_unlock(&target_rq->lock); | 2901 | spin_unlock(&target_rq->lock); |
2902 | } | 2902 | } |
2903 | 2903 | ||
2904 | static void update_load(struct rq *this_rq) | 2904 | static void update_load(struct rq *this_rq) |
2905 | { | 2905 | { |
2906 | unsigned long this_load; | 2906 | unsigned long this_load; |
2907 | unsigned int i, scale; | 2907 | unsigned int i, scale; |
2908 | 2908 | ||
2909 | this_load = this_rq->raw_weighted_load; | 2909 | this_load = this_rq->raw_weighted_load; |
2910 | 2910 | ||
2911 | /* Update our load: */ | 2911 | /* Update our load: */ |
2912 | for (i = 0, scale = 1; i < 3; i++, scale += scale) { | 2912 | for (i = 0, scale = 1; i < 3; i++, scale += scale) { |
2913 | unsigned long old_load, new_load; | 2913 | unsigned long old_load, new_load; |
2914 | 2914 | ||
2915 | /* scale is effectively 1 << i now, and >> i divides by scale */ | 2915 | /* scale is effectively 1 << i now, and >> i divides by scale */ |
2916 | 2916 | ||
2917 | old_load = this_rq->cpu_load[i]; | 2917 | old_load = this_rq->cpu_load[i]; |
2918 | new_load = this_load; | 2918 | new_load = this_load; |
2919 | /* | 2919 | /* |
2920 | * Round up the averaging division if load is increasing. This | 2920 | * Round up the averaging division if load is increasing. This |
2921 | * prevents us from getting stuck on 9 if the load is 10, for | 2921 | * prevents us from getting stuck on 9 if the load is 10, for |
2922 | * example. | 2922 | * example. |
2923 | */ | 2923 | */ |
2924 | if (new_load > old_load) | 2924 | if (new_load > old_load) |
2925 | new_load += scale-1; | 2925 | new_load += scale-1; |
2926 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; | 2926 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; |
2927 | } | 2927 | } |
2928 | } | 2928 | } |
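
The loop above implements a family of exponentially decaying load averages: cpu_load[i] blends the old value with the instantaneous load using weights (scale-1)/scale and 1/scale, and rounds the new contribution up when load is rising so the average can actually reach it instead of sticking one below. A self-contained sketch of the same arithmetic with an assumed raw load of 10 (illustrative values only, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned long cpu_load[3] = { 0, 0, 0 };  /* assumed starting history */
	unsigned long this_load = 10;             /* hypothetical raw_weighted_load */

	for (unsigned int i = 0, scale = 1; i < 3; i++, scale += scale) {
		unsigned long old_load = cpu_load[i];
		unsigned long new_load = this_load;

		if (new_load > old_load)
			new_load += scale - 1;    /* round up so a rising load converges */
		cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
		printf("cpu_load[%u] = %lu\n", i, cpu_load[i]);
	}
	return 0;
}

Starting from zero this prints 10, 5 and 3: the higher indexes move toward the new load progressively more slowly, which is what the balancer relies on when it wants a conservative load estimate.
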
2929 | 2929 | ||
2930 | /* | 2930 | /* |
2931 | * run_rebalance_domains is triggered when needed from the scheduler tick. | 2931 | * run_rebalance_domains is triggered when needed from the scheduler tick. |
2932 | * | 2932 | * |
2933 | * It checks each scheduling domain to see if it is due to be balanced, | 2933 | * It checks each scheduling domain to see if it is due to be balanced, |
2934 | * and initiates a balancing operation if so. | 2934 | * and initiates a balancing operation if so. |
2935 | * | 2935 | * |
2936 | * Balancing parameters are set up in arch_init_sched_domains. | 2936 | * Balancing parameters are set up in arch_init_sched_domains. |
2937 | */ | 2937 | */ |
2938 | static DEFINE_SPINLOCK(balancing); | 2938 | static DEFINE_SPINLOCK(balancing); |
2939 | 2939 | ||
2940 | static void run_rebalance_domains(struct softirq_action *h) | 2940 | static void run_rebalance_domains(struct softirq_action *h) |
2941 | { | 2941 | { |
2942 | int this_cpu = smp_processor_id(), balance = 1; | 2942 | int this_cpu = smp_processor_id(), balance = 1; |
2943 | struct rq *this_rq = cpu_rq(this_cpu); | 2943 | struct rq *this_rq = cpu_rq(this_cpu); |
2944 | unsigned long interval; | 2944 | unsigned long interval; |
2945 | struct sched_domain *sd; | 2945 | struct sched_domain *sd; |
2946 | /* | 2946 | /* |
2947 | * We are idle if there are no processes running. This | 2947 | * We are idle if there are no processes running. This |
2948 | * is valid even if we are the idle process (SMT). | 2948 | * is valid even if we are the idle process (SMT). |
2949 | */ | 2949 | */ |
2950 | enum idle_type idle = !this_rq->nr_running ? | 2950 | enum idle_type idle = !this_rq->nr_running ? |
2951 | SCHED_IDLE : NOT_IDLE; | 2951 | SCHED_IDLE : NOT_IDLE; |
2952 | /* Earliest time when we have to call run_rebalance_domains again */ | 2952 | /* Earliest time when we have to call run_rebalance_domains again */ |
2953 | unsigned long next_balance = jiffies + 60*HZ; | 2953 | unsigned long next_balance = jiffies + 60*HZ; |
2954 | 2954 | ||
2955 | for_each_domain(this_cpu, sd) { | 2955 | for_each_domain(this_cpu, sd) { |
2956 | if (!(sd->flags & SD_LOAD_BALANCE)) | 2956 | if (!(sd->flags & SD_LOAD_BALANCE)) |
2957 | continue; | 2957 | continue; |
2958 | 2958 | ||
2959 | interval = sd->balance_interval; | 2959 | interval = sd->balance_interval; |
2960 | if (idle != SCHED_IDLE) | 2960 | if (idle != SCHED_IDLE) |
2961 | interval *= sd->busy_factor; | 2961 | interval *= sd->busy_factor; |
2962 | 2962 | ||
2963 | /* scale ms to jiffies */ | 2963 | /* scale ms to jiffies */ |
2964 | interval = msecs_to_jiffies(interval); | 2964 | interval = msecs_to_jiffies(interval); |
2965 | if (unlikely(!interval)) | 2965 | if (unlikely(!interval)) |
2966 | interval = 1; | 2966 | interval = 1; |
2967 | 2967 | ||
2968 | if (sd->flags & SD_SERIALIZE) { | 2968 | if (sd->flags & SD_SERIALIZE) { |
2969 | if (!spin_trylock(&balancing)) | 2969 | if (!spin_trylock(&balancing)) |
2970 | goto out; | 2970 | goto out; |
2971 | } | 2971 | } |
2972 | 2972 | ||
2973 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | 2973 | if (time_after_eq(jiffies, sd->last_balance + interval)) { |
2974 | if (load_balance(this_cpu, this_rq, sd, idle, &balance)) { | 2974 | if (load_balance(this_cpu, this_rq, sd, idle, &balance)) { |
2975 | /* | 2975 | /* |
2976 | * We've pulled tasks over so either we're no | 2976 | * We've pulled tasks over so either we're no |
2977 | * longer idle, or one of our SMT siblings is | 2977 | * longer idle, or one of our SMT siblings is |
2978 | * not idle. | 2978 | * not idle. |
2979 | */ | 2979 | */ |
2980 | idle = NOT_IDLE; | 2980 | idle = NOT_IDLE; |
2981 | } | 2981 | } |
2982 | sd->last_balance = jiffies; | 2982 | sd->last_balance = jiffies; |
2983 | } | 2983 | } |
2984 | if (sd->flags & SD_SERIALIZE) | 2984 | if (sd->flags & SD_SERIALIZE) |
2985 | spin_unlock(&balancing); | 2985 | spin_unlock(&balancing); |
2986 | out: | 2986 | out: |
2987 | if (time_after(next_balance, sd->last_balance + interval)) | 2987 | if (time_after(next_balance, sd->last_balance + interval)) |
2988 | next_balance = sd->last_balance + interval; | 2988 | next_balance = sd->last_balance + interval; |
2989 | 2989 | ||
2990 | /* | 2990 | /* |
2991 | * Stop the load balance at this level. There is another | 2991 | * Stop the load balance at this level. There is another |
2992 | * CPU in our sched group which is doing load balancing more | 2992 | * CPU in our sched group which is doing load balancing more |
2993 | * actively. | 2993 | * actively. |
2994 | */ | 2994 | */ |
2995 | if (!balance) | 2995 | if (!balance) |
2996 | break; | 2996 | break; |
2997 | } | 2997 | } |
2998 | this_rq->next_balance = next_balance; | 2998 | this_rq->next_balance = next_balance; |
2999 | } | 2999 | } |
3000 | #else | 3000 | #else |
3001 | /* | 3001 | /* |
3002 | * on UP we do not need to balance between CPUs: | 3002 | * on UP we do not need to balance between CPUs: |
3003 | */ | 3003 | */ |
3004 | static inline void idle_balance(int cpu, struct rq *rq) | 3004 | static inline void idle_balance(int cpu, struct rq *rq) |
3005 | { | 3005 | { |
3006 | } | 3006 | } |
3007 | #endif | 3007 | #endif |
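
For reference, the per-domain interval used by run_rebalance_domains() above is the millisecond balance_interval, multiplied by busy_factor when the CPU is not idle, converted to jiffies and clamped to at least one jiffy. A rough standalone approximation, doing the msecs_to_jiffies() conversion by hand with an assumed HZ of 250:

#include <stdio.h>

#define HZ 250  /* assumed tick rate; the real value is a kernel configuration choice */

int main(void)
{
	unsigned long balance_interval = 8;  /* assumed base interval, in ms */
	unsigned long busy_factor = 32;      /* assumed per-domain factor */
	int cpu_is_busy = 1;

	unsigned long interval = balance_interval;
	if (cpu_is_busy)
		interval *= busy_factor;         /* balance far less often while busy */

	interval = interval * HZ / 1000;     /* crude msecs_to_jiffies() */
	if (!interval)
		interval = 1;                    /* never round down to zero */

	printf("rebalance this domain every %lu jiffies\n", interval);
	return 0;
}
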
3008 | 3008 | ||
3009 | DEFINE_PER_CPU(struct kernel_stat, kstat); | 3009 | DEFINE_PER_CPU(struct kernel_stat, kstat); |
3010 | 3010 | ||
3011 | EXPORT_PER_CPU_SYMBOL(kstat); | 3011 | EXPORT_PER_CPU_SYMBOL(kstat); |
3012 | 3012 | ||
3013 | /* | 3013 | /* |
3014 | * This is called on clock ticks and on context switches. | 3014 | * This is called on clock ticks and on context switches. |
3015 | * Bank in p->sched_time the ns elapsed since the last tick or switch. | 3015 | * Bank in p->sched_time the ns elapsed since the last tick or switch. |
3016 | */ | 3016 | */ |
3017 | static inline void | 3017 | static inline void |
3018 | update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) | 3018 | update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) |
3019 | { | 3019 | { |
3020 | p->sched_time += now - p->last_ran; | 3020 | p->sched_time += now - p->last_ran; |
3021 | p->last_ran = rq->most_recent_timestamp = now; | 3021 | p->last_ran = rq->most_recent_timestamp = now; |
3022 | } | 3022 | } |
3023 | 3023 | ||
3024 | /* | 3024 | /* |
3025 | * Return current->sched_time plus any more ns on the sched_clock | 3025 | * Return current->sched_time plus any more ns on the sched_clock |
3026 | * that have not yet been banked. | 3026 | * that have not yet been banked. |
3027 | */ | 3027 | */ |
3028 | unsigned long long current_sched_time(const struct task_struct *p) | 3028 | unsigned long long current_sched_time(const struct task_struct *p) |
3029 | { | 3029 | { |
3030 | unsigned long long ns; | 3030 | unsigned long long ns; |
3031 | unsigned long flags; | 3031 | unsigned long flags; |
3032 | 3032 | ||
3033 | local_irq_save(flags); | 3033 | local_irq_save(flags); |
3034 | ns = p->sched_time + sched_clock() - p->last_ran; | 3034 | ns = p->sched_time + sched_clock() - p->last_ran; |
3035 | local_irq_restore(flags); | 3035 | local_irq_restore(flags); |
3036 | 3036 | ||
3037 | return ns; | 3037 | return ns; |
3038 | } | 3038 | } |
3039 | 3039 | ||
3040 | /* | 3040 | /* |
3041 | * We place interactive tasks back into the active array, if possible. | 3041 | * We place interactive tasks back into the active array, if possible. |
3042 | * | 3042 | * |
3043 | * To guarantee that this does not starve expired tasks we ignore the | 3043 | * To guarantee that this does not starve expired tasks we ignore the |
3044 | * interactivity of a task if the first expired task had to wait more | 3044 | * interactivity of a task if the first expired task had to wait more |
3045 | * than a 'reasonable' amount of time. This deadline timeout is | 3045 | * than a 'reasonable' amount of time. This deadline timeout is |
3046 | * load-dependent, as the frequency of array switches decreases with | 3046 | * load-dependent, as the frequency of array switches decreases with |
3047 | * increasing number of running tasks. We also ignore the interactivity | 3047 | * increasing number of running tasks. We also ignore the interactivity |
3048 | * if a better static_prio task has expired: | 3048 | * if a better static_prio task has expired: |
3049 | */ | 3049 | */ |
3050 | static inline int expired_starving(struct rq *rq) | 3050 | static inline int expired_starving(struct rq *rq) |
3051 | { | 3051 | { |
3052 | if (rq->curr->static_prio > rq->best_expired_prio) | 3052 | if (rq->curr->static_prio > rq->best_expired_prio) |
3053 | return 1; | 3053 | return 1; |
3054 | if (!STARVATION_LIMIT || !rq->expired_timestamp) | 3054 | if (!STARVATION_LIMIT || !rq->expired_timestamp) |
3055 | return 0; | 3055 | return 0; |
3056 | if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) | 3056 | if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) |
3057 | return 1; | 3057 | return 1; |
3058 | return 0; | 3058 | return 0; |
3059 | } | 3059 | } |
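
expired_starving() bounds how long the expired array may wait by a deadline that stretches with the number of runnable tasks. A toy check of just that deadline term, with an assumed STARVATION_LIMIT and invented timestamps (the real limit and the best_expired_prio test are omitted):

#include <stdio.h>

#define STARVATION_LIMIT 100  /* assumed deadline unit, in jiffies */

int main(void)
{
	unsigned long jiffies = 5000;            /* hypothetical current time */
	unsigned long expired_timestamp = 4000;  /* first task expired here */
	unsigned long nr_running = 8;            /* tasks on this runqueue */

	/* The busier the runqueue, the longer the expired tasks may wait. */
	if (jiffies - expired_timestamp > STARVATION_LIMIT * nr_running)
		printf("expired array is starving, stop favouring interactive tasks\n");
	else
		printf("still within the deadline (%lu of %lu jiffies used)\n",
		       jiffies - expired_timestamp, STARVATION_LIMIT * nr_running);
	return 0;
}
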
3060 | 3060 | ||
3061 | /* | 3061 | /* |
3062 | * Account user cpu time to a process. | 3062 | * Account user cpu time to a process. |
3063 | * @p: the process that the cpu time gets accounted to | 3063 | * @p: the process that the cpu time gets accounted to |
3064 | * @hardirq_offset: the offset to subtract from hardirq_count() | 3064 | * @hardirq_offset: the offset to subtract from hardirq_count() |
3065 | * @cputime: the cpu time spent in user space since the last update | 3065 | * @cputime: the cpu time spent in user space since the last update |
3066 | */ | 3066 | */ |
3067 | void account_user_time(struct task_struct *p, cputime_t cputime) | 3067 | void account_user_time(struct task_struct *p, cputime_t cputime) |
3068 | { | 3068 | { |
3069 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3069 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3070 | cputime64_t tmp; | 3070 | cputime64_t tmp; |
3071 | 3071 | ||
3072 | p->utime = cputime_add(p->utime, cputime); | 3072 | p->utime = cputime_add(p->utime, cputime); |
3073 | 3073 | ||
3074 | /* Add user time to cpustat. */ | 3074 | /* Add user time to cpustat. */ |
3075 | tmp = cputime_to_cputime64(cputime); | 3075 | tmp = cputime_to_cputime64(cputime); |
3076 | if (TASK_NICE(p) > 0) | 3076 | if (TASK_NICE(p) > 0) |
3077 | cpustat->nice = cputime64_add(cpustat->nice, tmp); | 3077 | cpustat->nice = cputime64_add(cpustat->nice, tmp); |
3078 | else | 3078 | else |
3079 | cpustat->user = cputime64_add(cpustat->user, tmp); | 3079 | cpustat->user = cputime64_add(cpustat->user, tmp); |
3080 | } | 3080 | } |
3081 | 3081 | ||
3082 | /* | 3082 | /* |
3083 | * Account system cpu time to a process. | 3083 | * Account system cpu time to a process. |
3084 | * @p: the process that the cpu time gets accounted to | 3084 | * @p: the process that the cpu time gets accounted to |
3085 | * @hardirq_offset: the offset to subtract from hardirq_count() | 3085 | * @hardirq_offset: the offset to subtract from hardirq_count() |
3086 | * @cputime: the cpu time spent in kernel space since the last update | 3086 | * @cputime: the cpu time spent in kernel space since the last update |
3087 | */ | 3087 | */ |
3088 | void account_system_time(struct task_struct *p, int hardirq_offset, | 3088 | void account_system_time(struct task_struct *p, int hardirq_offset, |
3089 | cputime_t cputime) | 3089 | cputime_t cputime) |
3090 | { | 3090 | { |
3091 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3091 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3092 | struct rq *rq = this_rq(); | 3092 | struct rq *rq = this_rq(); |
3093 | cputime64_t tmp; | 3093 | cputime64_t tmp; |
3094 | 3094 | ||
3095 | p->stime = cputime_add(p->stime, cputime); | 3095 | p->stime = cputime_add(p->stime, cputime); |
3096 | 3096 | ||
3097 | /* Add system time to cpustat. */ | 3097 | /* Add system time to cpustat. */ |
3098 | tmp = cputime_to_cputime64(cputime); | 3098 | tmp = cputime_to_cputime64(cputime); |
3099 | if (hardirq_count() - hardirq_offset) | 3099 | if (hardirq_count() - hardirq_offset) |
3100 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | 3100 | cpustat->irq = cputime64_add(cpustat->irq, tmp); |
3101 | else if (softirq_count()) | 3101 | else if (softirq_count()) |
3102 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | 3102 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); |
3103 | else if (p != rq->idle) | 3103 | else if (p != rq->idle) |
3104 | cpustat->system = cputime64_add(cpustat->system, tmp); | 3104 | cpustat->system = cputime64_add(cpustat->system, tmp); |
3105 | else if (atomic_read(&rq->nr_iowait) > 0) | 3105 | else if (atomic_read(&rq->nr_iowait) > 0) |
3106 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); | 3106 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); |
3107 | else | 3107 | else |
3108 | cpustat->idle = cputime64_add(cpustat->idle, tmp); | 3108 | cpustat->idle = cputime64_add(cpustat->idle, tmp); |
3109 | /* Account for system time used */ | 3109 | /* Account for system time used */ |
3110 | acct_update_integrals(p); | 3110 | acct_update_integrals(p); |
3111 | } | 3111 | } |
3112 | 3112 | ||
3113 | /* | 3113 | /* |
3114 | * Account for involuntary wait time. | 3114 | * Account for involuntary wait time. |
3115 | * @p: the process from which the cpu time has been stolen | 3115 | * @p: the process from which the cpu time has been stolen |
3116 | * @steal: the cpu time spent in involuntary wait | 3116 | * @steal: the cpu time spent in involuntary wait |
3117 | */ | 3117 | */ |
3118 | void account_steal_time(struct task_struct *p, cputime_t steal) | 3118 | void account_steal_time(struct task_struct *p, cputime_t steal) |
3119 | { | 3119 | { |
3120 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3120 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3121 | cputime64_t tmp = cputime_to_cputime64(steal); | 3121 | cputime64_t tmp = cputime_to_cputime64(steal); |
3122 | struct rq *rq = this_rq(); | 3122 | struct rq *rq = this_rq(); |
3123 | 3123 | ||
3124 | if (p == rq->idle) { | 3124 | if (p == rq->idle) { |
3125 | p->stime = cputime_add(p->stime, steal); | 3125 | p->stime = cputime_add(p->stime, steal); |
3126 | if (atomic_read(&rq->nr_iowait) > 0) | 3126 | if (atomic_read(&rq->nr_iowait) > 0) |
3127 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); | 3127 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); |
3128 | else | 3128 | else |
3129 | cpustat->idle = cputime64_add(cpustat->idle, tmp); | 3129 | cpustat->idle = cputime64_add(cpustat->idle, tmp); |
3130 | } else | 3130 | } else |
3131 | cpustat->steal = cputime64_add(cpustat->steal, tmp); | 3131 | cpustat->steal = cputime64_add(cpustat->steal, tmp); |
3132 | } | 3132 | } |
3133 | 3133 | ||
3134 | static void task_running_tick(struct rq *rq, struct task_struct *p) | 3134 | static void task_running_tick(struct rq *rq, struct task_struct *p) |
3135 | { | 3135 | { |
3136 | if (p->array != rq->active) { | 3136 | if (p->array != rq->active) { |
3137 | /* Task has expired but was not scheduled yet */ | 3137 | /* Task has expired but was not scheduled yet */ |
3138 | set_tsk_need_resched(p); | 3138 | set_tsk_need_resched(p); |
3139 | return; | 3139 | return; |
3140 | } | 3140 | } |
3141 | spin_lock(&rq->lock); | 3141 | spin_lock(&rq->lock); |
3142 | /* | 3142 | /* |
3143 | * The task was running during this tick - update the | 3143 | * The task was running during this tick - update the |
3144 | * time slice counter. Note: we do not update a thread's | 3144 | * time slice counter. Note: we do not update a thread's |
3145 | * priority until it either goes to sleep or uses up its | 3145 | * priority until it either goes to sleep or uses up its |
3146 | * timeslice. This makes it possible for interactive tasks | 3146 | * timeslice. This makes it possible for interactive tasks |
3147 | * to use up their timeslices at their highest priority levels. | 3147 | * to use up their timeslices at their highest priority levels. |
3148 | */ | 3148 | */ |
3149 | if (rt_task(p)) { | 3149 | if (rt_task(p)) { |
3150 | /* | 3150 | /* |
3151 | * RR tasks need a special form of timeslice management. | 3151 | * RR tasks need a special form of timeslice management. |
3152 | * FIFO tasks have no timeslices. | 3152 | * FIFO tasks have no timeslices. |
3153 | */ | 3153 | */ |
3154 | if ((p->policy == SCHED_RR) && !--p->time_slice) { | 3154 | if ((p->policy == SCHED_RR) && !--p->time_slice) { |
3155 | p->time_slice = task_timeslice(p); | 3155 | p->time_slice = task_timeslice(p); |
3156 | p->first_time_slice = 0; | 3156 | p->first_time_slice = 0; |
3157 | set_tsk_need_resched(p); | 3157 | set_tsk_need_resched(p); |
3158 | 3158 | ||
3159 | /* put it at the end of the queue: */ | 3159 | /* put it at the end of the queue: */ |
3160 | requeue_task(p, rq->active); | 3160 | requeue_task(p, rq->active); |
3161 | } | 3161 | } |
3162 | goto out_unlock; | 3162 | goto out_unlock; |
3163 | } | 3163 | } |
3164 | if (!--p->time_slice) { | 3164 | if (!--p->time_slice) { |
3165 | dequeue_task(p, rq->active); | 3165 | dequeue_task(p, rq->active); |
3166 | set_tsk_need_resched(p); | 3166 | set_tsk_need_resched(p); |
3167 | p->prio = effective_prio(p); | 3167 | p->prio = effective_prio(p); |
3168 | p->time_slice = task_timeslice(p); | 3168 | p->time_slice = task_timeslice(p); |
3169 | p->first_time_slice = 0; | 3169 | p->first_time_slice = 0; |
3170 | 3170 | ||
3171 | if (!rq->expired_timestamp) | 3171 | if (!rq->expired_timestamp) |
3172 | rq->expired_timestamp = jiffies; | 3172 | rq->expired_timestamp = jiffies; |
3173 | if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { | 3173 | if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { |
3174 | enqueue_task(p, rq->expired); | 3174 | enqueue_task(p, rq->expired); |
3175 | if (p->static_prio < rq->best_expired_prio) | 3175 | if (p->static_prio < rq->best_expired_prio) |
3176 | rq->best_expired_prio = p->static_prio; | 3176 | rq->best_expired_prio = p->static_prio; |
3177 | } else | 3177 | } else |
3178 | enqueue_task(p, rq->active); | 3178 | enqueue_task(p, rq->active); |
3179 | } else { | 3179 | } else { |
3180 | /* | 3180 | /* |
3181 | * Prevent a too long timeslice allowing a task to monopolize | 3181 | * Prevent a too long timeslice allowing a task to monopolize |
3182 | * the CPU. We do this by splitting up the timeslice into | 3182 | * the CPU. We do this by splitting up the timeslice into |
3183 | * smaller pieces. | 3183 | * smaller pieces. |
3184 | * | 3184 | * |
3185 | * Note: this does not mean the task's timeslices expire or | 3185 | * Note: this does not mean the task's timeslices expire or |
3186 | * get lost in any way, they just might be preempted by | 3186 | * get lost in any way, they just might be preempted by |
3187 | * another task of equal priority. (one with higher | 3187 | * another task of equal priority. (one with higher |
3188 | * priority would have preempted this task already.) We | 3188 | * priority would have preempted this task already.) We |
3189 | * requeue this task to the end of the list on this priority | 3189 | * requeue this task to the end of the list on this priority |
3190 | * level, which is in essence a round-robin of tasks with | 3190 | * level, which is in essence a round-robin of tasks with |
3191 | * equal priority. | 3191 | * equal priority. |
3192 | * | 3192 | * |
3193 | * This only applies to tasks in the interactive | 3193 | * This only applies to tasks in the interactive |
3194 | * delta range with at least TIMESLICE_GRANULARITY to requeue. | 3194 | * delta range with at least TIMESLICE_GRANULARITY to requeue. |
3195 | */ | 3195 | */ |
3196 | if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - | 3196 | if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - |
3197 | p->time_slice) % TIMESLICE_GRANULARITY(p)) && | 3197 | p->time_slice) % TIMESLICE_GRANULARITY(p)) && |
3198 | (p->time_slice >= TIMESLICE_GRANULARITY(p)) && | 3198 | (p->time_slice >= TIMESLICE_GRANULARITY(p)) && |
3199 | (p->array == rq->active)) { | 3199 | (p->array == rq->active)) { |
3200 | 3200 | ||
3201 | requeue_task(p, rq->active); | 3201 | requeue_task(p, rq->active); |
3202 | set_tsk_need_resched(p); | 3202 | set_tsk_need_resched(p); |
3203 | } | 3203 | } |
3204 | } | 3204 | } |
3205 | out_unlock: | 3205 | out_unlock: |
3206 | spin_unlock(&rq->lock); | 3206 | spin_unlock(&rq->lock); |
3207 | } | 3207 | } |
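
The TIMESLICE_GRANULARITY test near the end of task_running_tick() fires each time an interactive task has consumed another full granule of its slice, provided at least one granule remains. A standalone walk through that modulo condition with invented numbers (a 100-tick slice, 25-tick granules):

#include <stdio.h>

int main(void)
{
	int timeslice = 100;   /* hypothetical task_timeslice(p), in ticks */
	int granularity = 25;  /* hypothetical TIMESLICE_GRANULARITY(p) */

	for (int time_slice = timeslice - 1; time_slice >= granularity; time_slice--) {
		int used = timeslice - time_slice;

		if (!(used % granularity))
			printf("after %d ticks: requeue behind equal-priority tasks\n", used);
	}
	return 0;
}

With these numbers the task is rotated to the back of its priority list after 25, 50 and 75 ticks, giving a round-robin among equal-priority interactive tasks without the slice itself being lost.
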
3208 | 3208 | ||
3209 | /* | 3209 | /* |
3210 | * This function gets called by the timer code, with HZ frequency. | 3210 | * This function gets called by the timer code, with HZ frequency. |
3211 | * We call it with interrupts disabled. | 3211 | * We call it with interrupts disabled. |
3212 | * | 3212 | * |
3213 | * It also gets called by the fork code, when changing the parent's | 3213 | * It also gets called by the fork code, when changing the parent's |
3214 | * timeslices. | 3214 | * timeslices. |
3215 | */ | 3215 | */ |
3216 | void scheduler_tick(void) | 3216 | void scheduler_tick(void) |
3217 | { | 3217 | { |
3218 | unsigned long long now = sched_clock(); | 3218 | unsigned long long now = sched_clock(); |
3219 | struct task_struct *p = current; | 3219 | struct task_struct *p = current; |
3220 | int cpu = smp_processor_id(); | 3220 | int cpu = smp_processor_id(); |
3221 | struct rq *rq = cpu_rq(cpu); | 3221 | struct rq *rq = cpu_rq(cpu); |
3222 | 3222 | ||
3223 | update_cpu_clock(p, rq, now); | 3223 | update_cpu_clock(p, rq, now); |
3224 | 3224 | ||
3225 | if (p != rq->idle) | 3225 | if (p != rq->idle) |
3226 | task_running_tick(rq, p); | 3226 | task_running_tick(rq, p); |
3227 | #ifdef CONFIG_SMP | 3227 | #ifdef CONFIG_SMP |
3228 | update_load(rq); | 3228 | update_load(rq); |
3229 | if (time_after_eq(jiffies, rq->next_balance)) | 3229 | if (time_after_eq(jiffies, rq->next_balance)) |
3230 | raise_softirq(SCHED_SOFTIRQ); | 3230 | raise_softirq(SCHED_SOFTIRQ); |
3231 | #endif | 3231 | #endif |
3232 | } | 3232 | } |
3233 | 3233 | ||
3234 | #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) | 3234 | #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) |
3235 | 3235 | ||
3236 | void fastcall add_preempt_count(int val) | 3236 | void fastcall add_preempt_count(int val) |
3237 | { | 3237 | { |
3238 | /* | 3238 | /* |
3239 | * Underflow? | 3239 | * Underflow? |
3240 | */ | 3240 | */ |
3241 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) | 3241 | if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) |
3242 | return; | 3242 | return; |
3243 | preempt_count() += val; | 3243 | preempt_count() += val; |
3244 | /* | 3244 | /* |
3245 | * Spinlock count overflowing soon? | 3245 | * Spinlock count overflowing soon? |
3246 | */ | 3246 | */ |
3247 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= | 3247 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= |
3248 | PREEMPT_MASK - 10); | 3248 | PREEMPT_MASK - 10); |
3249 | } | 3249 | } |
3250 | EXPORT_SYMBOL(add_preempt_count); | 3250 | EXPORT_SYMBOL(add_preempt_count); |
3251 | 3251 | ||
3252 | void fastcall sub_preempt_count(int val) | 3252 | void fastcall sub_preempt_count(int val) |
3253 | { | 3253 | { |
3254 | /* | 3254 | /* |
3255 | * Underflow? | 3255 | * Underflow? |
3256 | */ | 3256 | */ |
3257 | if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) | 3257 | if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) |
3258 | return; | 3258 | return; |
3259 | /* | 3259 | /* |
3260 | * Is the spinlock portion underflowing? | 3260 | * Is the spinlock portion underflowing? |
3261 | */ | 3261 | */ |
3262 | if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && | 3262 | if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && |
3263 | !(preempt_count() & PREEMPT_MASK))) | 3263 | !(preempt_count() & PREEMPT_MASK))) |
3264 | return; | 3264 | return; |
3265 | 3265 | ||
3266 | preempt_count() -= val; | 3266 | preempt_count() -= val; |
3267 | } | 3267 | } |
3268 | EXPORT_SYMBOL(sub_preempt_count); | 3268 | EXPORT_SYMBOL(sub_preempt_count); |
3269 | 3269 | ||
3270 | #endif | 3270 | #endif |
3271 | 3271 | ||
3272 | static inline int interactive_sleep(enum sleep_type sleep_type) | 3272 | static inline int interactive_sleep(enum sleep_type sleep_type) |
3273 | { | 3273 | { |
3274 | return (sleep_type == SLEEP_INTERACTIVE || | 3274 | return (sleep_type == SLEEP_INTERACTIVE || |
3275 | sleep_type == SLEEP_INTERRUPTED); | 3275 | sleep_type == SLEEP_INTERRUPTED); |
3276 | } | 3276 | } |
3277 | 3277 | ||
3278 | /* | 3278 | /* |
3279 | * schedule() is the main scheduler function. | 3279 | * schedule() is the main scheduler function. |
3280 | */ | 3280 | */ |
3281 | asmlinkage void __sched schedule(void) | 3281 | asmlinkage void __sched schedule(void) |
3282 | { | 3282 | { |
3283 | struct task_struct *prev, *next; | 3283 | struct task_struct *prev, *next; |
3284 | struct prio_array *array; | 3284 | struct prio_array *array; |
3285 | struct list_head *queue; | 3285 | struct list_head *queue; |
3286 | unsigned long long now; | 3286 | unsigned long long now; |
3287 | unsigned long run_time; | 3287 | unsigned long run_time; |
3288 | int cpu, idx, new_prio; | 3288 | int cpu, idx, new_prio; |
3289 | long *switch_count; | 3289 | long *switch_count; |
3290 | struct rq *rq; | 3290 | struct rq *rq; |
3291 | 3291 | ||
3292 | /* | 3292 | /* |
3293 | * Test if we are atomic. Since do_exit() needs to call into | 3293 | * Test if we are atomic. Since do_exit() needs to call into |
3294 | * schedule() atomically, we ignore that path for now. | 3294 | * schedule() atomically, we ignore that path for now. |
3295 | * Otherwise, whine if we are scheduling when we should not be. | 3295 | * Otherwise, whine if we are scheduling when we should not be. |
3296 | */ | 3296 | */ |
3297 | if (unlikely(in_atomic() && !current->exit_state)) { | 3297 | if (unlikely(in_atomic() && !current->exit_state)) { |
3298 | printk(KERN_ERR "BUG: scheduling while atomic: " | 3298 | printk(KERN_ERR "BUG: scheduling while atomic: " |
3299 | "%s/0x%08x/%d\n", | 3299 | "%s/0x%08x/%d\n", |
3300 | current->comm, preempt_count(), current->pid); | 3300 | current->comm, preempt_count(), current->pid); |
3301 | debug_show_held_locks(current); | 3301 | debug_show_held_locks(current); |
3302 | if (irqs_disabled()) | 3302 | if (irqs_disabled()) |
3303 | print_irqtrace_events(current); | 3303 | print_irqtrace_events(current); |
3304 | dump_stack(); | 3304 | dump_stack(); |
3305 | } | 3305 | } |
3306 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 3306 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
3307 | 3307 | ||
3308 | need_resched: | 3308 | need_resched: |
3309 | preempt_disable(); | 3309 | preempt_disable(); |
3310 | prev = current; | 3310 | prev = current; |
3311 | release_kernel_lock(prev); | 3311 | release_kernel_lock(prev); |
3312 | need_resched_nonpreemptible: | 3312 | need_resched_nonpreemptible: |
3313 | rq = this_rq(); | 3313 | rq = this_rq(); |
3314 | 3314 | ||
3315 | /* | 3315 | /* |
3316 | * The idle thread is not allowed to schedule! | 3316 | * The idle thread is not allowed to schedule! |
3317 | * Remove this check after it has been exercised a bit. | 3317 | * Remove this check after it has been exercised a bit. |
3318 | */ | 3318 | */ |
3319 | if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { | 3319 | if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { |
3320 | printk(KERN_ERR "bad: scheduling from the idle thread!\n"); | 3320 | printk(KERN_ERR "bad: scheduling from the idle thread!\n"); |
3321 | dump_stack(); | 3321 | dump_stack(); |
3322 | } | 3322 | } |
3323 | 3323 | ||
3324 | schedstat_inc(rq, sched_cnt); | 3324 | schedstat_inc(rq, sched_cnt); |
3325 | now = sched_clock(); | 3325 | now = sched_clock(); |
3326 | if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { | 3326 | if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { |
3327 | run_time = now - prev->timestamp; | 3327 | run_time = now - prev->timestamp; |
3328 | if (unlikely((long long)(now - prev->timestamp) < 0)) | 3328 | if (unlikely((long long)(now - prev->timestamp) < 0)) |
3329 | run_time = 0; | 3329 | run_time = 0; |
3330 | } else | 3330 | } else |
3331 | run_time = NS_MAX_SLEEP_AVG; | 3331 | run_time = NS_MAX_SLEEP_AVG; |
3332 | 3332 | ||
3333 | /* | 3333 | /* |
3334 | * Tasks are charged proportionately less run_time at high sleep_avg to | 3334 | * Tasks are charged proportionately less run_time at high sleep_avg to |
3335 | * delay them losing their interactive status | 3335 | * delay them losing their interactive status |
3336 | */ | 3336 | */ |
3337 | run_time /= (CURRENT_BONUS(prev) ? : 1); | 3337 | run_time /= (CURRENT_BONUS(prev) ? : 1); |
3338 | 3338 | ||
3339 | spin_lock_irq(&rq->lock); | 3339 | spin_lock_irq(&rq->lock); |
3340 | 3340 | ||
3341 | switch_count = &prev->nivcsw; | 3341 | switch_count = &prev->nivcsw; |
3342 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3342 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
3343 | switch_count = &prev->nvcsw; | 3343 | switch_count = &prev->nvcsw; |
3344 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && | 3344 | if (unlikely((prev->state & TASK_INTERRUPTIBLE) && |
3345 | unlikely(signal_pending(prev)))) | 3345 | unlikely(signal_pending(prev)))) |
3346 | prev->state = TASK_RUNNING; | 3346 | prev->state = TASK_RUNNING; |
3347 | else { | 3347 | else { |
3348 | if (prev->state == TASK_UNINTERRUPTIBLE) | 3348 | if (prev->state == TASK_UNINTERRUPTIBLE) |
3349 | rq->nr_uninterruptible++; | 3349 | rq->nr_uninterruptible++; |
3350 | deactivate_task(prev, rq); | 3350 | deactivate_task(prev, rq); |
3351 | } | 3351 | } |
3352 | } | 3352 | } |
3353 | 3353 | ||
3354 | cpu = smp_processor_id(); | 3354 | cpu = smp_processor_id(); |
3355 | if (unlikely(!rq->nr_running)) { | 3355 | if (unlikely(!rq->nr_running)) { |
3356 | idle_balance(cpu, rq); | 3356 | idle_balance(cpu, rq); |
3357 | if (!rq->nr_running) { | 3357 | if (!rq->nr_running) { |
3358 | next = rq->idle; | 3358 | next = rq->idle; |
3359 | rq->expired_timestamp = 0; | 3359 | rq->expired_timestamp = 0; |
3360 | goto switch_tasks; | 3360 | goto switch_tasks; |
3361 | } | 3361 | } |
3362 | } | 3362 | } |
3363 | 3363 | ||
3364 | array = rq->active; | 3364 | array = rq->active; |
3365 | if (unlikely(!array->nr_active)) { | 3365 | if (unlikely(!array->nr_active)) { |
3366 | /* | 3366 | /* |
3367 | * Switch the active and expired arrays. | 3367 | * Switch the active and expired arrays. |
3368 | */ | 3368 | */ |
3369 | schedstat_inc(rq, sched_switch); | 3369 | schedstat_inc(rq, sched_switch); |
3370 | rq->active = rq->expired; | 3370 | rq->active = rq->expired; |
3371 | rq->expired = array; | 3371 | rq->expired = array; |
3372 | array = rq->active; | 3372 | array = rq->active; |
3373 | rq->expired_timestamp = 0; | 3373 | rq->expired_timestamp = 0; |
3374 | rq->best_expired_prio = MAX_PRIO; | 3374 | rq->best_expired_prio = MAX_PRIO; |
3375 | } | 3375 | } |
3376 | 3376 | ||
3377 | idx = sched_find_first_bit(array->bitmap); | 3377 | idx = sched_find_first_bit(array->bitmap); |
3378 | queue = array->queue + idx; | 3378 | queue = array->queue + idx; |
3379 | next = list_entry(queue->next, struct task_struct, run_list); | 3379 | next = list_entry(queue->next, struct task_struct, run_list); |
3380 | 3380 | ||
3381 | if (!rt_task(next) && interactive_sleep(next->sleep_type)) { | 3381 | if (!rt_task(next) && interactive_sleep(next->sleep_type)) { |
3382 | unsigned long long delta = now - next->timestamp; | 3382 | unsigned long long delta = now - next->timestamp; |
3383 | if (unlikely((long long)(now - next->timestamp) < 0)) | 3383 | if (unlikely((long long)(now - next->timestamp) < 0)) |
3384 | delta = 0; | 3384 | delta = 0; |
3385 | 3385 | ||
3386 | if (next->sleep_type == SLEEP_INTERACTIVE) | 3386 | if (next->sleep_type == SLEEP_INTERACTIVE) |
3387 | delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; | 3387 | delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; |
3388 | 3388 | ||
3389 | array = next->array; | 3389 | array = next->array; |
3390 | new_prio = recalc_task_prio(next, next->timestamp + delta); | 3390 | new_prio = recalc_task_prio(next, next->timestamp + delta); |
3391 | 3391 | ||
3392 | if (unlikely(next->prio != new_prio)) { | 3392 | if (unlikely(next->prio != new_prio)) { |
3393 | dequeue_task(next, array); | 3393 | dequeue_task(next, array); |
3394 | next->prio = new_prio; | 3394 | next->prio = new_prio; |
3395 | enqueue_task(next, array); | 3395 | enqueue_task(next, array); |
3396 | } | 3396 | } |
3397 | } | 3397 | } |
3398 | next->sleep_type = SLEEP_NORMAL; | 3398 | next->sleep_type = SLEEP_NORMAL; |
3399 | switch_tasks: | 3399 | switch_tasks: |
3400 | if (next == rq->idle) | 3400 | if (next == rq->idle) |
3401 | schedstat_inc(rq, sched_goidle); | 3401 | schedstat_inc(rq, sched_goidle); |
3402 | prefetch(next); | 3402 | prefetch(next); |
3403 | prefetch_stack(next); | 3403 | prefetch_stack(next); |
3404 | clear_tsk_need_resched(prev); | 3404 | clear_tsk_need_resched(prev); |
3405 | rcu_qsctr_inc(task_cpu(prev)); | 3405 | rcu_qsctr_inc(task_cpu(prev)); |
3406 | 3406 | ||
3407 | update_cpu_clock(prev, rq, now); | 3407 | update_cpu_clock(prev, rq, now); |
3408 | 3408 | ||
3409 | prev->sleep_avg -= run_time; | 3409 | prev->sleep_avg -= run_time; |
3410 | if ((long)prev->sleep_avg <= 0) | 3410 | if ((long)prev->sleep_avg <= 0) |
3411 | prev->sleep_avg = 0; | 3411 | prev->sleep_avg = 0; |
3412 | prev->timestamp = prev->last_ran = now; | 3412 | prev->timestamp = prev->last_ran = now; |
3413 | 3413 | ||
3414 | sched_info_switch(prev, next); | 3414 | sched_info_switch(prev, next); |
3415 | if (likely(prev != next)) { | 3415 | if (likely(prev != next)) { |
3416 | next->timestamp = next->last_ran = now; | 3416 | next->timestamp = next->last_ran = now; |
3417 | rq->nr_switches++; | 3417 | rq->nr_switches++; |
3418 | rq->curr = next; | 3418 | rq->curr = next; |
3419 | ++*switch_count; | 3419 | ++*switch_count; |
3420 | 3420 | ||
3421 | prepare_task_switch(rq, next); | 3421 | prepare_task_switch(rq, next); |
3422 | prev = context_switch(rq, prev, next); | 3422 | prev = context_switch(rq, prev, next); |
3423 | barrier(); | 3423 | barrier(); |
3424 | /* | 3424 | /* |
3425 | * this_rq must be evaluated again because prev may have moved | 3425 | * this_rq must be evaluated again because prev may have moved |
3426 | * CPUs since it called schedule(), thus the 'rq' on its stack | 3426 | * CPUs since it called schedule(), thus the 'rq' on its stack |
3427 | * frame will be invalid. | 3427 | * frame will be invalid. |
3428 | */ | 3428 | */ |
3429 | finish_task_switch(this_rq(), prev); | 3429 | finish_task_switch(this_rq(), prev); |
3430 | } else | 3430 | } else |
3431 | spin_unlock_irq(&rq->lock); | 3431 | spin_unlock_irq(&rq->lock); |
3432 | 3432 | ||
3433 | prev = current; | 3433 | prev = current; |
3434 | if (unlikely(reacquire_kernel_lock(prev) < 0)) | 3434 | if (unlikely(reacquire_kernel_lock(prev) < 0)) |
3435 | goto need_resched_nonpreemptible; | 3435 | goto need_resched_nonpreemptible; |
3436 | preempt_enable_no_resched(); | 3436 | preempt_enable_no_resched(); |
3437 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3437 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
3438 | goto need_resched; | 3438 | goto need_resched; |
3439 | } | 3439 | } |
3440 | EXPORT_SYMBOL(schedule); | 3440 | EXPORT_SYMBOL(schedule); |
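For reference, the canonical caller-side pattern built on schedule() sets the task state, re-checks the wakeup condition, and only then drops the CPU. A hedged sketch (the flag and helper function are hypothetical, not part of this file):

    /* Minimal open-coded wait loop on top of schedule().  'my_event_pending'
     * is a hypothetical flag set by an interrupt handler that also calls
     * wake_up_process() on this task. */
    static volatile int my_event_pending;

    static int my_wait_for_event(void)
    {
            while (!my_event_pending) {
                    set_current_state(TASK_INTERRUPTIBLE);
                    if (my_event_pending)           /* re-check after setting state */
                            break;
                    if (signal_pending(current)) {
                            __set_current_state(TASK_RUNNING);
                            return -ERESTARTSYS;
                    }
                    schedule();                     /* yield the CPU until woken */
            }
            __set_current_state(TASK_RUNNING);
            return 0;
    }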
3441 | 3441 | ||
3442 | #ifdef CONFIG_PREEMPT | 3442 | #ifdef CONFIG_PREEMPT |
3443 | /* | 3443 | /* |
3444 | * this is the entry point to schedule() from in-kernel preemption | 3444 | * this is the entry point to schedule() from in-kernel preemption |
3445 | * off of preempt_enable. Kernel preemptions off the return-from-interrupt | 3445 | * off of preempt_enable. Kernel preemptions off the return-from-interrupt |
3446 | * path are handled by preempt_schedule_irq() and call schedule directly. | 3446 | * path are handled by preempt_schedule_irq() and call schedule directly. |
3447 | */ | 3447 | */ |
3448 | asmlinkage void __sched preempt_schedule(void) | 3448 | asmlinkage void __sched preempt_schedule(void) |
3449 | { | 3449 | { |
3450 | struct thread_info *ti = current_thread_info(); | 3450 | struct thread_info *ti = current_thread_info(); |
3451 | #ifdef CONFIG_PREEMPT_BKL | 3451 | #ifdef CONFIG_PREEMPT_BKL |
3452 | struct task_struct *task = current; | 3452 | struct task_struct *task = current; |
3453 | int saved_lock_depth; | 3453 | int saved_lock_depth; |
3454 | #endif | 3454 | #endif |
3455 | /* | 3455 | /* |
3456 | * If there is a non-zero preempt_count or interrupts are disabled, | 3456 | * If there is a non-zero preempt_count or interrupts are disabled, |
3457 | * we do not want to preempt the current task. Just return.. | 3457 | * we do not want to preempt the current task. Just return.. |
3458 | */ | 3458 | */ |
3459 | if (likely(ti->preempt_count || irqs_disabled())) | 3459 | if (likely(ti->preempt_count || irqs_disabled())) |
3460 | return; | 3460 | return; |
3461 | 3461 | ||
3462 | need_resched: | 3462 | need_resched: |
3463 | add_preempt_count(PREEMPT_ACTIVE); | 3463 | add_preempt_count(PREEMPT_ACTIVE); |
3464 | /* | 3464 | /* |
3465 | * We keep the big kernel semaphore locked, but we | 3465 | * We keep the big kernel semaphore locked, but we |
3466 | * clear ->lock_depth so that schedule() doesn't | 3466 | * clear ->lock_depth so that schedule() doesn't |
3467 | * auto-release the semaphore: | 3467 | * auto-release the semaphore: |
3468 | */ | 3468 | */ |
3469 | #ifdef CONFIG_PREEMPT_BKL | 3469 | #ifdef CONFIG_PREEMPT_BKL |
3470 | saved_lock_depth = task->lock_depth; | 3470 | saved_lock_depth = task->lock_depth; |
3471 | task->lock_depth = -1; | 3471 | task->lock_depth = -1; |
3472 | #endif | 3472 | #endif |
3473 | schedule(); | 3473 | schedule(); |
3474 | #ifdef CONFIG_PREEMPT_BKL | 3474 | #ifdef CONFIG_PREEMPT_BKL |
3475 | task->lock_depth = saved_lock_depth; | 3475 | task->lock_depth = saved_lock_depth; |
3476 | #endif | 3476 | #endif |
3477 | sub_preempt_count(PREEMPT_ACTIVE); | 3477 | sub_preempt_count(PREEMPT_ACTIVE); |
3478 | 3478 | ||
3479 | /* we could miss a preemption opportunity between schedule and now */ | 3479 | /* we could miss a preemption opportunity between schedule and now */ |
3480 | barrier(); | 3480 | barrier(); |
3481 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3481 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
3482 | goto need_resched; | 3482 | goto need_resched; |
3483 | } | 3483 | } |
3484 | EXPORT_SYMBOL(preempt_schedule); | 3484 | EXPORT_SYMBOL(preempt_schedule); |
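preempt_schedule() is only ever reached through preempt_enable(); a hedged sketch of the kind of critical section that ends up there (the per-CPU counter is hypothetical):

    /* Preemption-safe access to per-CPU data.  If a higher-priority task
     * became runnable while preemption was off, the closing preempt_enable()
     * falls through into preempt_schedule(). */
    static DEFINE_PER_CPU(unsigned long, my_hits);     /* hypothetical counter */

    static void my_count_hit(void)
    {
            preempt_disable();                 /* no migration, no preemption */
            __get_cpu_var(my_hits)++;
            preempt_enable();                  /* may enter preempt_schedule() */
    }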
3485 | 3485 | ||
3486 | /* | 3486 | /* |
3487 | * this is the entry point to schedule() from kernel preemption | 3487 | * this is the entry point to schedule() from kernel preemption |
3488 | * off of irq context. | 3488 | * off of irq context. |
3489 | * Note that this is called and returns with irqs disabled. This will | 3489 | * Note that this is called and returns with irqs disabled. This will |
3490 | * protect us against recursive calling from irq. | 3490 | * protect us against recursive calling from irq. |
3491 | */ | 3491 | */ |
3492 | asmlinkage void __sched preempt_schedule_irq(void) | 3492 | asmlinkage void __sched preempt_schedule_irq(void) |
3493 | { | 3493 | { |
3494 | struct thread_info *ti = current_thread_info(); | 3494 | struct thread_info *ti = current_thread_info(); |
3495 | #ifdef CONFIG_PREEMPT_BKL | 3495 | #ifdef CONFIG_PREEMPT_BKL |
3496 | struct task_struct *task = current; | 3496 | struct task_struct *task = current; |
3497 | int saved_lock_depth; | 3497 | int saved_lock_depth; |
3498 | #endif | 3498 | #endif |
3499 | /* Catch callers which need to be fixed */ | 3499 | /* Catch callers which need to be fixed */ |
3500 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 3500 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
3501 | 3501 | ||
3502 | need_resched: | 3502 | need_resched: |
3503 | add_preempt_count(PREEMPT_ACTIVE); | 3503 | add_preempt_count(PREEMPT_ACTIVE); |
3504 | /* | 3504 | /* |
3505 | * We keep the big kernel semaphore locked, but we | 3505 | * We keep the big kernel semaphore locked, but we |
3506 | * clear ->lock_depth so that schedule() doesn't | 3506 | * clear ->lock_depth so that schedule() doesn't |
3507 | * auto-release the semaphore: | 3507 | * auto-release the semaphore: |
3508 | */ | 3508 | */ |
3509 | #ifdef CONFIG_PREEMPT_BKL | 3509 | #ifdef CONFIG_PREEMPT_BKL |
3510 | saved_lock_depth = task->lock_depth; | 3510 | saved_lock_depth = task->lock_depth; |
3511 | task->lock_depth = -1; | 3511 | task->lock_depth = -1; |
3512 | #endif | 3512 | #endif |
3513 | local_irq_enable(); | 3513 | local_irq_enable(); |
3514 | schedule(); | 3514 | schedule(); |
3515 | local_irq_disable(); | 3515 | local_irq_disable(); |
3516 | #ifdef CONFIG_PREEMPT_BKL | 3516 | #ifdef CONFIG_PREEMPT_BKL |
3517 | task->lock_depth = saved_lock_depth; | 3517 | task->lock_depth = saved_lock_depth; |
3518 | #endif | 3518 | #endif |
3519 | sub_preempt_count(PREEMPT_ACTIVE); | 3519 | sub_preempt_count(PREEMPT_ACTIVE); |
3520 | 3520 | ||
3521 | /* we could miss a preemption opportunity between schedule and now */ | 3521 | /* we could miss a preemption opportunity between schedule and now */ |
3522 | barrier(); | 3522 | barrier(); |
3523 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3523 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
3524 | goto need_resched; | 3524 | goto need_resched; |
3525 | } | 3525 | } |
3526 | 3526 | ||
3527 | #endif /* CONFIG_PREEMPT */ | 3527 | #endif /* CONFIG_PREEMPT */ |
3528 | 3528 | ||
3529 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, | 3529 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, |
3530 | void *key) | 3530 | void *key) |
3531 | { | 3531 | { |
3532 | return try_to_wake_up(curr->private, mode, sync); | 3532 | return try_to_wake_up(curr->private, mode, sync); |
3533 | } | 3533 | } |
3534 | EXPORT_SYMBOL(default_wake_function); | 3534 | EXPORT_SYMBOL(default_wake_function); |
3535 | 3535 | ||
3536 | /* | 3536 | /* |
3537 | * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just | 3537 | * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just |
3538 | * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve | 3538 | * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve |
3539 | * number) then we wake all the non-exclusive tasks and one exclusive task. | 3539 | * number) then we wake all the non-exclusive tasks and one exclusive task. |
3540 | * | 3540 | * |
3541 | * There are circumstances in which we can try to wake a task which has already | 3541 | * There are circumstances in which we can try to wake a task which has already |
3542 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns | 3542 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns |
3543 | * zero in this (rare) case, and we handle it by continuing to scan the queue. | 3543 | * zero in this (rare) case, and we handle it by continuing to scan the queue. |
3544 | */ | 3544 | */ |
3545 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | 3545 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, |
3546 | int nr_exclusive, int sync, void *key) | 3546 | int nr_exclusive, int sync, void *key) |
3547 | { | 3547 | { |
3548 | struct list_head *tmp, *next; | 3548 | struct list_head *tmp, *next; |
3549 | 3549 | ||
3550 | list_for_each_safe(tmp, next, &q->task_list) { | 3550 | list_for_each_safe(tmp, next, &q->task_list) { |
3551 | wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); | 3551 | wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); |
3552 | unsigned flags = curr->flags; | 3552 | unsigned flags = curr->flags; |
3553 | 3553 | ||
3554 | if (curr->func(curr, mode, sync, key) && | 3554 | if (curr->func(curr, mode, sync, key) && |
3555 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) | 3555 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) |
3556 | break; | 3556 | break; |
3557 | } | 3557 | } |
3558 | } | 3558 | } |
3559 | 3559 | ||
3560 | /** | 3560 | /** |
3561 | * __wake_up - wake up threads blocked on a waitqueue. | 3561 | * __wake_up - wake up threads blocked on a waitqueue. |
3562 | * @q: the waitqueue | 3562 | * @q: the waitqueue |
3563 | * @mode: which threads | 3563 | * @mode: which threads |
3564 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | 3564 | * @nr_exclusive: how many wake-one or wake-many threads to wake up |
3565 | * @key: is directly passed to the wakeup function | 3565 | * @key: is directly passed to the wakeup function |
3566 | */ | 3566 | */ |
3567 | void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, | 3567 | void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, |
3568 | int nr_exclusive, void *key) | 3568 | int nr_exclusive, void *key) |
3569 | { | 3569 | { |
3570 | unsigned long flags; | 3570 | unsigned long flags; |
3571 | 3571 | ||
3572 | spin_lock_irqsave(&q->lock, flags); | 3572 | spin_lock_irqsave(&q->lock, flags); |
3573 | __wake_up_common(q, mode, nr_exclusive, 0, key); | 3573 | __wake_up_common(q, mode, nr_exclusive, 0, key); |
3574 | spin_unlock_irqrestore(&q->lock, flags); | 3574 | spin_unlock_irqrestore(&q->lock, flags); |
3575 | } | 3575 | } |
3576 | EXPORT_SYMBOL(__wake_up); | 3576 | EXPORT_SYMBOL(__wake_up); |
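__wake_up() is normally reached via the wake_up*() macros, paired with wait_event*() on the sleeping side; a hedged producer/consumer sketch (the flag is hypothetical):

    static DECLARE_WAIT_QUEUE_HEAD(my_wq);
    static int my_data_ready;                  /* hypothetical condition */

    static int my_consumer(void)
    {
            /* Sleeps in TASK_INTERRUPTIBLE until my_data_ready is non-zero;
             * returns -ERESTARTSYS if interrupted by a signal. */
            return wait_event_interruptible(my_wq, my_data_ready);
    }

    static void my_producer(void)
    {
            my_data_ready = 1;
            wake_up(&my_wq);   /* ends up in __wake_up(&my_wq, ..., 1, NULL) */
    }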
3577 | 3577 | ||
3578 | /* | 3578 | /* |
3579 | * Same as __wake_up but called with the spinlock in wait_queue_head_t held. | 3579 | * Same as __wake_up but called with the spinlock in wait_queue_head_t held. |
3580 | */ | 3580 | */ |
3581 | void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) | 3581 | void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) |
3582 | { | 3582 | { |
3583 | __wake_up_common(q, mode, 1, 0, NULL); | 3583 | __wake_up_common(q, mode, 1, 0, NULL); |
3584 | } | 3584 | } |
3585 | 3585 | ||
3586 | /** | 3586 | /** |
3587 | * __wake_up_sync - wake up threads blocked on a waitqueue. | 3587 | * __wake_up_sync - wake up threads blocked on a waitqueue. |
3588 | * @q: the waitqueue | 3588 | * @q: the waitqueue |
3589 | * @mode: which threads | 3589 | * @mode: which threads |
3590 | * @nr_exclusive: how many wake-one or wake-many threads to wake up | 3590 | * @nr_exclusive: how many wake-one or wake-many threads to wake up |
3591 | * | 3591 | * |
3592 | * The sync wakeup differs in that the waker knows that it will schedule | 3592 | * The sync wakeup differs in that the waker knows that it will schedule |
3593 | * away soon, so while the target thread will be woken up, it will not | 3593 | * away soon, so while the target thread will be woken up, it will not |
3594 | * be migrated to another CPU - ie. the two threads are 'synchronized' | 3594 | * be migrated to another CPU - ie. the two threads are 'synchronized' |
3595 | * with each other. This can prevent needless bouncing between CPUs. | 3595 | * with each other. This can prevent needless bouncing between CPUs. |
3596 | * | 3596 | * |
3597 | * On UP it can prevent extra preemption. | 3597 | * On UP it can prevent extra preemption. |
3598 | */ | 3598 | */ |
3599 | void fastcall | 3599 | void fastcall |
3600 | __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) | 3600 | __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) |
3601 | { | 3601 | { |
3602 | unsigned long flags; | 3602 | unsigned long flags; |
3603 | int sync = 1; | 3603 | int sync = 1; |
3604 | 3604 | ||
3605 | if (unlikely(!q)) | 3605 | if (unlikely(!q)) |
3606 | return; | 3606 | return; |
3607 | 3607 | ||
3608 | if (unlikely(!nr_exclusive)) | 3608 | if (unlikely(!nr_exclusive)) |
3609 | sync = 0; | 3609 | sync = 0; |
3610 | 3610 | ||
3611 | spin_lock_irqsave(&q->lock, flags); | 3611 | spin_lock_irqsave(&q->lock, flags); |
3612 | __wake_up_common(q, mode, nr_exclusive, sync, NULL); | 3612 | __wake_up_common(q, mode, nr_exclusive, sync, NULL); |
3613 | spin_unlock_irqrestore(&q->lock, flags); | 3613 | spin_unlock_irqrestore(&q->lock, flags); |
3614 | } | 3614 | } |
3615 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ | 3615 | EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ |
3616 | 3616 | ||
3617 | void fastcall complete(struct completion *x) | 3617 | void fastcall complete(struct completion *x) |
3618 | { | 3618 | { |
3619 | unsigned long flags; | 3619 | unsigned long flags; |
3620 | 3620 | ||
3621 | spin_lock_irqsave(&x->wait.lock, flags); | 3621 | spin_lock_irqsave(&x->wait.lock, flags); |
3622 | x->done++; | 3622 | x->done++; |
3623 | __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, | 3623 | __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, |
3624 | 1, 0, NULL); | 3624 | 1, 0, NULL); |
3625 | spin_unlock_irqrestore(&x->wait.lock, flags); | 3625 | spin_unlock_irqrestore(&x->wait.lock, flags); |
3626 | } | 3626 | } |
3627 | EXPORT_SYMBOL(complete); | 3627 | EXPORT_SYMBOL(complete); |
3628 | 3628 | ||
3629 | void fastcall complete_all(struct completion *x) | 3629 | void fastcall complete_all(struct completion *x) |
3630 | { | 3630 | { |
3631 | unsigned long flags; | 3631 | unsigned long flags; |
3632 | 3632 | ||
3633 | spin_lock_irqsave(&x->wait.lock, flags); | 3633 | spin_lock_irqsave(&x->wait.lock, flags); |
3634 | x->done += UINT_MAX/2; | 3634 | x->done += UINT_MAX/2; |
3635 | __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, | 3635 | __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, |
3636 | 0, 0, NULL); | 3636 | 0, 0, NULL); |
3637 | spin_unlock_irqrestore(&x->wait.lock, flags); | 3637 | spin_unlock_irqrestore(&x->wait.lock, flags); |
3638 | } | 3638 | } |
3639 | EXPORT_SYMBOL(complete_all); | 3639 | EXPORT_SYMBOL(complete_all); |
3640 | 3640 | ||
3641 | void fastcall __sched wait_for_completion(struct completion *x) | 3641 | void fastcall __sched wait_for_completion(struct completion *x) |
3642 | { | 3642 | { |
3643 | might_sleep(); | 3643 | might_sleep(); |
3644 | 3644 | ||
3645 | spin_lock_irq(&x->wait.lock); | 3645 | spin_lock_irq(&x->wait.lock); |
3646 | if (!x->done) { | 3646 | if (!x->done) { |
3647 | DECLARE_WAITQUEUE(wait, current); | 3647 | DECLARE_WAITQUEUE(wait, current); |
3648 | 3648 | ||
3649 | wait.flags |= WQ_FLAG_EXCLUSIVE; | 3649 | wait.flags |= WQ_FLAG_EXCLUSIVE; |
3650 | __add_wait_queue_tail(&x->wait, &wait); | 3650 | __add_wait_queue_tail(&x->wait, &wait); |
3651 | do { | 3651 | do { |
3652 | __set_current_state(TASK_UNINTERRUPTIBLE); | 3652 | __set_current_state(TASK_UNINTERRUPTIBLE); |
3653 | spin_unlock_irq(&x->wait.lock); | 3653 | spin_unlock_irq(&x->wait.lock); |
3654 | schedule(); | 3654 | schedule(); |
3655 | spin_lock_irq(&x->wait.lock); | 3655 | spin_lock_irq(&x->wait.lock); |
3656 | } while (!x->done); | 3656 | } while (!x->done); |
3657 | __remove_wait_queue(&x->wait, &wait); | 3657 | __remove_wait_queue(&x->wait, &wait); |
3658 | } | 3658 | } |
3659 | x->done--; | 3659 | x->done--; |
3660 | spin_unlock_irq(&x->wait.lock); | 3660 | spin_unlock_irq(&x->wait.lock); |
3661 | } | 3661 | } |
3662 | EXPORT_SYMBOL(wait_for_completion); | 3662 | EXPORT_SYMBOL(wait_for_completion); |
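A typical use of the completion API: one context blocks in wait_for_completion() until another calls complete(). A hedged sketch with a hypothetical helper thread (assumes <linux/completion.h>, <linux/kthread.h> and <linux/err.h>):

    static DECLARE_COMPLETION(my_setup_done);

    static int my_worker(void *unused)
    {
            /* ... one-time initialisation ... */
            complete(&my_setup_done);          /* releases one exclusive waiter */
            return 0;
    }

    static void my_start(void)
    {
            struct task_struct *tsk;

            tsk = kthread_run(my_worker, NULL, "my_worker");
            if (!IS_ERR(tsk))
                    wait_for_completion(&my_setup_done);   /* uninterruptible sleep */
    }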
3663 | 3663 | ||
3664 | unsigned long fastcall __sched | 3664 | unsigned long fastcall __sched |
3665 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) | 3665 | wait_for_completion_timeout(struct completion *x, unsigned long timeout) |
3666 | { | 3666 | { |
3667 | might_sleep(); | 3667 | might_sleep(); |
3668 | 3668 | ||
3669 | spin_lock_irq(&x->wait.lock); | 3669 | spin_lock_irq(&x->wait.lock); |
3670 | if (!x->done) { | 3670 | if (!x->done) { |
3671 | DECLARE_WAITQUEUE(wait, current); | 3671 | DECLARE_WAITQUEUE(wait, current); |
3672 | 3672 | ||
3673 | wait.flags |= WQ_FLAG_EXCLUSIVE; | 3673 | wait.flags |= WQ_FLAG_EXCLUSIVE; |
3674 | __add_wait_queue_tail(&x->wait, &wait); | 3674 | __add_wait_queue_tail(&x->wait, &wait); |
3675 | do { | 3675 | do { |
3676 | __set_current_state(TASK_UNINTERRUPTIBLE); | 3676 | __set_current_state(TASK_UNINTERRUPTIBLE); |
3677 | spin_unlock_irq(&x->wait.lock); | 3677 | spin_unlock_irq(&x->wait.lock); |
3678 | timeout = schedule_timeout(timeout); | 3678 | timeout = schedule_timeout(timeout); |
3679 | spin_lock_irq(&x->wait.lock); | 3679 | spin_lock_irq(&x->wait.lock); |
3680 | if (!timeout) { | 3680 | if (!timeout) { |
3681 | __remove_wait_queue(&x->wait, &wait); | 3681 | __remove_wait_queue(&x->wait, &wait); |
3682 | goto out; | 3682 | goto out; |
3683 | } | 3683 | } |
3684 | } while (!x->done); | 3684 | } while (!x->done); |
3685 | __remove_wait_queue(&x->wait, &wait); | 3685 | __remove_wait_queue(&x->wait, &wait); |
3686 | } | 3686 | } |
3687 | x->done--; | 3687 | x->done--; |
3688 | out: | 3688 | out: |
3689 | spin_unlock_irq(&x->wait.lock); | 3689 | spin_unlock_irq(&x->wait.lock); |
3690 | return timeout; | 3690 | return timeout; |
3691 | } | 3691 | } |
3692 | EXPORT_SYMBOL(wait_for_completion_timeout); | 3692 | EXPORT_SYMBOL(wait_for_completion_timeout); |
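The timeout variant returns the jiffies left (0 on timeout), so callers usually convert from milliseconds and test the result; a hedged sketch with a hypothetical completion:

    static DECLARE_COMPLETION(my_irq_done);

    static int my_wait_for_irq(void)
    {
            unsigned long left;

            left = wait_for_completion_timeout(&my_irq_done,
                                               msecs_to_jiffies(500));
            if (!left)
                    return -ETIMEDOUT;         /* no complete() within 500 ms */
            return 0;
    }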
3693 | 3693 | ||
3694 | int fastcall __sched wait_for_completion_interruptible(struct completion *x) | 3694 | int fastcall __sched wait_for_completion_interruptible(struct completion *x) |
3695 | { | 3695 | { |
3696 | int ret = 0; | 3696 | int ret = 0; |
3697 | 3697 | ||
3698 | might_sleep(); | 3698 | might_sleep(); |
3699 | 3699 | ||
3700 | spin_lock_irq(&x->wait.lock); | 3700 | spin_lock_irq(&x->wait.lock); |
3701 | if (!x->done) { | 3701 | if (!x->done) { |
3702 | DECLARE_WAITQUEUE(wait, current); | 3702 | DECLARE_WAITQUEUE(wait, current); |
3703 | 3703 | ||
3704 | wait.flags |= WQ_FLAG_EXCLUSIVE; | 3704 | wait.flags |= WQ_FLAG_EXCLUSIVE; |
3705 | __add_wait_queue_tail(&x->wait, &wait); | 3705 | __add_wait_queue_tail(&x->wait, &wait); |
3706 | do { | 3706 | do { |
3707 | if (signal_pending(current)) { | 3707 | if (signal_pending(current)) { |
3708 | ret = -ERESTARTSYS; | 3708 | ret = -ERESTARTSYS; |
3709 | __remove_wait_queue(&x->wait, &wait); | 3709 | __remove_wait_queue(&x->wait, &wait); |
3710 | goto out; | 3710 | goto out; |
3711 | } | 3711 | } |
3712 | __set_current_state(TASK_INTERRUPTIBLE); | 3712 | __set_current_state(TASK_INTERRUPTIBLE); |
3713 | spin_unlock_irq(&x->wait.lock); | 3713 | spin_unlock_irq(&x->wait.lock); |
3714 | schedule(); | 3714 | schedule(); |
3715 | spin_lock_irq(&x->wait.lock); | 3715 | spin_lock_irq(&x->wait.lock); |
3716 | } while (!x->done); | 3716 | } while (!x->done); |
3717 | __remove_wait_queue(&x->wait, &wait); | 3717 | __remove_wait_queue(&x->wait, &wait); |
3718 | } | 3718 | } |
3719 | x->done--; | 3719 | x->done--; |
3720 | out: | 3720 | out: |
3721 | spin_unlock_irq(&x->wait.lock); | 3721 | spin_unlock_irq(&x->wait.lock); |
3722 | 3722 | ||
3723 | return ret; | 3723 | return ret; |
3724 | } | 3724 | } |
3725 | EXPORT_SYMBOL(wait_for_completion_interruptible); | 3725 | EXPORT_SYMBOL(wait_for_completion_interruptible); |
3726 | 3726 | ||
3727 | unsigned long fastcall __sched | 3727 | unsigned long fastcall __sched |
3728 | wait_for_completion_interruptible_timeout(struct completion *x, | 3728 | wait_for_completion_interruptible_timeout(struct completion *x, |
3729 | unsigned long timeout) | 3729 | unsigned long timeout) |
3730 | { | 3730 | { |
3731 | might_sleep(); | 3731 | might_sleep(); |
3732 | 3732 | ||
3733 | spin_lock_irq(&x->wait.lock); | 3733 | spin_lock_irq(&x->wait.lock); |
3734 | if (!x->done) { | 3734 | if (!x->done) { |
3735 | DECLARE_WAITQUEUE(wait, current); | 3735 | DECLARE_WAITQUEUE(wait, current); |
3736 | 3736 | ||
3737 | wait.flags |= WQ_FLAG_EXCLUSIVE; | 3737 | wait.flags |= WQ_FLAG_EXCLUSIVE; |
3738 | __add_wait_queue_tail(&x->wait, &wait); | 3738 | __add_wait_queue_tail(&x->wait, &wait); |
3739 | do { | 3739 | do { |
3740 | if (signal_pending(current)) { | 3740 | if (signal_pending(current)) { |
3741 | timeout = -ERESTARTSYS; | 3741 | timeout = -ERESTARTSYS; |
3742 | __remove_wait_queue(&x->wait, &wait); | 3742 | __remove_wait_queue(&x->wait, &wait); |
3743 | goto out; | 3743 | goto out; |
3744 | } | 3744 | } |
3745 | __set_current_state(TASK_INTERRUPTIBLE); | 3745 | __set_current_state(TASK_INTERRUPTIBLE); |
3746 | spin_unlock_irq(&x->wait.lock); | 3746 | spin_unlock_irq(&x->wait.lock); |
3747 | timeout = schedule_timeout(timeout); | 3747 | timeout = schedule_timeout(timeout); |
3748 | spin_lock_irq(&x->wait.lock); | 3748 | spin_lock_irq(&x->wait.lock); |
3749 | if (!timeout) { | 3749 | if (!timeout) { |
3750 | __remove_wait_queue(&x->wait, &wait); | 3750 | __remove_wait_queue(&x->wait, &wait); |
3751 | goto out; | 3751 | goto out; |
3752 | } | 3752 | } |
3753 | } while (!x->done); | 3753 | } while (!x->done); |
3754 | __remove_wait_queue(&x->wait, &wait); | 3754 | __remove_wait_queue(&x->wait, &wait); |
3755 | } | 3755 | } |
3756 | x->done--; | 3756 | x->done--; |
3757 | out: | 3757 | out: |
3758 | spin_unlock_irq(&x->wait.lock); | 3758 | spin_unlock_irq(&x->wait.lock); |
3759 | return timeout; | 3759 | return timeout; |
3760 | } | 3760 | } |
3761 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | 3761 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); |
3762 | 3762 | ||
3763 | 3763 | ||
3764 | #define SLEEP_ON_VAR \ | 3764 | #define SLEEP_ON_VAR \ |
3765 | unsigned long flags; \ | 3765 | unsigned long flags; \ |
3766 | wait_queue_t wait; \ | 3766 | wait_queue_t wait; \ |
3767 | init_waitqueue_entry(&wait, current); | 3767 | init_waitqueue_entry(&wait, current); |
3768 | 3768 | ||
3769 | #define SLEEP_ON_HEAD \ | 3769 | #define SLEEP_ON_HEAD \ |
3770 | spin_lock_irqsave(&q->lock,flags); \ | 3770 | spin_lock_irqsave(&q->lock,flags); \ |
3771 | __add_wait_queue(q, &wait); \ | 3771 | __add_wait_queue(q, &wait); \ |
3772 | spin_unlock(&q->lock); | 3772 | spin_unlock(&q->lock); |
3773 | 3773 | ||
3774 | #define SLEEP_ON_TAIL \ | 3774 | #define SLEEP_ON_TAIL \ |
3775 | spin_lock_irq(&q->lock); \ | 3775 | spin_lock_irq(&q->lock); \ |
3776 | __remove_wait_queue(q, &wait); \ | 3776 | __remove_wait_queue(q, &wait); \ |
3777 | spin_unlock_irqrestore(&q->lock, flags); | 3777 | spin_unlock_irqrestore(&q->lock, flags); |
3778 | 3778 | ||
3779 | void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) | 3779 | void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) |
3780 | { | 3780 | { |
3781 | SLEEP_ON_VAR | 3781 | SLEEP_ON_VAR |
3782 | 3782 | ||
3783 | current->state = TASK_INTERRUPTIBLE; | 3783 | current->state = TASK_INTERRUPTIBLE; |
3784 | 3784 | ||
3785 | SLEEP_ON_HEAD | 3785 | SLEEP_ON_HEAD |
3786 | schedule(); | 3786 | schedule(); |
3787 | SLEEP_ON_TAIL | 3787 | SLEEP_ON_TAIL |
3788 | } | 3788 | } |
3789 | EXPORT_SYMBOL(interruptible_sleep_on); | 3789 | EXPORT_SYMBOL(interruptible_sleep_on); |
3790 | 3790 | ||
3791 | long fastcall __sched | 3791 | long fastcall __sched |
3792 | interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) | 3792 | interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) |
3793 | { | 3793 | { |
3794 | SLEEP_ON_VAR | 3794 | SLEEP_ON_VAR |
3795 | 3795 | ||
3796 | current->state = TASK_INTERRUPTIBLE; | 3796 | current->state = TASK_INTERRUPTIBLE; |
3797 | 3797 | ||
3798 | SLEEP_ON_HEAD | 3798 | SLEEP_ON_HEAD |
3799 | timeout = schedule_timeout(timeout); | 3799 | timeout = schedule_timeout(timeout); |
3800 | SLEEP_ON_TAIL | 3800 | SLEEP_ON_TAIL |
3801 | 3801 | ||
3802 | return timeout; | 3802 | return timeout; |
3803 | } | 3803 | } |
3804 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); | 3804 | EXPORT_SYMBOL(interruptible_sleep_on_timeout); |
3805 | 3805 | ||
3806 | void fastcall __sched sleep_on(wait_queue_head_t *q) | 3806 | void fastcall __sched sleep_on(wait_queue_head_t *q) |
3807 | { | 3807 | { |
3808 | SLEEP_ON_VAR | 3808 | SLEEP_ON_VAR |
3809 | 3809 | ||
3810 | current->state = TASK_UNINTERRUPTIBLE; | 3810 | current->state = TASK_UNINTERRUPTIBLE; |
3811 | 3811 | ||
3812 | SLEEP_ON_HEAD | 3812 | SLEEP_ON_HEAD |
3813 | schedule(); | 3813 | schedule(); |
3814 | SLEEP_ON_TAIL | 3814 | SLEEP_ON_TAIL |
3815 | } | 3815 | } |
3816 | EXPORT_SYMBOL(sleep_on); | 3816 | EXPORT_SYMBOL(sleep_on); |
3817 | 3817 | ||
3818 | long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) | 3818 | long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) |
3819 | { | 3819 | { |
3820 | SLEEP_ON_VAR | 3820 | SLEEP_ON_VAR |
3821 | 3821 | ||
3822 | current->state = TASK_UNINTERRUPTIBLE; | 3822 | current->state = TASK_UNINTERRUPTIBLE; |
3823 | 3823 | ||
3824 | SLEEP_ON_HEAD | 3824 | SLEEP_ON_HEAD |
3825 | timeout = schedule_timeout(timeout); | 3825 | timeout = schedule_timeout(timeout); |
3826 | SLEEP_ON_TAIL | 3826 | SLEEP_ON_TAIL |
3827 | 3827 | ||
3828 | return timeout; | 3828 | return timeout; |
3829 | } | 3829 | } |
3830 | 3830 | ||
3831 | EXPORT_SYMBOL(sleep_on_timeout); | 3831 | EXPORT_SYMBOL(sleep_on_timeout); |
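The sleep_on() family is inherently racy (a wakeup that arrives between the caller's condition test and the enqueue is lost) and survives only for old drivers; new code uses wait_event*() instead. A hedged equivalent of sleep_on_timeout(), with a hypothetical queue and condition:

    static DECLARE_WAIT_QUEUE_HEAD(my_queue);
    static int my_cond;                        /* hypothetical condition */

    static long my_wait(long timeout)
    {
            /* Re-tests my_cond inside the waitqueue machinery, so a wakeup
             * racing with the test cannot be missed.  Returns the jiffies
             * remaining, or 0 if the timeout elapsed first. */
            return wait_event_timeout(my_queue, my_cond, timeout);
    }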
3832 | 3832 | ||
3833 | #ifdef CONFIG_RT_MUTEXES | 3833 | #ifdef CONFIG_RT_MUTEXES |
3834 | 3834 | ||
3835 | /* | 3835 | /* |
3836 | * rt_mutex_setprio - set the current priority of a task | 3836 | * rt_mutex_setprio - set the current priority of a task |
3837 | * @p: task | 3837 | * @p: task |
3838 | * @prio: prio value (kernel-internal form) | 3838 | * @prio: prio value (kernel-internal form) |
3839 | * | 3839 | * |
3840 | * This function changes the 'effective' priority of a task. It does | 3840 | * This function changes the 'effective' priority of a task. It does |
3841 | * not touch ->normal_prio like __setscheduler(). | 3841 | * not touch ->normal_prio like __setscheduler(). |
3842 | * | 3842 | * |
3843 | * Used by the rt_mutex code to implement priority inheritance logic. | 3843 | * Used by the rt_mutex code to implement priority inheritance logic. |
3844 | */ | 3844 | */ |
3845 | void rt_mutex_setprio(struct task_struct *p, int prio) | 3845 | void rt_mutex_setprio(struct task_struct *p, int prio) |
3846 | { | 3846 | { |
3847 | struct prio_array *array; | 3847 | struct prio_array *array; |
3848 | unsigned long flags; | 3848 | unsigned long flags; |
3849 | struct rq *rq; | 3849 | struct rq *rq; |
3850 | int oldprio; | 3850 | int oldprio; |
3851 | 3851 | ||
3852 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 3852 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
3853 | 3853 | ||
3854 | rq = task_rq_lock(p, &flags); | 3854 | rq = task_rq_lock(p, &flags); |
3855 | 3855 | ||
3856 | oldprio = p->prio; | 3856 | oldprio = p->prio; |
3857 | array = p->array; | 3857 | array = p->array; |
3858 | if (array) | 3858 | if (array) |
3859 | dequeue_task(p, array); | 3859 | dequeue_task(p, array); |
3860 | p->prio = prio; | 3860 | p->prio = prio; |
3861 | 3861 | ||
3862 | if (array) { | 3862 | if (array) { |
3863 | /* | 3863 | /* |
3864 | * If changing to an RT priority then queue it | 3864 | * If changing to an RT priority then queue it |
3865 | * in the active array! | 3865 | * in the active array! |
3866 | */ | 3866 | */ |
3867 | if (rt_task(p)) | 3867 | if (rt_task(p)) |
3868 | array = rq->active; | 3868 | array = rq->active; |
3869 | enqueue_task(p, array); | 3869 | enqueue_task(p, array); |
3870 | /* | 3870 | /* |
3871 | * Reschedule if we are currently running on this runqueue and | 3871 | * Reschedule if we are currently running on this runqueue and |
3872 | * our priority decreased, or if we are not currently running on | 3872 | * our priority decreased, or if we are not currently running on |
3873 | * this runqueue and our priority is higher than the current's | 3873 | * this runqueue and our priority is higher than the current's |
3874 | */ | 3874 | */ |
3875 | if (task_running(rq, p)) { | 3875 | if (task_running(rq, p)) { |
3876 | if (p->prio > oldprio) | 3876 | if (p->prio > oldprio) |
3877 | resched_task(rq->curr); | 3877 | resched_task(rq->curr); |
3878 | } else if (TASK_PREEMPTS_CURR(p, rq)) | 3878 | } else if (TASK_PREEMPTS_CURR(p, rq)) |
3879 | resched_task(rq->curr); | 3879 | resched_task(rq->curr); |
3880 | } | 3880 | } |
3881 | task_rq_unlock(rq, &flags); | 3881 | task_rq_unlock(rq, &flags); |
3882 | } | 3882 | } |
3883 | 3883 | ||
3884 | #endif | 3884 | #endif |
3885 | 3885 | ||
3886 | void set_user_nice(struct task_struct *p, long nice) | 3886 | void set_user_nice(struct task_struct *p, long nice) |
3887 | { | 3887 | { |
3888 | struct prio_array *array; | 3888 | struct prio_array *array; |
3889 | int old_prio, delta; | 3889 | int old_prio, delta; |
3890 | unsigned long flags; | 3890 | unsigned long flags; |
3891 | struct rq *rq; | 3891 | struct rq *rq; |
3892 | 3892 | ||
3893 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) | 3893 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) |
3894 | return; | 3894 | return; |
3895 | /* | 3895 | /* |
3896 | * We have to be careful, if called from sys_setpriority(), | 3896 | * We have to be careful, if called from sys_setpriority(), |
3897 | * the task might be in the middle of scheduling on another CPU. | 3897 | * the task might be in the middle of scheduling on another CPU. |
3898 | */ | 3898 | */ |
3899 | rq = task_rq_lock(p, &flags); | 3899 | rq = task_rq_lock(p, &flags); |
3900 | /* | 3900 | /* |
3901 | * The RT priorities are set via sched_setscheduler(), but we still | 3901 | * The RT priorities are set via sched_setscheduler(), but we still |
3902 | * allow the 'normal' nice value to be set - but as expected | 3902 | * allow the 'normal' nice value to be set - but as expected |
3903 | * it won't have any effect on scheduling until the task becomes | 3903 | * it won't have any effect on scheduling until the task becomes |
3904 | * SCHED_NORMAL/SCHED_BATCH again: | 3904 | * SCHED_NORMAL/SCHED_BATCH again: |
3905 | */ | 3905 | */ |
3906 | if (has_rt_policy(p)) { | 3906 | if (has_rt_policy(p)) { |
3907 | p->static_prio = NICE_TO_PRIO(nice); | 3907 | p->static_prio = NICE_TO_PRIO(nice); |
3908 | goto out_unlock; | 3908 | goto out_unlock; |
3909 | } | 3909 | } |
3910 | array = p->array; | 3910 | array = p->array; |
3911 | if (array) { | 3911 | if (array) { |
3912 | dequeue_task(p, array); | 3912 | dequeue_task(p, array); |
3913 | dec_raw_weighted_load(rq, p); | 3913 | dec_raw_weighted_load(rq, p); |
3914 | } | 3914 | } |
3915 | 3915 | ||
3916 | p->static_prio = NICE_TO_PRIO(nice); | 3916 | p->static_prio = NICE_TO_PRIO(nice); |
3917 | set_load_weight(p); | 3917 | set_load_weight(p); |
3918 | old_prio = p->prio; | 3918 | old_prio = p->prio; |
3919 | p->prio = effective_prio(p); | 3919 | p->prio = effective_prio(p); |
3920 | delta = p->prio - old_prio; | 3920 | delta = p->prio - old_prio; |
3921 | 3921 | ||
3922 | if (array) { | 3922 | if (array) { |
3923 | enqueue_task(p, array); | 3923 | enqueue_task(p, array); |
3924 | inc_raw_weighted_load(rq, p); | 3924 | inc_raw_weighted_load(rq, p); |
3925 | /* | 3925 | /* |
3926 | * If the task increased its priority or is running and | 3926 | * If the task increased its priority or is running and |
3927 | * lowered its priority, then reschedule its CPU: | 3927 | * lowered its priority, then reschedule its CPU: |
3928 | */ | 3928 | */ |
3929 | if (delta < 0 || (delta > 0 && task_running(rq, p))) | 3929 | if (delta < 0 || (delta > 0 && task_running(rq, p))) |
3930 | resched_task(rq->curr); | 3930 | resched_task(rq->curr); |
3931 | } | 3931 | } |
3932 | out_unlock: | 3932 | out_unlock: |
3933 | task_rq_unlock(rq, &flags); | 3933 | task_rq_unlock(rq, &flags); |
3934 | } | 3934 | } |
3935 | EXPORT_SYMBOL(set_user_nice); | 3935 | EXPORT_SYMBOL(set_user_nice); |
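In-kernel callers mostly use set_user_nice() right after creating a kernel thread, to push background work out of the way; a hedged sketch (the thread function is hypothetical):

    static int my_helper(void *unused)
    {
            set_user_nice(current, 19);        /* lowest priority, nice +19 */
            while (!kthread_should_stop())
                    schedule_timeout_interruptible(HZ);   /* placeholder work loop */
            return 0;
    }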
3936 | 3936 | ||
3937 | /* | 3937 | /* |
3938 | * can_nice - check if a task can reduce its nice value | 3938 | * can_nice - check if a task can reduce its nice value |
3939 | * @p: task | 3939 | * @p: task |
3940 | * @nice: nice value | 3940 | * @nice: nice value |
3941 | */ | 3941 | */ |
3942 | int can_nice(const struct task_struct *p, const int nice) | 3942 | int can_nice(const struct task_struct *p, const int nice) |
3943 | { | 3943 | { |
3944 | /* convert nice value [19,-20] to rlimit style value [1,40] */ | 3944 | /* convert nice value [19,-20] to rlimit style value [1,40] */ |
3945 | int nice_rlim = 20 - nice; | 3945 | int nice_rlim = 20 - nice; |
3946 | 3946 | ||
3947 | return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || | 3947 | return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || |
3948 | capable(CAP_SYS_NICE)); | 3948 | capable(CAP_SYS_NICE)); |
3949 | } | 3949 | } |
3950 | 3950 | ||
3951 | #ifdef __ARCH_WANT_SYS_NICE | 3951 | #ifdef __ARCH_WANT_SYS_NICE |
3952 | 3952 | ||
3953 | /* | 3953 | /* |
3954 | * sys_nice - change the priority of the current process. | 3954 | * sys_nice - change the priority of the current process. |
3955 | * @increment: priority increment | 3955 | * @increment: priority increment |
3956 | * | 3956 | * |
3957 | * sys_setpriority is a more generic, but much slower function that | 3957 | * sys_setpriority is a more generic, but much slower function that |
3958 | * does similar things. | 3958 | * does similar things. |
3959 | */ | 3959 | */ |
3960 | asmlinkage long sys_nice(int increment) | 3960 | asmlinkage long sys_nice(int increment) |
3961 | { | 3961 | { |
3962 | long nice, retval; | 3962 | long nice, retval; |
3963 | 3963 | ||
3964 | /* | 3964 | /* |
3965 | * Setpriority might change our priority at the same moment. | 3965 | * Setpriority might change our priority at the same moment. |
3966 | * We don't have to worry. Conceptually one call occurs first | 3966 | * We don't have to worry. Conceptually one call occurs first |
3967 | * and we have a single winner. | 3967 | * and we have a single winner. |
3968 | */ | 3968 | */ |
3969 | if (increment < -40) | 3969 | if (increment < -40) |
3970 | increment = -40; | 3970 | increment = -40; |
3971 | if (increment > 40) | 3971 | if (increment > 40) |
3972 | increment = 40; | 3972 | increment = 40; |
3973 | 3973 | ||
3974 | nice = PRIO_TO_NICE(current->static_prio) + increment; | 3974 | nice = PRIO_TO_NICE(current->static_prio) + increment; |
3975 | if (nice < -20) | 3975 | if (nice < -20) |
3976 | nice = -20; | 3976 | nice = -20; |
3977 | if (nice > 19) | 3977 | if (nice > 19) |
3978 | nice = 19; | 3978 | nice = 19; |
3979 | 3979 | ||
3980 | if (increment < 0 && !can_nice(current, nice)) | 3980 | if (increment < 0 && !can_nice(current, nice)) |
3981 | return -EPERM; | 3981 | return -EPERM; |
3982 | 3982 | ||
3983 | retval = security_task_setnice(current, nice); | 3983 | retval = security_task_setnice(current, nice); |
3984 | if (retval) | 3984 | if (retval) |
3985 | return retval; | 3985 | return retval; |
3986 | 3986 | ||
3987 | set_user_nice(current, nice); | 3987 | set_user_nice(current, nice); |
3988 | return 0; | 3988 | return 0; |
3989 | } | 3989 | } |
3990 | 3990 | ||
3991 | #endif | 3991 | #endif |
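From user space the same path is reached through nice(2) (or setpriority(2)); a hedged user-space sketch:

    #include <unistd.h>
    #include <stdio.h>
    #include <errno.h>

    int main(void)
    {
            int prio;

            errno = 0;
            prio = nice(5);                    /* lower priority by 5 */
            if (prio == -1 && errno)
                    perror("nice");
            else
                    printf("new nice value: %d\n", prio);
            return 0;
    }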
3992 | 3992 | ||
3993 | /** | 3993 | /** |
3994 | * task_prio - return the priority value of a given task. | 3994 | * task_prio - return the priority value of a given task. |
3995 | * @p: the task in question. | 3995 | * @p: the task in question. |
3996 | * | 3996 | * |
3997 | * This is the priority value as seen by users in /proc. | 3997 | * This is the priority value as seen by users in /proc. |
3998 | * RT tasks are offset by -200. Normal tasks are centered | 3998 | * RT tasks are offset by -200. Normal tasks are centered |
3999 | * around 0, value goes from -16 to +15. | 3999 | * around 0, value goes from -16 to +15. |
4000 | */ | 4000 | */ |
4001 | int task_prio(const struct task_struct *p) | 4001 | int task_prio(const struct task_struct *p) |
4002 | { | 4002 | { |
4003 | return p->prio - MAX_RT_PRIO; | 4003 | return p->prio - MAX_RT_PRIO; |
4004 | } | 4004 | } |
4005 | 4005 | ||
4006 | /** | 4006 | /** |
4007 | * task_nice - return the nice value of a given task. | 4007 | * task_nice - return the nice value of a given task. |
4008 | * @p: the task in question. | 4008 | * @p: the task in question. |
4009 | */ | 4009 | */ |
4010 | int task_nice(const struct task_struct *p) | 4010 | int task_nice(const struct task_struct *p) |
4011 | { | 4011 | { |
4012 | return TASK_NICE(p); | 4012 | return TASK_NICE(p); |
4013 | } | 4013 | } |
4014 | EXPORT_SYMBOL_GPL(task_nice); | 4014 | EXPORT_SYMBOL_GPL(task_nice); |
4015 | 4015 | ||
4016 | /** | 4016 | /** |
4017 | * idle_cpu - is a given cpu idle currently? | 4017 | * idle_cpu - is a given cpu idle currently? |
4018 | * @cpu: the processor in question. | 4018 | * @cpu: the processor in question. |
4019 | */ | 4019 | */ |
4020 | int idle_cpu(int cpu) | 4020 | int idle_cpu(int cpu) |
4021 | { | 4021 | { |
4022 | return cpu_curr(cpu) == cpu_rq(cpu)->idle; | 4022 | return cpu_curr(cpu) == cpu_rq(cpu)->idle; |
4023 | } | 4023 | } |
4024 | 4024 | ||
4025 | /** | 4025 | /** |
4026 | * idle_task - return the idle task for a given cpu. | 4026 | * idle_task - return the idle task for a given cpu. |
4027 | * @cpu: the processor in question. | 4027 | * @cpu: the processor in question. |
4028 | */ | 4028 | */ |
4029 | struct task_struct *idle_task(int cpu) | 4029 | struct task_struct *idle_task(int cpu) |
4030 | { | 4030 | { |
4031 | return cpu_rq(cpu)->idle; | 4031 | return cpu_rq(cpu)->idle; |
4032 | } | 4032 | } |
4033 | 4033 | ||
4034 | /** | 4034 | /** |
4035 | * find_process_by_pid - find a process with a matching PID value. | 4035 | * find_process_by_pid - find a process with a matching PID value. |
4036 | * @pid: the pid in question. | 4036 | * @pid: the pid in question. |
4037 | */ | 4037 | */ |
4038 | static inline struct task_struct *find_process_by_pid(pid_t pid) | 4038 | static inline struct task_struct *find_process_by_pid(pid_t pid) |
4039 | { | 4039 | { |
4040 | return pid ? find_task_by_pid(pid) : current; | 4040 | return pid ? find_task_by_pid(pid) : current; |
4041 | } | 4041 | } |
4042 | 4042 | ||
4043 | /* Actually do priority change: must hold rq lock. */ | 4043 | /* Actually do priority change: must hold rq lock. */ |
4044 | static void __setscheduler(struct task_struct *p, int policy, int prio) | 4044 | static void __setscheduler(struct task_struct *p, int policy, int prio) |
4045 | { | 4045 | { |
4046 | BUG_ON(p->array); | 4046 | BUG_ON(p->array); |
4047 | 4047 | ||
4048 | p->policy = policy; | 4048 | p->policy = policy; |
4049 | p->rt_priority = prio; | 4049 | p->rt_priority = prio; |
4050 | p->normal_prio = normal_prio(p); | 4050 | p->normal_prio = normal_prio(p); |
4051 | /* we are holding p->pi_lock already */ | 4051 | /* we are holding p->pi_lock already */ |
4052 | p->prio = rt_mutex_getprio(p); | 4052 | p->prio = rt_mutex_getprio(p); |
4053 | /* | 4053 | /* |
4054 | * SCHED_BATCH tasks are treated as perpetual CPU hogs: | 4054 | * SCHED_BATCH tasks are treated as perpetual CPU hogs: |
4055 | */ | 4055 | */ |
4056 | if (policy == SCHED_BATCH) | 4056 | if (policy == SCHED_BATCH) |
4057 | p->sleep_avg = 0; | 4057 | p->sleep_avg = 0; |
4058 | set_load_weight(p); | 4058 | set_load_weight(p); |
4059 | } | 4059 | } |
4060 | 4060 | ||
4061 | /** | 4061 | /** |
4062 | * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. | 4062 | * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. |
4063 | * @p: the task in question. | 4063 | * @p: the task in question. |
4064 | * @policy: new policy. | 4064 | * @policy: new policy. |
4065 | * @param: structure containing the new RT priority. | 4065 | * @param: structure containing the new RT priority. |
4066 | * | 4066 | * |
4067 | * NOTE that the task may already be dead. | 4067 | * NOTE that the task may already be dead. |
4068 | */ | 4068 | */ |
4069 | int sched_setscheduler(struct task_struct *p, int policy, | 4069 | int sched_setscheduler(struct task_struct *p, int policy, |
4070 | struct sched_param *param) | 4070 | struct sched_param *param) |
4071 | { | 4071 | { |
4072 | int retval, oldprio, oldpolicy = -1; | 4072 | int retval, oldprio, oldpolicy = -1; |
4073 | struct prio_array *array; | 4073 | struct prio_array *array; |
4074 | unsigned long flags; | 4074 | unsigned long flags; |
4075 | struct rq *rq; | 4075 | struct rq *rq; |
4076 | 4076 | ||
4077 | /* may grab non-irq protected spin_locks */ | 4077 | /* may grab non-irq protected spin_locks */ |
4078 | BUG_ON(in_interrupt()); | 4078 | BUG_ON(in_interrupt()); |
4079 | recheck: | 4079 | recheck: |
4080 | /* double check policy once rq lock held */ | 4080 | /* double check policy once rq lock held */ |
4081 | if (policy < 0) | 4081 | if (policy < 0) |
4082 | policy = oldpolicy = p->policy; | 4082 | policy = oldpolicy = p->policy; |
4083 | else if (policy != SCHED_FIFO && policy != SCHED_RR && | 4083 | else if (policy != SCHED_FIFO && policy != SCHED_RR && |
4084 | policy != SCHED_NORMAL && policy != SCHED_BATCH) | 4084 | policy != SCHED_NORMAL && policy != SCHED_BATCH) |
4085 | return -EINVAL; | 4085 | return -EINVAL; |
4086 | /* | 4086 | /* |
4087 | * Valid priorities for SCHED_FIFO and SCHED_RR are | 4087 | * Valid priorities for SCHED_FIFO and SCHED_RR are |
4088 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and | 4088 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and |
4089 | * SCHED_BATCH is 0. | 4089 | * SCHED_BATCH is 0. |
4090 | */ | 4090 | */ |
4091 | if (param->sched_priority < 0 || | 4091 | if (param->sched_priority < 0 || |
4092 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || | 4092 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || |
4093 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) | 4093 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) |
4094 | return -EINVAL; | 4094 | return -EINVAL; |
4095 | if (is_rt_policy(policy) != (param->sched_priority != 0)) | 4095 | if (is_rt_policy(policy) != (param->sched_priority != 0)) |
4096 | return -EINVAL; | 4096 | return -EINVAL; |
4097 | 4097 | ||
4098 | /* | 4098 | /* |
4099 | * Allow unprivileged RT tasks to decrease priority: | 4099 | * Allow unprivileged RT tasks to decrease priority: |
4100 | */ | 4100 | */ |
4101 | if (!capable(CAP_SYS_NICE)) { | 4101 | if (!capable(CAP_SYS_NICE)) { |
4102 | if (is_rt_policy(policy)) { | 4102 | if (is_rt_policy(policy)) { |
4103 | unsigned long rlim_rtprio; | 4103 | unsigned long rlim_rtprio; |
4104 | unsigned long flags; | 4104 | unsigned long flags; |
4105 | 4105 | ||
4106 | if (!lock_task_sighand(p, &flags)) | 4106 | if (!lock_task_sighand(p, &flags)) |
4107 | return -ESRCH; | 4107 | return -ESRCH; |
4108 | rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; | 4108 | rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; |
4109 | unlock_task_sighand(p, &flags); | 4109 | unlock_task_sighand(p, &flags); |
4110 | 4110 | ||
4111 | /* can't set/change the rt policy */ | 4111 | /* can't set/change the rt policy */ |
4112 | if (policy != p->policy && !rlim_rtprio) | 4112 | if (policy != p->policy && !rlim_rtprio) |
4113 | return -EPERM; | 4113 | return -EPERM; |
4114 | 4114 | ||
4115 | /* can't increase priority */ | 4115 | /* can't increase priority */ |
4116 | if (param->sched_priority > p->rt_priority && | 4116 | if (param->sched_priority > p->rt_priority && |
4117 | param->sched_priority > rlim_rtprio) | 4117 | param->sched_priority > rlim_rtprio) |
4118 | return -EPERM; | 4118 | return -EPERM; |
4119 | } | 4119 | } |
4120 | 4120 | ||
4121 | /* can't change other user's priorities */ | 4121 | /* can't change other user's priorities */ |
4122 | if ((current->euid != p->euid) && | 4122 | if ((current->euid != p->euid) && |
4123 | (current->euid != p->uid)) | 4123 | (current->euid != p->uid)) |
4124 | return -EPERM; | 4124 | return -EPERM; |
4125 | } | 4125 | } |
4126 | 4126 | ||
4127 | retval = security_task_setscheduler(p, policy, param); | 4127 | retval = security_task_setscheduler(p, policy, param); |
4128 | if (retval) | 4128 | if (retval) |
4129 | return retval; | 4129 | return retval; |
4130 | /* | 4130 | /* |
4131 | * make sure no PI-waiters arrive (or leave) while we are | 4131 | * make sure no PI-waiters arrive (or leave) while we are |
4132 | * changing the priority of the task: | 4132 | * changing the priority of the task: |
4133 | */ | 4133 | */ |
4134 | spin_lock_irqsave(&p->pi_lock, flags); | 4134 | spin_lock_irqsave(&p->pi_lock, flags); |
4135 | /* | 4135 | /* |
4136 | * To be able to change p->policy safely, the appropriate | 4136 | * To be able to change p->policy safely, the appropriate |
4137 | * runqueue lock must be held. | 4137 | * runqueue lock must be held. |
4138 | */ | 4138 | */ |
4139 | rq = __task_rq_lock(p); | 4139 | rq = __task_rq_lock(p); |
4140 | /* recheck policy now with rq lock held */ | 4140 | /* recheck policy now with rq lock held */ |
4141 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 4141 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
4142 | policy = oldpolicy = -1; | 4142 | policy = oldpolicy = -1; |
4143 | __task_rq_unlock(rq); | 4143 | __task_rq_unlock(rq); |
4144 | spin_unlock_irqrestore(&p->pi_lock, flags); | 4144 | spin_unlock_irqrestore(&p->pi_lock, flags); |
4145 | goto recheck; | 4145 | goto recheck; |
4146 | } | 4146 | } |
4147 | array = p->array; | 4147 | array = p->array; |
4148 | if (array) | 4148 | if (array) |
4149 | deactivate_task(p, rq); | 4149 | deactivate_task(p, rq); |
4150 | oldprio = p->prio; | 4150 | oldprio = p->prio; |
4151 | __setscheduler(p, policy, param->sched_priority); | 4151 | __setscheduler(p, policy, param->sched_priority); |
4152 | if (array) { | 4152 | if (array) { |
4153 | __activate_task(p, rq); | 4153 | __activate_task(p, rq); |
4154 | /* | 4154 | /* |
4155 | * Reschedule if we are currently running on this runqueue and | 4155 | * Reschedule if we are currently running on this runqueue and |
4156 | * our priority decreased, or if we are not currently running on | 4156 | * our priority decreased, or if we are not currently running on |
4157 | * this runqueue and our priority is higher than the current's | 4157 | * this runqueue and our priority is higher than the current's |
4158 | */ | 4158 | */ |
4159 | if (task_running(rq, p)) { | 4159 | if (task_running(rq, p)) { |
4160 | if (p->prio > oldprio) | 4160 | if (p->prio > oldprio) |
4161 | resched_task(rq->curr); | 4161 | resched_task(rq->curr); |
4162 | } else if (TASK_PREEMPTS_CURR(p, rq)) | 4162 | } else if (TASK_PREEMPTS_CURR(p, rq)) |
4163 | resched_task(rq->curr); | 4163 | resched_task(rq->curr); |
4164 | } | 4164 | } |
4165 | __task_rq_unlock(rq); | 4165 | __task_rq_unlock(rq); |
4166 | spin_unlock_irqrestore(&p->pi_lock, flags); | 4166 | spin_unlock_irqrestore(&p->pi_lock, flags); |
4167 | 4167 | ||
4168 | rt_mutex_adjust_pi(p); | 4168 | rt_mutex_adjust_pi(p); |
4169 | 4169 | ||
4170 | return 0; | 4170 | return 0; |
4171 | } | 4171 | } |
4172 | EXPORT_SYMBOL_GPL(sched_setscheduler); | 4172 | EXPORT_SYMBOL_GPL(sched_setscheduler); |
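Kernel code (typically a driver's latency-sensitive kthread) calls sched_setscheduler() directly on itself; a hedged sketch:

    static int my_make_rt(void)
    {
            struct sched_param param = { .sched_priority = 50 };

            /* Kernel threads carry CAP_SYS_NICE, so the permission checks
             * above pass; returns 0 on success or a negative errno. */
            return sched_setscheduler(current, SCHED_FIFO, &param);
    }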
4173 | 4173 | ||
4174 | static int | 4174 | static int |
4175 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | 4175 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) |
4176 | { | 4176 | { |
4177 | struct sched_param lparam; | 4177 | struct sched_param lparam; |
4178 | struct task_struct *p; | 4178 | struct task_struct *p; |
4179 | int retval; | 4179 | int retval; |
4180 | 4180 | ||
4181 | if (!param || pid < 0) | 4181 | if (!param || pid < 0) |
4182 | return -EINVAL; | 4182 | return -EINVAL; |
4183 | if (copy_from_user(&lparam, param, sizeof(struct sched_param))) | 4183 | if (copy_from_user(&lparam, param, sizeof(struct sched_param))) |
4184 | return -EFAULT; | 4184 | return -EFAULT; |
4185 | 4185 | ||
4186 | rcu_read_lock(); | 4186 | rcu_read_lock(); |
4187 | retval = -ESRCH; | 4187 | retval = -ESRCH; |
4188 | p = find_process_by_pid(pid); | 4188 | p = find_process_by_pid(pid); |
4189 | if (p != NULL) | 4189 | if (p != NULL) |
4190 | retval = sched_setscheduler(p, policy, &lparam); | 4190 | retval = sched_setscheduler(p, policy, &lparam); |
4191 | rcu_read_unlock(); | 4191 | rcu_read_unlock(); |
4192 | 4192 | ||
4193 | return retval; | 4193 | return retval; |
4194 | } | 4194 | } |
4195 | 4195 | ||
4196 | /** | 4196 | /** |
4197 | * sys_sched_setscheduler - set/change the scheduler policy and RT priority | 4197 | * sys_sched_setscheduler - set/change the scheduler policy and RT priority |
4198 | * @pid: the pid in question. | 4198 | * @pid: the pid in question. |
4199 | * @policy: new policy. | 4199 | * @policy: new policy. |
4200 | * @param: structure containing the new RT priority. | 4200 | * @param: structure containing the new RT priority. |
4201 | */ | 4201 | */ |
4202 | asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, | 4202 | asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, |
4203 | struct sched_param __user *param) | 4203 | struct sched_param __user *param) |
4204 | { | 4204 | { |
4205 | /* negative values for policy are not valid */ | 4205 | /* negative values for policy are not valid */ |
4206 | if (policy < 0) | 4206 | if (policy < 0) |
4207 | return -EINVAL; | 4207 | return -EINVAL; |
4208 | 4208 | ||
4209 | return do_sched_setscheduler(pid, policy, param); | 4209 | return do_sched_setscheduler(pid, policy, param); |
4210 | } | 4210 | } |
4211 | 4211 | ||
4212 | /** | 4212 | /** |
4213 | * sys_sched_setparam - set/change the RT priority of a thread | 4213 | * sys_sched_setparam - set/change the RT priority of a thread |
4214 | * @pid: the pid in question. | 4214 | * @pid: the pid in question. |
4215 | * @param: structure containing the new RT priority. | 4215 | * @param: structure containing the new RT priority. |
4216 | */ | 4216 | */ |
4217 | asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) | 4217 | asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) |
4218 | { | 4218 | { |
4219 | return do_sched_setscheduler(pid, -1, param); | 4219 | return do_sched_setscheduler(pid, -1, param); |
4220 | } | 4220 | } |
4221 | 4221 | ||
4222 | /** | 4222 | /** |
4223 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread | 4223 | * sys_sched_getscheduler - get the policy (scheduling class) of a thread |
4224 | * @pid: the pid in question. | 4224 | * @pid: the pid in question. |
4225 | */ | 4225 | */ |
4226 | asmlinkage long sys_sched_getscheduler(pid_t pid) | 4226 | asmlinkage long sys_sched_getscheduler(pid_t pid) |
4227 | { | 4227 | { |
4228 | struct task_struct *p; | 4228 | struct task_struct *p; |
4229 | int retval = -EINVAL; | 4229 | int retval = -EINVAL; |
4230 | 4230 | ||
4231 | if (pid < 0) | 4231 | if (pid < 0) |
4232 | goto out_nounlock; | 4232 | goto out_nounlock; |
4233 | 4233 | ||
4234 | retval = -ESRCH; | 4234 | retval = -ESRCH; |
4235 | read_lock(&tasklist_lock); | 4235 | read_lock(&tasklist_lock); |
4236 | p = find_process_by_pid(pid); | 4236 | p = find_process_by_pid(pid); |
4237 | if (p) { | 4237 | if (p) { |
4238 | retval = security_task_getscheduler(p); | 4238 | retval = security_task_getscheduler(p); |
4239 | if (!retval) | 4239 | if (!retval) |
4240 | retval = p->policy; | 4240 | retval = p->policy; |
4241 | } | 4241 | } |
4242 | read_unlock(&tasklist_lock); | 4242 | read_unlock(&tasklist_lock); |
4243 | 4243 | ||
4244 | out_nounlock: | 4244 | out_nounlock: |
4245 | return retval; | 4245 | return retval; |
4246 | } | 4246 | } |
4247 | 4247 | ||
4248 | /** | 4248 | /** |
4249 | * sys_sched_getparam - get the RT priority of a thread | 4249 | * sys_sched_getparam - get the RT priority of a thread |
4250 | * @pid: the pid in question. | 4250 | * @pid: the pid in question. |
4251 | * @param: structure containing the RT priority. | 4251 | * @param: structure containing the RT priority. |
4252 | */ | 4252 | */ |
4253 | asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) | 4253 | asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) |
4254 | { | 4254 | { |
4255 | struct sched_param lp; | 4255 | struct sched_param lp; |
4256 | struct task_struct *p; | 4256 | struct task_struct *p; |
4257 | int retval = -EINVAL; | 4257 | int retval = -EINVAL; |
4258 | 4258 | ||
4259 | if (!param || pid < 0) | 4259 | if (!param || pid < 0) |
4260 | goto out_nounlock; | 4260 | goto out_nounlock; |
4261 | 4261 | ||
4262 | read_lock(&tasklist_lock); | 4262 | read_lock(&tasklist_lock); |
4263 | p = find_process_by_pid(pid); | 4263 | p = find_process_by_pid(pid); |
4264 | retval = -ESRCH; | 4264 | retval = -ESRCH; |
4265 | if (!p) | 4265 | if (!p) |
4266 | goto out_unlock; | 4266 | goto out_unlock; |
4267 | 4267 | ||
4268 | retval = security_task_getscheduler(p); | 4268 | retval = security_task_getscheduler(p); |
4269 | if (retval) | 4269 | if (retval) |
4270 | goto out_unlock; | 4270 | goto out_unlock; |
4271 | 4271 | ||
4272 | lp.sched_priority = p->rt_priority; | 4272 | lp.sched_priority = p->rt_priority; |
4273 | read_unlock(&tasklist_lock); | 4273 | read_unlock(&tasklist_lock); |
4274 | 4274 | ||
4275 | /* | 4275 | /* |
4276 | * This one might sleep, we cannot do it with a spinlock held ... | 4276 | * This one might sleep, we cannot do it with a spinlock held ... |
4277 | */ | 4277 | */ |
4278 | retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; | 4278 | retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; |
4279 | 4279 | ||
4280 | out_nounlock: | 4280 | out_nounlock: |
4281 | return retval; | 4281 | return retval; |
4282 | 4282 | ||
4283 | out_unlock: | 4283 | out_unlock: |
4284 | read_unlock(&tasklist_lock); | 4284 | read_unlock(&tasklist_lock); |
4285 | return retval; | 4285 | return retval; |
4286 | } | 4286 | } |
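From user space these syscalls are reached through the POSIX wrappers; a hedged sketch that switches the calling process to SCHED_RR and reads the priority back:

    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
            struct sched_param sp = { .sched_priority = 10 };

            if (sched_setscheduler(0, SCHED_RR, &sp))   /* pid 0 == self */
                    perror("sched_setscheduler");

            if (sched_getparam(0, &sp) == 0)
                    printf("rt priority now %d\n", sp.sched_priority);
            return 0;
    }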
4287 | 4287 | ||
4288 | long sched_setaffinity(pid_t pid, cpumask_t new_mask) | 4288 | long sched_setaffinity(pid_t pid, cpumask_t new_mask) |
4289 | { | 4289 | { |
4290 | cpumask_t cpus_allowed; | 4290 | cpumask_t cpus_allowed; |
4291 | struct task_struct *p; | 4291 | struct task_struct *p; |
4292 | int retval; | 4292 | int retval; |
4293 | 4293 | ||
4294 | lock_cpu_hotplug(); | 4294 | lock_cpu_hotplug(); |
4295 | read_lock(&tasklist_lock); | 4295 | read_lock(&tasklist_lock); |
4296 | 4296 | ||
4297 | p = find_process_by_pid(pid); | 4297 | p = find_process_by_pid(pid); |
4298 | if (!p) { | 4298 | if (!p) { |
4299 | read_unlock(&tasklist_lock); | 4299 | read_unlock(&tasklist_lock); |
4300 | unlock_cpu_hotplug(); | 4300 | unlock_cpu_hotplug(); |
4301 | return -ESRCH; | 4301 | return -ESRCH; |
4302 | } | 4302 | } |
4303 | 4303 | ||
4304 | /* | 4304 | /* |
4305 | * It is not safe to call set_cpus_allowed with the | 4305 | * It is not safe to call set_cpus_allowed with the |
4306 | * tasklist_lock held. We will bump the task_struct's | 4306 | * tasklist_lock held. We will bump the task_struct's |
4307 | * usage count and then drop tasklist_lock. | 4307 | * usage count and then drop tasklist_lock. |
4308 | */ | 4308 | */ |
4309 | get_task_struct(p); | 4309 | get_task_struct(p); |
4310 | read_unlock(&tasklist_lock); | 4310 | read_unlock(&tasklist_lock); |
4311 | 4311 | ||
4312 | retval = -EPERM; | 4312 | retval = -EPERM; |
4313 | if ((current->euid != p->euid) && (current->euid != p->uid) && | 4313 | if ((current->euid != p->euid) && (current->euid != p->uid) && |
4314 | !capable(CAP_SYS_NICE)) | 4314 | !capable(CAP_SYS_NICE)) |
4315 | goto out_unlock; | 4315 | goto out_unlock; |
4316 | 4316 | ||
4317 | retval = security_task_setscheduler(p, 0, NULL); | 4317 | retval = security_task_setscheduler(p, 0, NULL); |
4318 | if (retval) | 4318 | if (retval) |
4319 | goto out_unlock; | 4319 | goto out_unlock; |
4320 | 4320 | ||
4321 | cpus_allowed = cpuset_cpus_allowed(p); | 4321 | cpus_allowed = cpuset_cpus_allowed(p); |
4322 | cpus_and(new_mask, new_mask, cpus_allowed); | 4322 | cpus_and(new_mask, new_mask, cpus_allowed); |
4323 | retval = set_cpus_allowed(p, new_mask); | 4323 | retval = set_cpus_allowed(p, new_mask); |
4324 | 4324 | ||
4325 | out_unlock: | 4325 | out_unlock: |
4326 | put_task_struct(p); | 4326 | put_task_struct(p); |
4327 | unlock_cpu_hotplug(); | 4327 | unlock_cpu_hotplug(); |
4328 | return retval; | 4328 | return retval; |
4329 | } | 4329 | } |
4330 | 4330 | ||
4331 | static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, | 4331 | static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, |
4332 | cpumask_t *new_mask) | 4332 | cpumask_t *new_mask) |
4333 | { | 4333 | { |
4334 | if (len < sizeof(cpumask_t)) { | 4334 | if (len < sizeof(cpumask_t)) { |
4335 | memset(new_mask, 0, sizeof(cpumask_t)); | 4335 | memset(new_mask, 0, sizeof(cpumask_t)); |
4336 | } else if (len > sizeof(cpumask_t)) { | 4336 | } else if (len > sizeof(cpumask_t)) { |
4337 | len = sizeof(cpumask_t); | 4337 | len = sizeof(cpumask_t); |
4338 | } | 4338 | } |
4339 | return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; | 4339 | return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; |
4340 | } | 4340 | } |
4341 | 4341 | ||
4342 | /** | 4342 | /** |
4343 | * sys_sched_setaffinity - set the cpu affinity of a process | 4343 | * sys_sched_setaffinity - set the cpu affinity of a process |
4344 | * @pid: pid of the process | 4344 | * @pid: pid of the process |
4345 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 4345 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
4346 | * @user_mask_ptr: user-space pointer to the new cpu mask | 4346 | * @user_mask_ptr: user-space pointer to the new cpu mask |
4347 | */ | 4347 | */ |
4348 | asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, | 4348 | asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, |
4349 | unsigned long __user *user_mask_ptr) | 4349 | unsigned long __user *user_mask_ptr) |
4350 | { | 4350 | { |
4351 | cpumask_t new_mask; | 4351 | cpumask_t new_mask; |
4352 | int retval; | 4352 | int retval; |
4353 | 4353 | ||
4354 | retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); | 4354 | retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); |
4355 | if (retval) | 4355 | if (retval) |
4356 | return retval; | 4356 | return retval; |
4357 | 4357 | ||
4358 | return sched_setaffinity(pid, new_mask); | 4358 | return sched_setaffinity(pid, new_mask); |
4359 | } | 4359 | } |
4360 | 4360 | ||
4361 | /* | 4361 | /* |
4362 | * Represents all cpu's present in the system | 4362 | * Represents all cpu's present in the system |
4363 | * In systems capable of hotplug, this map could dynamically grow | 4363 | * In systems capable of hotplug, this map could dynamically grow |
4364 | * as new cpu's are detected in the system via any platform specific | 4364 | * as new cpu's are detected in the system via any platform specific |
4365 | * method, such as ACPI for example. | 4365 | * method, such as ACPI for example. |
4366 | */ | 4366 | */ |
4367 | 4367 | ||
4368 | cpumask_t cpu_present_map __read_mostly; | 4368 | cpumask_t cpu_present_map __read_mostly; |
4369 | EXPORT_SYMBOL(cpu_present_map); | 4369 | EXPORT_SYMBOL(cpu_present_map); |
4370 | 4370 | ||
4371 | #ifndef CONFIG_SMP | 4371 | #ifndef CONFIG_SMP |
4372 | cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; | 4372 | cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; |
4373 | EXPORT_SYMBOL(cpu_online_map); | 4373 | EXPORT_SYMBOL(cpu_online_map); |
4374 | 4374 | ||
4375 | cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; | 4375 | cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; |
4376 | EXPORT_SYMBOL(cpu_possible_map); | 4376 | EXPORT_SYMBOL(cpu_possible_map); |
4377 | #endif | 4377 | #endif |
4378 | 4378 | ||
4379 | long sched_getaffinity(pid_t pid, cpumask_t *mask) | 4379 | long sched_getaffinity(pid_t pid, cpumask_t *mask) |
4380 | { | 4380 | { |
4381 | struct task_struct *p; | 4381 | struct task_struct *p; |
4382 | int retval; | 4382 | int retval; |
4383 | 4383 | ||
4384 | lock_cpu_hotplug(); | 4384 | lock_cpu_hotplug(); |
4385 | read_lock(&tasklist_lock); | 4385 | read_lock(&tasklist_lock); |
4386 | 4386 | ||
4387 | retval = -ESRCH; | 4387 | retval = -ESRCH; |
4388 | p = find_process_by_pid(pid); | 4388 | p = find_process_by_pid(pid); |
4389 | if (!p) | 4389 | if (!p) |
4390 | goto out_unlock; | 4390 | goto out_unlock; |
4391 | 4391 | ||
4392 | retval = security_task_getscheduler(p); | 4392 | retval = security_task_getscheduler(p); |
4393 | if (retval) | 4393 | if (retval) |
4394 | goto out_unlock; | 4394 | goto out_unlock; |
4395 | 4395 | ||
4396 | cpus_and(*mask, p->cpus_allowed, cpu_online_map); | 4396 | cpus_and(*mask, p->cpus_allowed, cpu_online_map); |
4397 | 4397 | ||
4398 | out_unlock: | 4398 | out_unlock: |
4399 | read_unlock(&tasklist_lock); | 4399 | read_unlock(&tasklist_lock); |
4400 | unlock_cpu_hotplug(); | 4400 | unlock_cpu_hotplug(); |
4401 | if (retval) | 4401 | if (retval) |
4402 | return retval; | 4402 | return retval; |
4403 | 4403 | ||
4404 | return 0; | 4404 | return 0; |
4405 | } | 4405 | } |
4406 | 4406 | ||
4407 | /** | 4407 | /** |
4408 | * sys_sched_getaffinity - get the cpu affinity of a process | 4408 | * sys_sched_getaffinity - get the cpu affinity of a process |
4409 | * @pid: pid of the process | 4409 | * @pid: pid of the process |
4410 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr | 4410 | * @len: length in bytes of the bitmask pointed to by user_mask_ptr |
4411 | * @user_mask_ptr: user-space pointer to hold the current cpu mask | 4411 | * @user_mask_ptr: user-space pointer to hold the current cpu mask |
4412 | */ | 4412 | */ |
4413 | asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, | 4413 | asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, |
4414 | unsigned long __user *user_mask_ptr) | 4414 | unsigned long __user *user_mask_ptr) |
4415 | { | 4415 | { |
4416 | int ret; | 4416 | int ret; |
4417 | cpumask_t mask; | 4417 | cpumask_t mask; |
4418 | 4418 | ||
4419 | if (len < sizeof(cpumask_t)) | 4419 | if (len < sizeof(cpumask_t)) |
4420 | return -EINVAL; | 4420 | return -EINVAL; |
4421 | 4421 | ||
4422 | ret = sched_getaffinity(pid, &mask); | 4422 | ret = sched_getaffinity(pid, &mask); |
4423 | if (ret < 0) | 4423 | if (ret < 0) |
4424 | return ret; | 4424 | return ret; |
4425 | 4425 | ||
4426 | if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) | 4426 | if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) |
4427 | return -EFAULT; | 4427 | return -EFAULT; |
4428 | 4428 | ||
4429 | return sizeof(cpumask_t); | 4429 | return sizeof(cpumask_t); |
4430 | } | 4430 | } |
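
The two affinity syscalls above are normally reached through the glibc wrappers. A minimal user-space sketch (illustrative only, not part of this diff), assuming _GNU_SOURCE and the cpu_set_t helpers from <sched.h>:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		cpu_set_t set;

		CPU_ZERO(&set);
		CPU_SET(0, &set);		/* ask to run only on CPU 0 */
		if (sched_setaffinity(0, sizeof(set), &set))
			perror("sched_setaffinity");

		CPU_ZERO(&set);
		if (sched_getaffinity(0, sizeof(set), &set))
			perror("sched_getaffinity");
		else
			printf("CPU 0 allowed: %d\n", CPU_ISSET(0, &set));
		return 0;
	}
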
4431 | 4431 | ||
4432 | /** | 4432 | /** |
4433 | * sys_sched_yield - yield the current processor to other threads. | 4433 | * sys_sched_yield - yield the current processor to other threads. |
4434 | * | 4434 | * |
4435 | * This function yields the current CPU by moving the calling thread | 4435 | * This function yields the current CPU by moving the calling thread |
4436 | * to the expired array. If there are no other threads running on this | 4436 | * to the expired array. If there are no other threads running on this |
4437 | * CPU then this function will return. | 4437 | * CPU then this function will return. |
4438 | */ | 4438 | */ |
4439 | asmlinkage long sys_sched_yield(void) | 4439 | asmlinkage long sys_sched_yield(void) |
4440 | { | 4440 | { |
4441 | struct rq *rq = this_rq_lock(); | 4441 | struct rq *rq = this_rq_lock(); |
4442 | struct prio_array *array = current->array, *target = rq->expired; | 4442 | struct prio_array *array = current->array, *target = rq->expired; |
4443 | 4443 | ||
4444 | schedstat_inc(rq, yld_cnt); | 4444 | schedstat_inc(rq, yld_cnt); |
4445 | /* | 4445 | /* |
4446 | * We implement yielding by moving the task into the expired | 4446 | * We implement yielding by moving the task into the expired |
4447 | * queue. | 4447 | * queue. |
4448 | * | 4448 | * |
4449 | * (special rule: RT tasks will just roundrobin in the active | 4449 | * (special rule: RT tasks will just roundrobin in the active |
4450 | * array.) | 4450 | * array.) |
4451 | */ | 4451 | */ |
4452 | if (rt_task(current)) | 4452 | if (rt_task(current)) |
4453 | target = rq->active; | 4453 | target = rq->active; |
4454 | 4454 | ||
4455 | if (array->nr_active == 1) { | 4455 | if (array->nr_active == 1) { |
4456 | schedstat_inc(rq, yld_act_empty); | 4456 | schedstat_inc(rq, yld_act_empty); |
4457 | if (!rq->expired->nr_active) | 4457 | if (!rq->expired->nr_active) |
4458 | schedstat_inc(rq, yld_both_empty); | 4458 | schedstat_inc(rq, yld_both_empty); |
4459 | } else if (!rq->expired->nr_active) | 4459 | } else if (!rq->expired->nr_active) |
4460 | schedstat_inc(rq, yld_exp_empty); | 4460 | schedstat_inc(rq, yld_exp_empty); |
4461 | 4461 | ||
4462 | if (array != target) { | 4462 | if (array != target) { |
4463 | dequeue_task(current, array); | 4463 | dequeue_task(current, array); |
4464 | enqueue_task(current, target); | 4464 | enqueue_task(current, target); |
4465 | } else | 4465 | } else |
4466 | /* | 4466 | /* |
4467 | * requeue_task is cheaper so perform that if possible. | 4467 | * requeue_task is cheaper so perform that if possible. |
4468 | */ | 4468 | */ |
4469 | requeue_task(current, array); | 4469 | requeue_task(current, array); |
4470 | 4470 | ||
4471 | /* | 4471 | /* |
4472 | * Since we are going to call schedule() anyway, there's | 4472 | * Since we are going to call schedule() anyway, there's |
4473 | * no need to preempt or enable interrupts: | 4473 | * no need to preempt or enable interrupts: |
4474 | */ | 4474 | */ |
4475 | __release(rq->lock); | 4475 | __release(rq->lock); |
4476 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); | 4476 | spin_release(&rq->lock.dep_map, 1, _THIS_IP_); |
4477 | _raw_spin_unlock(&rq->lock); | 4477 | _raw_spin_unlock(&rq->lock); |
4478 | preempt_enable_no_resched(); | 4478 | preempt_enable_no_resched(); |
4479 | 4479 | ||
4480 | schedule(); | 4480 | schedule(); |
4481 | 4481 | ||
4482 | return 0; | 4482 | return 0; |
4483 | } | 4483 | } |
4484 | 4484 | ||
4485 | static void __cond_resched(void) | 4485 | static void __cond_resched(void) |
4486 | { | 4486 | { |
4487 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 4487 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
4488 | __might_sleep(__FILE__, __LINE__); | 4488 | __might_sleep(__FILE__, __LINE__); |
4489 | #endif | 4489 | #endif |
4490 | /* | 4490 | /* |
4491 | * The BKS might be reacquired before we have dropped | 4491 | * The BKS might be reacquired before we have dropped |
4492 | * PREEMPT_ACTIVE, which could trigger a second | 4492 | * PREEMPT_ACTIVE, which could trigger a second |
4493 | * cond_resched() call. | 4493 | * cond_resched() call. |
4494 | */ | 4494 | */ |
4495 | do { | 4495 | do { |
4496 | add_preempt_count(PREEMPT_ACTIVE); | 4496 | add_preempt_count(PREEMPT_ACTIVE); |
4497 | schedule(); | 4497 | schedule(); |
4498 | sub_preempt_count(PREEMPT_ACTIVE); | 4498 | sub_preempt_count(PREEMPT_ACTIVE); |
4499 | } while (need_resched()); | 4499 | } while (need_resched()); |
4500 | } | 4500 | } |
4501 | 4501 | ||
4502 | int __sched cond_resched(void) | 4502 | int __sched cond_resched(void) |
4503 | { | 4503 | { |
4504 | if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && | 4504 | if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && |
4505 | system_state == SYSTEM_RUNNING) { | 4505 | system_state == SYSTEM_RUNNING) { |
4506 | __cond_resched(); | 4506 | __cond_resched(); |
4507 | return 1; | 4507 | return 1; |
4508 | } | 4508 | } |
4509 | return 0; | 4509 | return 0; |
4510 | } | 4510 | } |
4511 | EXPORT_SYMBOL(cond_resched); | 4511 | EXPORT_SYMBOL(cond_resched); |
4512 | 4512 | ||
4513 | /* | 4513 | /* |
4514 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, | 4514 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, |
4515 | * call schedule, and on return reacquire the lock. | 4515 | * call schedule, and on return reacquire the lock. |
4516 | * | 4516 | * |
4517 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level | 4517 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level |
4518 | * operations here to prevent schedule() from being called twice (once via | 4518 | * operations here to prevent schedule() from being called twice (once via |
4519 | * spin_unlock(), once by hand). | 4519 | * spin_unlock(), once by hand). |
4520 | */ | 4520 | */ |
4521 | int cond_resched_lock(spinlock_t *lock) | 4521 | int cond_resched_lock(spinlock_t *lock) |
4522 | { | 4522 | { |
4523 | int ret = 0; | 4523 | int ret = 0; |
4524 | 4524 | ||
4525 | if (need_lockbreak(lock)) { | 4525 | if (need_lockbreak(lock)) { |
4526 | spin_unlock(lock); | 4526 | spin_unlock(lock); |
4527 | cpu_relax(); | 4527 | cpu_relax(); |
4528 | ret = 1; | 4528 | ret = 1; |
4529 | spin_lock(lock); | 4529 | spin_lock(lock); |
4530 | } | 4530 | } |
4531 | if (need_resched() && system_state == SYSTEM_RUNNING) { | 4531 | if (need_resched() && system_state == SYSTEM_RUNNING) { |
4532 | spin_release(&lock->dep_map, 1, _THIS_IP_); | 4532 | spin_release(&lock->dep_map, 1, _THIS_IP_); |
4533 | _raw_spin_unlock(lock); | 4533 | _raw_spin_unlock(lock); |
4534 | preempt_enable_no_resched(); | 4534 | preempt_enable_no_resched(); |
4535 | __cond_resched(); | 4535 | __cond_resched(); |
4536 | ret = 1; | 4536 | ret = 1; |
4537 | spin_lock(lock); | 4537 | spin_lock(lock); |
4538 | } | 4538 | } |
4539 | return ret; | 4539 | return ret; |
4540 | } | 4540 | } |
4541 | EXPORT_SYMBOL(cond_resched_lock); | 4541 | EXPORT_SYMBOL(cond_resched_lock); |
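
To make the intended use of cond_resched_lock() concrete, here is a hypothetical in-kernel sketch (struct my_entry and process_entry() are invented for illustration): a long scan under a spinlock that voluntarily drops the lock when a reschedule or lock contention is pending.

	static void scan_table(struct my_entry *tbl, int n, spinlock_t *lock)
	{
		int i;

		spin_lock(lock);
		for (i = 0; i < n; i++) {
			process_entry(&tbl[i]);	/* hypothetical per-entry work */
			/*
			 * May drop and retake 'lock' if a reschedule is
			 * pending; the index-based walk stays valid across
			 * the drop.
			 */
			cond_resched_lock(lock);
		}
		spin_unlock(lock);
	}
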
4542 | 4542 | ||
4543 | int __sched cond_resched_softirq(void) | 4543 | int __sched cond_resched_softirq(void) |
4544 | { | 4544 | { |
4545 | BUG_ON(!in_softirq()); | 4545 | BUG_ON(!in_softirq()); |
4546 | 4546 | ||
4547 | if (need_resched() && system_state == SYSTEM_RUNNING) { | 4547 | if (need_resched() && system_state == SYSTEM_RUNNING) { |
4548 | raw_local_irq_disable(); | 4548 | raw_local_irq_disable(); |
4549 | _local_bh_enable(); | 4549 | _local_bh_enable(); |
4550 | raw_local_irq_enable(); | 4550 | raw_local_irq_enable(); |
4551 | __cond_resched(); | 4551 | __cond_resched(); |
4552 | local_bh_disable(); | 4552 | local_bh_disable(); |
4553 | return 1; | 4553 | return 1; |
4554 | } | 4554 | } |
4555 | return 0; | 4555 | return 0; |
4556 | } | 4556 | } |
4557 | EXPORT_SYMBOL(cond_resched_softirq); | 4557 | EXPORT_SYMBOL(cond_resched_softirq); |
4558 | 4558 | ||
4559 | /** | 4559 | /** |
4560 | * yield - yield the current processor to other threads. | 4560 | * yield - yield the current processor to other threads. |
4561 | * | 4561 | * |
4562 | * This is a shortcut for kernel-space yielding - it marks the | 4562 | * This is a shortcut for kernel-space yielding - it marks the |
4563 | * thread runnable and calls sys_sched_yield(). | 4563 | * thread runnable and calls sys_sched_yield(). |
4564 | */ | 4564 | */ |
4565 | void __sched yield(void) | 4565 | void __sched yield(void) |
4566 | { | 4566 | { |
4567 | set_current_state(TASK_RUNNING); | 4567 | set_current_state(TASK_RUNNING); |
4568 | sys_sched_yield(); | 4568 | sys_sched_yield(); |
4569 | } | 4569 | } |
4570 | EXPORT_SYMBOL(yield); | 4570 | EXPORT_SYMBOL(yield); |
4571 | 4571 | ||
4572 | /* | 4572 | /* |
4573 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so | 4573 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so |
4574 | * that process accounting knows that this is a task in IO wait state. | 4574 | * that process accounting knows that this is a task in IO wait state. |
4575 | * | 4575 | * |
4576 | * But don't do that if it is a deliberate, throttling IO wait (this task | 4576 | * But don't do that if it is a deliberate, throttling IO wait (this task |
4577 | * has set its backing_dev_info: the queue against which it should throttle) | 4577 | * has set its backing_dev_info: the queue against which it should throttle) |
4578 | */ | 4578 | */ |
4579 | void __sched io_schedule(void) | 4579 | void __sched io_schedule(void) |
4580 | { | 4580 | { |
4581 | struct rq *rq = &__raw_get_cpu_var(runqueues); | 4581 | struct rq *rq = &__raw_get_cpu_var(runqueues); |
4582 | 4582 | ||
4583 | delayacct_blkio_start(); | 4583 | delayacct_blkio_start(); |
4584 | atomic_inc(&rq->nr_iowait); | 4584 | atomic_inc(&rq->nr_iowait); |
4585 | schedule(); | 4585 | schedule(); |
4586 | atomic_dec(&rq->nr_iowait); | 4586 | atomic_dec(&rq->nr_iowait); |
4587 | delayacct_blkio_end(); | 4587 | delayacct_blkio_end(); |
4588 | } | 4588 | } |
4589 | EXPORT_SYMBOL(io_schedule); | 4589 | EXPORT_SYMBOL(io_schedule); |
4590 | 4590 | ||
4591 | long __sched io_schedule_timeout(long timeout) | 4591 | long __sched io_schedule_timeout(long timeout) |
4592 | { | 4592 | { |
4593 | struct rq *rq = &__raw_get_cpu_var(runqueues); | 4593 | struct rq *rq = &__raw_get_cpu_var(runqueues); |
4594 | long ret; | 4594 | long ret; |
4595 | 4595 | ||
4596 | delayacct_blkio_start(); | 4596 | delayacct_blkio_start(); |
4597 | atomic_inc(&rq->nr_iowait); | 4597 | atomic_inc(&rq->nr_iowait); |
4598 | ret = schedule_timeout(timeout); | 4598 | ret = schedule_timeout(timeout); |
4599 | atomic_dec(&rq->nr_iowait); | 4599 | atomic_dec(&rq->nr_iowait); |
4600 | delayacct_blkio_end(); | 4600 | delayacct_blkio_end(); |
4601 | return ret; | 4601 | return ret; |
4602 | } | 4602 | } |
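
A hedged sketch of the usual caller pattern for io_schedule(): a wait loop that parks in TASK_UNINTERRUPTIBLE so the sleep is accounted as iowait (struct my_request, its waitq and its done flag are hypothetical):

	static void wait_for_request(struct my_request *req)
	{
		DEFINE_WAIT(wait);

		for (;;) {
			prepare_to_wait(&req->waitq, &wait, TASK_UNINTERRUPTIBLE);
			if (req->done)
				break;
			io_schedule();		/* counted in rq->nr_iowait */
		}
		finish_wait(&req->waitq, &wait);
	}
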
4603 | 4603 | ||
4604 | /** | 4604 | /** |
4605 | * sys_sched_get_priority_max - return maximum RT priority. | 4605 | * sys_sched_get_priority_max - return maximum RT priority. |
4606 | * @policy: scheduling class. | 4606 | * @policy: scheduling class. |
4607 | * | 4607 | * |
4608 | * this syscall returns the maximum rt_priority that can be used | 4608 | * this syscall returns the maximum rt_priority that can be used |
4609 | * by a given scheduling class. | 4609 | * by a given scheduling class. |
4610 | */ | 4610 | */ |
4611 | asmlinkage long sys_sched_get_priority_max(int policy) | 4611 | asmlinkage long sys_sched_get_priority_max(int policy) |
4612 | { | 4612 | { |
4613 | int ret = -EINVAL; | 4613 | int ret = -EINVAL; |
4614 | 4614 | ||
4615 | switch (policy) { | 4615 | switch (policy) { |
4616 | case SCHED_FIFO: | 4616 | case SCHED_FIFO: |
4617 | case SCHED_RR: | 4617 | case SCHED_RR: |
4618 | ret = MAX_USER_RT_PRIO-1; | 4618 | ret = MAX_USER_RT_PRIO-1; |
4619 | break; | 4619 | break; |
4620 | case SCHED_NORMAL: | 4620 | case SCHED_NORMAL: |
4621 | case SCHED_BATCH: | 4621 | case SCHED_BATCH: |
4622 | ret = 0; | 4622 | ret = 0; |
4623 | break; | 4623 | break; |
4624 | } | 4624 | } |
4625 | return ret; | 4625 | return ret; |
4626 | } | 4626 | } |
4627 | 4627 | ||
4628 | /** | 4628 | /** |
4629 | * sys_sched_get_priority_min - return minimum RT priority. | 4629 | * sys_sched_get_priority_min - return minimum RT priority. |
4630 | * @policy: scheduling class. | 4630 | * @policy: scheduling class. |
4631 | * | 4631 | * |
4632 | * this syscall returns the minimum rt_priority that can be used | 4632 | * this syscall returns the minimum rt_priority that can be used |
4633 | * by a given scheduling class. | 4633 | * by a given scheduling class. |
4634 | */ | 4634 | */ |
4635 | asmlinkage long sys_sched_get_priority_min(int policy) | 4635 | asmlinkage long sys_sched_get_priority_min(int policy) |
4636 | { | 4636 | { |
4637 | int ret = -EINVAL; | 4637 | int ret = -EINVAL; |
4638 | 4638 | ||
4639 | switch (policy) { | 4639 | switch (policy) { |
4640 | case SCHED_FIFO: | 4640 | case SCHED_FIFO: |
4641 | case SCHED_RR: | 4641 | case SCHED_RR: |
4642 | ret = 1; | 4642 | ret = 1; |
4643 | break; | 4643 | break; |
4644 | case SCHED_NORMAL: | 4644 | case SCHED_NORMAL: |
4645 | case SCHED_BATCH: | 4645 | case SCHED_BATCH: |
4646 | ret = 0; | 4646 | ret = 0; |
4647 | } | 4647 | } |
4648 | return ret; | 4648 | return ret; |
4649 | } | 4649 | } |
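
For reference, a trivial user-space probe of the two priority-range syscalls above via the POSIX wrappers (illustrative only):

	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		printf("SCHED_FIFO:  %d..%d\n",
		       sched_get_priority_min(SCHED_FIFO),
		       sched_get_priority_max(SCHED_FIFO));
		printf("SCHED_RR:    %d..%d\n",
		       sched_get_priority_min(SCHED_RR),
		       sched_get_priority_max(SCHED_RR));
		printf("SCHED_OTHER: %d..%d\n",
		       sched_get_priority_min(SCHED_OTHER),
		       sched_get_priority_max(SCHED_OTHER));
		return 0;
	}
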
4650 | 4650 | ||
4651 | /** | 4651 | /** |
4652 | * sys_sched_rr_get_interval - return the default timeslice of a process. | 4652 | * sys_sched_rr_get_interval - return the default timeslice of a process. |
4653 | * @pid: pid of the process. | 4653 | * @pid: pid of the process. |
4654 | * @interval: userspace pointer to the timeslice value. | 4654 | * @interval: userspace pointer to the timeslice value. |
4655 | * | 4655 | * |
4656 | * this syscall writes the default timeslice value of a given process | 4656 | * this syscall writes the default timeslice value of a given process |
4657 | * into the user-space timespec buffer. A value of '0' means infinity. | 4657 | * into the user-space timespec buffer. A value of '0' means infinity. |
4658 | */ | 4658 | */ |
4659 | asmlinkage | 4659 | asmlinkage |
4660 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | 4660 | long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) |
4661 | { | 4661 | { |
4662 | struct task_struct *p; | 4662 | struct task_struct *p; |
4663 | int retval = -EINVAL; | 4663 | int retval = -EINVAL; |
4664 | struct timespec t; | 4664 | struct timespec t; |
4665 | 4665 | ||
4666 | if (pid < 0) | 4666 | if (pid < 0) |
4667 | goto out_nounlock; | 4667 | goto out_nounlock; |
4668 | 4668 | ||
4669 | retval = -ESRCH; | 4669 | retval = -ESRCH; |
4670 | read_lock(&tasklist_lock); | 4670 | read_lock(&tasklist_lock); |
4671 | p = find_process_by_pid(pid); | 4671 | p = find_process_by_pid(pid); |
4672 | if (!p) | 4672 | if (!p) |
4673 | goto out_unlock; | 4673 | goto out_unlock; |
4674 | 4674 | ||
4675 | retval = security_task_getscheduler(p); | 4675 | retval = security_task_getscheduler(p); |
4676 | if (retval) | 4676 | if (retval) |
4677 | goto out_unlock; | 4677 | goto out_unlock; |
4678 | 4678 | ||
4679 | jiffies_to_timespec(p->policy == SCHED_FIFO ? | 4679 | jiffies_to_timespec(p->policy == SCHED_FIFO ? |
4680 | 0 : task_timeslice(p), &t); | 4680 | 0 : task_timeslice(p), &t); |
4681 | read_unlock(&tasklist_lock); | 4681 | read_unlock(&tasklist_lock); |
4682 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | 4682 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
4683 | out_nounlock: | 4683 | out_nounlock: |
4684 | return retval; | 4684 | return retval; |
4685 | out_unlock: | 4685 | out_unlock: |
4686 | read_unlock(&tasklist_lock); | 4686 | read_unlock(&tasklist_lock); |
4687 | return retval; | 4687 | return retval; |
4688 | } | 4688 | } |
4689 | 4689 | ||
4690 | static const char stat_nam[] = "RSDTtZX"; | 4690 | static const char stat_nam[] = "RSDTtZX"; |
4691 | 4691 | ||
4692 | static void show_task(struct task_struct *p) | 4692 | static void show_task(struct task_struct *p) |
4693 | { | 4693 | { |
4694 | unsigned long free = 0; | 4694 | unsigned long free = 0; |
4695 | unsigned state; | 4695 | unsigned state; |
4696 | 4696 | ||
4697 | state = p->state ? __ffs(p->state) + 1 : 0; | 4697 | state = p->state ? __ffs(p->state) + 1 : 0; |
4698 | printk("%-13.13s %c", p->comm, | 4698 | printk("%-13.13s %c", p->comm, |
4699 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); | 4699 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); |
4700 | #if (BITS_PER_LONG == 32) | 4700 | #if (BITS_PER_LONG == 32) |
4701 | if (state == TASK_RUNNING) | 4701 | if (state == TASK_RUNNING) |
4702 | printk(" running "); | 4702 | printk(" running "); |
4703 | else | 4703 | else |
4704 | printk(" %08lX ", thread_saved_pc(p)); | 4704 | printk(" %08lX ", thread_saved_pc(p)); |
4705 | #else | 4705 | #else |
4706 | if (state == TASK_RUNNING) | 4706 | if (state == TASK_RUNNING) |
4707 | printk(" running task "); | 4707 | printk(" running task "); |
4708 | else | 4708 | else |
4709 | printk(" %016lx ", thread_saved_pc(p)); | 4709 | printk(" %016lx ", thread_saved_pc(p)); |
4710 | #endif | 4710 | #endif |
4711 | #ifdef CONFIG_DEBUG_STACK_USAGE | 4711 | #ifdef CONFIG_DEBUG_STACK_USAGE |
4712 | { | 4712 | { |
4713 | unsigned long *n = end_of_stack(p); | 4713 | unsigned long *n = end_of_stack(p); |
4714 | while (!*n) | 4714 | while (!*n) |
4715 | n++; | 4715 | n++; |
4716 | free = (unsigned long)n - (unsigned long)end_of_stack(p); | 4716 | free = (unsigned long)n - (unsigned long)end_of_stack(p); |
4717 | } | 4717 | } |
4718 | #endif | 4718 | #endif |
4719 | printk("%5lu %5d %6d", free, p->pid, p->parent->pid); | 4719 | printk("%5lu %5d %6d", free, p->pid, p->parent->pid); |
4720 | if (!p->mm) | 4720 | if (!p->mm) |
4721 | printk(" (L-TLB)\n"); | 4721 | printk(" (L-TLB)\n"); |
4722 | else | 4722 | else |
4723 | printk(" (NOTLB)\n"); | 4723 | printk(" (NOTLB)\n"); |
4724 | 4724 | ||
4725 | if (state != TASK_RUNNING) | 4725 | if (state != TASK_RUNNING) |
4726 | show_stack(p, NULL); | 4726 | show_stack(p, NULL); |
4727 | } | 4727 | } |
4728 | 4728 | ||
4729 | void show_state_filter(unsigned long state_filter) | 4729 | void show_state_filter(unsigned long state_filter) |
4730 | { | 4730 | { |
4731 | struct task_struct *g, *p; | 4731 | struct task_struct *g, *p; |
4732 | 4732 | ||
4733 | #if (BITS_PER_LONG == 32) | 4733 | #if (BITS_PER_LONG == 32) |
4734 | printk("\n" | 4734 | printk("\n" |
4735 | " free sibling\n"); | 4735 | " free sibling\n"); |
4736 | printk(" task PC stack pid father child younger older\n"); | 4736 | printk(" task PC stack pid father child younger older\n"); |
4737 | #else | 4737 | #else |
4738 | printk("\n" | 4738 | printk("\n" |
4739 | " free sibling\n"); | 4739 | " free sibling\n"); |
4740 | printk(" task PC stack pid father child younger older\n"); | 4740 | printk(" task PC stack pid father child younger older\n"); |
4741 | #endif | 4741 | #endif |
4742 | read_lock(&tasklist_lock); | 4742 | read_lock(&tasklist_lock); |
4743 | do_each_thread(g, p) { | 4743 | do_each_thread(g, p) { |
4744 | /* | 4744 | /* |
4745 | * reset the NMI-timeout, listing all files on a slow | 4745 | * reset the NMI-timeout, listing all files on a slow |
4746 | * console might take a lot of time: | 4746 | * console might take a lot of time: |
4747 | */ | 4747 | */ |
4748 | touch_nmi_watchdog(); | 4748 | touch_nmi_watchdog(); |
4749 | if (!state_filter || (p->state & state_filter)) | 4749 | if (!state_filter || (p->state & state_filter)) |
4750 | show_task(p); | 4750 | show_task(p); |
4751 | } while_each_thread(g, p); | 4751 | } while_each_thread(g, p); |
4752 | 4752 | ||
4753 | read_unlock(&tasklist_lock); | 4753 | read_unlock(&tasklist_lock); |
4754 | /* | 4754 | /* |
4755 | * Only show locks if all tasks are dumped: | 4755 | * Only show locks if all tasks are dumped: |
4756 | */ | 4756 | */ |
4757 | if (state_filter == -1) | 4757 | if (state_filter == -1) |
4758 | debug_show_all_locks(); | 4758 | debug_show_all_locks(); |
4759 | } | 4759 | } |
4760 | 4760 | ||
4761 | /** | 4761 | /** |
4762 | * init_idle - set up an idle thread for a given CPU | 4762 | * init_idle - set up an idle thread for a given CPU |
4763 | * @idle: task in question | 4763 | * @idle: task in question |
4764 | * @cpu: cpu the idle task belongs to | 4764 | * @cpu: cpu the idle task belongs to |
4765 | * | 4765 | * |
4766 | * NOTE: this function does not set the idle thread's NEED_RESCHED | 4766 | * NOTE: this function does not set the idle thread's NEED_RESCHED |
4767 | * flag, to make booting more robust. | 4767 | * flag, to make booting more robust. |
4768 | */ | 4768 | */ |
4769 | void __cpuinit init_idle(struct task_struct *idle, int cpu) | 4769 | void __cpuinit init_idle(struct task_struct *idle, int cpu) |
4770 | { | 4770 | { |
4771 | struct rq *rq = cpu_rq(cpu); | 4771 | struct rq *rq = cpu_rq(cpu); |
4772 | unsigned long flags; | 4772 | unsigned long flags; |
4773 | 4773 | ||
4774 | idle->timestamp = sched_clock(); | 4774 | idle->timestamp = sched_clock(); |
4775 | idle->sleep_avg = 0; | 4775 | idle->sleep_avg = 0; |
4776 | idle->array = NULL; | 4776 | idle->array = NULL; |
4777 | idle->prio = idle->normal_prio = MAX_PRIO; | 4777 | idle->prio = idle->normal_prio = MAX_PRIO; |
4778 | idle->state = TASK_RUNNING; | 4778 | idle->state = TASK_RUNNING; |
4779 | idle->cpus_allowed = cpumask_of_cpu(cpu); | 4779 | idle->cpus_allowed = cpumask_of_cpu(cpu); |
4780 | set_task_cpu(idle, cpu); | 4780 | set_task_cpu(idle, cpu); |
4781 | 4781 | ||
4782 | spin_lock_irqsave(&rq->lock, flags); | 4782 | spin_lock_irqsave(&rq->lock, flags); |
4783 | rq->curr = rq->idle = idle; | 4783 | rq->curr = rq->idle = idle; |
4784 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 4784 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
4785 | idle->oncpu = 1; | 4785 | idle->oncpu = 1; |
4786 | #endif | 4786 | #endif |
4787 | spin_unlock_irqrestore(&rq->lock, flags); | 4787 | spin_unlock_irqrestore(&rq->lock, flags); |
4788 | 4788 | ||
4789 | /* Set the preempt count _outside_ the spinlocks! */ | 4789 | /* Set the preempt count _outside_ the spinlocks! */ |
4790 | #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) | 4790 | #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) |
4791 | task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); | 4791 | task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); |
4792 | #else | 4792 | #else |
4793 | task_thread_info(idle)->preempt_count = 0; | 4793 | task_thread_info(idle)->preempt_count = 0; |
4794 | #endif | 4794 | #endif |
4795 | } | 4795 | } |
4796 | 4796 | ||
4797 | /* | 4797 | /* |
4798 | * In a system that switches off the HZ timer nohz_cpu_mask | 4798 | * In a system that switches off the HZ timer nohz_cpu_mask |
4799 | * indicates which cpus entered this state. This is used | 4799 | * indicates which cpus entered this state. This is used |
4800 | * in the rcu update to wait only for active cpus. For systems | 4800 | * in the rcu update to wait only for active cpus. For systems |
4801 | * which do not switch off the HZ timer nohz_cpu_mask should | 4801 | * which do not switch off the HZ timer nohz_cpu_mask should |
4802 | * always be CPU_MASK_NONE. | 4802 | * always be CPU_MASK_NONE. |
4803 | */ | 4803 | */ |
4804 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | 4804 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; |
4805 | 4805 | ||
4806 | #ifdef CONFIG_SMP | 4806 | #ifdef CONFIG_SMP |
4807 | /* | 4807 | /* |
4808 | * This is how migration works: | 4808 | * This is how migration works: |
4809 | * | 4809 | * |
4810 | * 1) we queue a struct migration_req structure in the source CPU's | 4810 | * 1) we queue a struct migration_req structure in the source CPU's |
4811 | * runqueue and wake up that CPU's migration thread. | 4811 | * runqueue and wake up that CPU's migration thread. |
4812 | * 2) we down() the locked semaphore => thread blocks. | 4812 | * 2) we down() the locked semaphore => thread blocks. |
4813 | * 3) migration thread wakes up (implicitly it forces the migrated | 4813 | * 3) migration thread wakes up (implicitly it forces the migrated |
4814 | * thread off the CPU) | 4814 | * thread off the CPU) |
4815 | * 4) it gets the migration request and checks whether the migrated | 4815 | * 4) it gets the migration request and checks whether the migrated |
4816 | * task is still in the wrong runqueue. | 4816 | * task is still in the wrong runqueue. |
4817 | * 5) if it's in the wrong runqueue then the migration thread removes | 4817 | * 5) if it's in the wrong runqueue then the migration thread removes |
4818 | * it and puts it into the right queue. | 4818 | * it and puts it into the right queue. |
4819 | * 6) migration thread up()s the semaphore. | 4819 | * 6) migration thread up()s the semaphore. |
4820 | * 7) we wake up and the migration is done. | 4820 | * 7) we wake up and the migration is done. |
4821 | */ | 4821 | */ |
4822 | 4822 | ||
4823 | /* | 4823 | /* |
4824 | * Change a given task's CPU affinity. Migrate the thread to a | 4824 | * Change a given task's CPU affinity. Migrate the thread to a |
4825 | * proper CPU and schedule it away if the CPU it's executing on | 4825 | * proper CPU and schedule it away if the CPU it's executing on |
4826 | * is removed from the allowed bitmask. | 4826 | * is removed from the allowed bitmask. |
4827 | * | 4827 | * |
4828 | * NOTE: the caller must have a valid reference to the task, the | 4828 | * NOTE: the caller must have a valid reference to the task, the |
4829 | * task must not exit() & deallocate itself prematurely. The | 4829 | * task must not exit() & deallocate itself prematurely. The |
4830 | * call is not atomic; no spinlocks may be held. | 4830 | * call is not atomic; no spinlocks may be held. |
4831 | */ | 4831 | */ |
4832 | int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | 4832 | int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) |
4833 | { | 4833 | { |
4834 | struct migration_req req; | 4834 | struct migration_req req; |
4835 | unsigned long flags; | 4835 | unsigned long flags; |
4836 | struct rq *rq; | 4836 | struct rq *rq; |
4837 | int ret = 0; | 4837 | int ret = 0; |
4838 | 4838 | ||
4839 | rq = task_rq_lock(p, &flags); | 4839 | rq = task_rq_lock(p, &flags); |
4840 | if (!cpus_intersects(new_mask, cpu_online_map)) { | 4840 | if (!cpus_intersects(new_mask, cpu_online_map)) { |
4841 | ret = -EINVAL; | 4841 | ret = -EINVAL; |
4842 | goto out; | 4842 | goto out; |
4843 | } | 4843 | } |
4844 | 4844 | ||
4845 | p->cpus_allowed = new_mask; | 4845 | p->cpus_allowed = new_mask; |
4846 | /* Can the task run on the task's current CPU? If so, we're done */ | 4846 | /* Can the task run on the task's current CPU? If so, we're done */ |
4847 | if (cpu_isset(task_cpu(p), new_mask)) | 4847 | if (cpu_isset(task_cpu(p), new_mask)) |
4848 | goto out; | 4848 | goto out; |
4849 | 4849 | ||
4850 | if (migrate_task(p, any_online_cpu(new_mask), &req)) { | 4850 | if (migrate_task(p, any_online_cpu(new_mask), &req)) { |
4851 | /* Need help from migration thread: drop lock and wait. */ | 4851 | /* Need help from migration thread: drop lock and wait. */ |
4852 | task_rq_unlock(rq, &flags); | 4852 | task_rq_unlock(rq, &flags); |
4853 | wake_up_process(rq->migration_thread); | 4853 | wake_up_process(rq->migration_thread); |
4854 | wait_for_completion(&req.done); | 4854 | wait_for_completion(&req.done); |
4855 | tlb_migrate_finish(p->mm); | 4855 | tlb_migrate_finish(p->mm); |
4856 | return 0; | 4856 | return 0; |
4857 | } | 4857 | } |
4858 | out: | 4858 | out: |
4859 | task_rq_unlock(rq, &flags); | 4859 | task_rq_unlock(rq, &flags); |
4860 | 4860 | ||
4861 | return ret; | 4861 | return ret; |
4862 | } | 4862 | } |
4863 | EXPORT_SYMBOL_GPL(set_cpus_allowed); | 4863 | EXPORT_SYMBOL_GPL(set_cpus_allowed); |
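
A hypothetical sketch of a typical in-kernel caller: pinning a freshly created kernel thread to one CPU with set_cpus_allowed() (worker_fn and the "worker" naming are invented for illustration):

	static struct task_struct *start_worker_on(int cpu)
	{
		struct task_struct *tsk;

		tsk = kthread_create(worker_fn, NULL, "worker/%d", cpu);
		if (IS_ERR(tsk))
			return tsk;

		/* restrict the new thread to 'cpu' before it first runs */
		if (set_cpus_allowed(tsk, cpumask_of_cpu(cpu)))
			printk(KERN_WARNING "worker: cannot bind to cpu%d\n", cpu);

		wake_up_process(tsk);
		return tsk;
	}
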
4864 | 4864 | ||
4865 | /* | 4865 | /* |
4866 | * Move (not current) task off this cpu, onto dest cpu. We're doing | 4866 | * Move (not current) task off this cpu, onto dest cpu. We're doing |
4867 | * this because either it can't run here any more (set_cpus_allowed() | 4867 | * this because either it can't run here any more (set_cpus_allowed() |
4868 | * away from this CPU, or CPU going down), or because we're | 4868 | * away from this CPU, or CPU going down), or because we're |
4869 | * attempting to rebalance this task on exec (sched_exec). | 4869 | * attempting to rebalance this task on exec (sched_exec). |
4870 | * | 4870 | * |
4871 | * So we race with normal scheduler movements, but that's OK, as long | 4871 | * So we race with normal scheduler movements, but that's OK, as long |
4872 | * as the task is no longer on this CPU. | 4872 | * as the task is no longer on this CPU. |
4873 | * | 4873 | * |
4874 | * Returns non-zero if task was successfully migrated. | 4874 | * Returns non-zero if task was successfully migrated. |
4875 | */ | 4875 | */ |
4876 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | 4876 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) |
4877 | { | 4877 | { |
4878 | struct rq *rq_dest, *rq_src; | 4878 | struct rq *rq_dest, *rq_src; |
4879 | int ret = 0; | 4879 | int ret = 0; |
4880 | 4880 | ||
4881 | if (unlikely(cpu_is_offline(dest_cpu))) | 4881 | if (unlikely(cpu_is_offline(dest_cpu))) |
4882 | return ret; | 4882 | return ret; |
4883 | 4883 | ||
4884 | rq_src = cpu_rq(src_cpu); | 4884 | rq_src = cpu_rq(src_cpu); |
4885 | rq_dest = cpu_rq(dest_cpu); | 4885 | rq_dest = cpu_rq(dest_cpu); |
4886 | 4886 | ||
4887 | double_rq_lock(rq_src, rq_dest); | 4887 | double_rq_lock(rq_src, rq_dest); |
4888 | /* Already moved. */ | 4888 | /* Already moved. */ |
4889 | if (task_cpu(p) != src_cpu) | 4889 | if (task_cpu(p) != src_cpu) |
4890 | goto out; | 4890 | goto out; |
4891 | /* Affinity changed (again). */ | 4891 | /* Affinity changed (again). */ |
4892 | if (!cpu_isset(dest_cpu, p->cpus_allowed)) | 4892 | if (!cpu_isset(dest_cpu, p->cpus_allowed)) |
4893 | goto out; | 4893 | goto out; |
4894 | 4894 | ||
4895 | set_task_cpu(p, dest_cpu); | 4895 | set_task_cpu(p, dest_cpu); |
4896 | if (p->array) { | 4896 | if (p->array) { |
4897 | /* | 4897 | /* |
4898 | * Sync timestamp with rq_dest's before activating. | 4898 | * Sync timestamp with rq_dest's before activating. |
4899 | * The same thing could be achieved by doing this step | 4899 | * The same thing could be achieved by doing this step |
4900 | * afterwards, and pretending it was a local activate. | 4900 | * afterwards, and pretending it was a local activate. |
4901 | * This way is cleaner and logically correct. | 4901 | * This way is cleaner and logically correct. |
4902 | */ | 4902 | */ |
4903 | p->timestamp = p->timestamp - rq_src->most_recent_timestamp | 4903 | p->timestamp = p->timestamp - rq_src->most_recent_timestamp |
4904 | + rq_dest->most_recent_timestamp; | 4904 | + rq_dest->most_recent_timestamp; |
4905 | deactivate_task(p, rq_src); | 4905 | deactivate_task(p, rq_src); |
4906 | __activate_task(p, rq_dest); | 4906 | __activate_task(p, rq_dest); |
4907 | if (TASK_PREEMPTS_CURR(p, rq_dest)) | 4907 | if (TASK_PREEMPTS_CURR(p, rq_dest)) |
4908 | resched_task(rq_dest->curr); | 4908 | resched_task(rq_dest->curr); |
4909 | } | 4909 | } |
4910 | ret = 1; | 4910 | ret = 1; |
4911 | out: | 4911 | out: |
4912 | double_rq_unlock(rq_src, rq_dest); | 4912 | double_rq_unlock(rq_src, rq_dest); |
4913 | return ret; | 4913 | return ret; |
4914 | } | 4914 | } |
4915 | 4915 | ||
4916 | /* | 4916 | /* |
4917 | * migration_thread - this is a highprio system thread that performs | 4917 | * migration_thread - this is a highprio system thread that performs |
4918 | * thread migration by bumping thread off CPU then 'pushing' onto | 4918 | * thread migration by bumping thread off CPU then 'pushing' onto |
4919 | * another runqueue. | 4919 | * another runqueue. |
4920 | */ | 4920 | */ |
4921 | static int migration_thread(void *data) | 4921 | static int migration_thread(void *data) |
4922 | { | 4922 | { |
4923 | int cpu = (long)data; | 4923 | int cpu = (long)data; |
4924 | struct rq *rq; | 4924 | struct rq *rq; |
4925 | 4925 | ||
4926 | rq = cpu_rq(cpu); | 4926 | rq = cpu_rq(cpu); |
4927 | BUG_ON(rq->migration_thread != current); | 4927 | BUG_ON(rq->migration_thread != current); |
4928 | 4928 | ||
4929 | set_current_state(TASK_INTERRUPTIBLE); | 4929 | set_current_state(TASK_INTERRUPTIBLE); |
4930 | while (!kthread_should_stop()) { | 4930 | while (!kthread_should_stop()) { |
4931 | struct migration_req *req; | 4931 | struct migration_req *req; |
4932 | struct list_head *head; | 4932 | struct list_head *head; |
4933 | 4933 | ||
4934 | try_to_freeze(); | 4934 | try_to_freeze(); |
4935 | 4935 | ||
4936 | spin_lock_irq(&rq->lock); | 4936 | spin_lock_irq(&rq->lock); |
4937 | 4937 | ||
4938 | if (cpu_is_offline(cpu)) { | 4938 | if (cpu_is_offline(cpu)) { |
4939 | spin_unlock_irq(&rq->lock); | 4939 | spin_unlock_irq(&rq->lock); |
4940 | goto wait_to_die; | 4940 | goto wait_to_die; |
4941 | } | 4941 | } |
4942 | 4942 | ||
4943 | if (rq->active_balance) { | 4943 | if (rq->active_balance) { |
4944 | active_load_balance(rq, cpu); | 4944 | active_load_balance(rq, cpu); |
4945 | rq->active_balance = 0; | 4945 | rq->active_balance = 0; |
4946 | } | 4946 | } |
4947 | 4947 | ||
4948 | head = &rq->migration_queue; | 4948 | head = &rq->migration_queue; |
4949 | 4949 | ||
4950 | if (list_empty(head)) { | 4950 | if (list_empty(head)) { |
4951 | spin_unlock_irq(&rq->lock); | 4951 | spin_unlock_irq(&rq->lock); |
4952 | schedule(); | 4952 | schedule(); |
4953 | set_current_state(TASK_INTERRUPTIBLE); | 4953 | set_current_state(TASK_INTERRUPTIBLE); |
4954 | continue; | 4954 | continue; |
4955 | } | 4955 | } |
4956 | req = list_entry(head->next, struct migration_req, list); | 4956 | req = list_entry(head->next, struct migration_req, list); |
4957 | list_del_init(head->next); | 4957 | list_del_init(head->next); |
4958 | 4958 | ||
4959 | spin_unlock(&rq->lock); | 4959 | spin_unlock(&rq->lock); |
4960 | __migrate_task(req->task, cpu, req->dest_cpu); | 4960 | __migrate_task(req->task, cpu, req->dest_cpu); |
4961 | local_irq_enable(); | 4961 | local_irq_enable(); |
4962 | 4962 | ||
4963 | complete(&req->done); | 4963 | complete(&req->done); |
4964 | } | 4964 | } |
4965 | __set_current_state(TASK_RUNNING); | 4965 | __set_current_state(TASK_RUNNING); |
4966 | return 0; | 4966 | return 0; |
4967 | 4967 | ||
4968 | wait_to_die: | 4968 | wait_to_die: |
4969 | /* Wait for kthread_stop */ | 4969 | /* Wait for kthread_stop */ |
4970 | set_current_state(TASK_INTERRUPTIBLE); | 4970 | set_current_state(TASK_INTERRUPTIBLE); |
4971 | while (!kthread_should_stop()) { | 4971 | while (!kthread_should_stop()) { |
4972 | schedule(); | 4972 | schedule(); |
4973 | set_current_state(TASK_INTERRUPTIBLE); | 4973 | set_current_state(TASK_INTERRUPTIBLE); |
4974 | } | 4974 | } |
4975 | __set_current_state(TASK_RUNNING); | 4975 | __set_current_state(TASK_RUNNING); |
4976 | return 0; | 4976 | return 0; |
4977 | } | 4977 | } |
4978 | 4978 | ||
4979 | #ifdef CONFIG_HOTPLUG_CPU | 4979 | #ifdef CONFIG_HOTPLUG_CPU |
4980 | /* | 4980 | /* |
4981 | * Figure out where task on dead CPU should go, use force if necessary. | 4981 | * Figure out where task on dead CPU should go, use force if necessary. |
4982 | * NOTE: interrupts should be disabled by the caller | 4982 | * NOTE: interrupts should be disabled by the caller |
4983 | */ | 4983 | */ |
4984 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | 4984 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) |
4985 | { | 4985 | { |
4986 | unsigned long flags; | 4986 | unsigned long flags; |
4987 | cpumask_t mask; | 4987 | cpumask_t mask; |
4988 | struct rq *rq; | 4988 | struct rq *rq; |
4989 | int dest_cpu; | 4989 | int dest_cpu; |
4990 | 4990 | ||
4991 | restart: | 4991 | restart: |
4992 | /* On same node? */ | 4992 | /* On same node? */ |
4993 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); | 4993 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); |
4994 | cpus_and(mask, mask, p->cpus_allowed); | 4994 | cpus_and(mask, mask, p->cpus_allowed); |
4995 | dest_cpu = any_online_cpu(mask); | 4995 | dest_cpu = any_online_cpu(mask); |
4996 | 4996 | ||
4997 | /* On any allowed CPU? */ | 4997 | /* On any allowed CPU? */ |
4998 | if (dest_cpu == NR_CPUS) | 4998 | if (dest_cpu == NR_CPUS) |
4999 | dest_cpu = any_online_cpu(p->cpus_allowed); | 4999 | dest_cpu = any_online_cpu(p->cpus_allowed); |
5000 | 5000 | ||
5001 | /* No more Mr. Nice Guy. */ | 5001 | /* No more Mr. Nice Guy. */ |
5002 | if (dest_cpu == NR_CPUS) { | 5002 | if (dest_cpu == NR_CPUS) { |
5003 | rq = task_rq_lock(p, &flags); | 5003 | rq = task_rq_lock(p, &flags); |
5004 | cpus_setall(p->cpus_allowed); | 5004 | cpus_setall(p->cpus_allowed); |
5005 | dest_cpu = any_online_cpu(p->cpus_allowed); | 5005 | dest_cpu = any_online_cpu(p->cpus_allowed); |
5006 | task_rq_unlock(rq, &flags); | 5006 | task_rq_unlock(rq, &flags); |
5007 | 5007 | ||
5008 | /* | 5008 | /* |
5009 | * Don't tell them about moving exiting tasks or | 5009 | * Don't tell them about moving exiting tasks or |
5010 | * kernel threads (both mm NULL), since they never | 5010 | * kernel threads (both mm NULL), since they never |
5011 | * leave kernel. | 5011 | * leave kernel. |
5012 | */ | 5012 | */ |
5013 | if (p->mm && printk_ratelimit()) | 5013 | if (p->mm && printk_ratelimit()) |
5014 | printk(KERN_INFO "process %d (%s) no " | 5014 | printk(KERN_INFO "process %d (%s) no " |
5015 | "longer affine to cpu%d\n", | 5015 | "longer affine to cpu%d\n", |
5016 | p->pid, p->comm, dead_cpu); | 5016 | p->pid, p->comm, dead_cpu); |
5017 | } | 5017 | } |
5018 | if (!__migrate_task(p, dead_cpu, dest_cpu)) | 5018 | if (!__migrate_task(p, dead_cpu, dest_cpu)) |
5019 | goto restart; | 5019 | goto restart; |
5020 | } | 5020 | } |
5021 | 5021 | ||
5022 | /* | 5022 | /* |
5023 | * While a dead CPU has no uninterruptible tasks queued at this point, | 5023 | * While a dead CPU has no uninterruptible tasks queued at this point, |
5024 | * it might still have a nonzero ->nr_uninterruptible counter, because | 5024 | * it might still have a nonzero ->nr_uninterruptible counter, because |
5025 | * for performance reasons the counter is not strictly tracking tasks to | 5025 | * for performance reasons the counter is not strictly tracking tasks to |
5026 | * their home CPUs. So we just add the counter to another CPU's counter, | 5026 | * their home CPUs. So we just add the counter to another CPU's counter, |
5027 | * to keep the global sum constant after CPU-down: | 5027 | * to keep the global sum constant after CPU-down: |
5028 | */ | 5028 | */ |
5029 | static void migrate_nr_uninterruptible(struct rq *rq_src) | 5029 | static void migrate_nr_uninterruptible(struct rq *rq_src) |
5030 | { | 5030 | { |
5031 | struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); | 5031 | struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); |
5032 | unsigned long flags; | 5032 | unsigned long flags; |
5033 | 5033 | ||
5034 | local_irq_save(flags); | 5034 | local_irq_save(flags); |
5035 | double_rq_lock(rq_src, rq_dest); | 5035 | double_rq_lock(rq_src, rq_dest); |
5036 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; | 5036 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; |
5037 | rq_src->nr_uninterruptible = 0; | 5037 | rq_src->nr_uninterruptible = 0; |
5038 | double_rq_unlock(rq_src, rq_dest); | 5038 | double_rq_unlock(rq_src, rq_dest); |
5039 | local_irq_restore(flags); | 5039 | local_irq_restore(flags); |
5040 | } | 5040 | } |
5041 | 5041 | ||
5042 | /* Run through task list and migrate tasks from the dead cpu. */ | 5042 | /* Run through task list and migrate tasks from the dead cpu. */ |
5043 | static void migrate_live_tasks(int src_cpu) | 5043 | static void migrate_live_tasks(int src_cpu) |
5044 | { | 5044 | { |
5045 | struct task_struct *p, *t; | 5045 | struct task_struct *p, *t; |
5046 | 5046 | ||
5047 | write_lock_irq(&tasklist_lock); | 5047 | write_lock_irq(&tasklist_lock); |
5048 | 5048 | ||
5049 | do_each_thread(t, p) { | 5049 | do_each_thread(t, p) { |
5050 | if (p == current) | 5050 | if (p == current) |
5051 | continue; | 5051 | continue; |
5052 | 5052 | ||
5053 | if (task_cpu(p) == src_cpu) | 5053 | if (task_cpu(p) == src_cpu) |
5054 | move_task_off_dead_cpu(src_cpu, p); | 5054 | move_task_off_dead_cpu(src_cpu, p); |
5055 | } while_each_thread(t, p); | 5055 | } while_each_thread(t, p); |
5056 | 5056 | ||
5057 | write_unlock_irq(&tasklist_lock); | 5057 | write_unlock_irq(&tasklist_lock); |
5058 | } | 5058 | } |
5059 | 5059 | ||
5060 | /* Schedules idle task to be the next runnable task on current CPU. | 5060 | /* Schedules idle task to be the next runnable task on current CPU. |
5061 | * It does so by boosting its priority to highest possible and adding it to | 5061 | * It does so by boosting its priority to highest possible and adding it to |
5062 | * the _front_ of the runqueue. Used by CPU offline code. | 5062 | * the _front_ of the runqueue. Used by CPU offline code. |
5063 | */ | 5063 | */ |
5064 | void sched_idle_next(void) | 5064 | void sched_idle_next(void) |
5065 | { | 5065 | { |
5066 | int this_cpu = smp_processor_id(); | 5066 | int this_cpu = smp_processor_id(); |
5067 | struct rq *rq = cpu_rq(this_cpu); | 5067 | struct rq *rq = cpu_rq(this_cpu); |
5068 | struct task_struct *p = rq->idle; | 5068 | struct task_struct *p = rq->idle; |
5069 | unsigned long flags; | 5069 | unsigned long flags; |
5070 | 5070 | ||
5071 | /* cpu has to be offline */ | 5071 | /* cpu has to be offline */ |
5072 | BUG_ON(cpu_online(this_cpu)); | 5072 | BUG_ON(cpu_online(this_cpu)); |
5073 | 5073 | ||
5074 | /* | 5074 | /* |
5075 | * Strictly not necessary since rest of the CPUs are stopped by now | 5075 | * Strictly not necessary since rest of the CPUs are stopped by now |
5076 | * and interrupts disabled on the current cpu. | 5076 | * and interrupts disabled on the current cpu. |
5077 | */ | 5077 | */ |
5078 | spin_lock_irqsave(&rq->lock, flags); | 5078 | spin_lock_irqsave(&rq->lock, flags); |
5079 | 5079 | ||
5080 | __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); | 5080 | __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); |
5081 | 5081 | ||
5082 | /* Add idle task to the _front_ of its priority queue: */ | 5082 | /* Add idle task to the _front_ of its priority queue: */ |
5083 | __activate_idle_task(p, rq); | 5083 | __activate_idle_task(p, rq); |
5084 | 5084 | ||
5085 | spin_unlock_irqrestore(&rq->lock, flags); | 5085 | spin_unlock_irqrestore(&rq->lock, flags); |
5086 | } | 5086 | } |
5087 | 5087 | ||
5088 | /* | 5088 | /* |
5089 | * Ensures that the idle task is using init_mm right before its cpu goes | 5089 | * Ensures that the idle task is using init_mm right before its cpu goes |
5090 | * offline. | 5090 | * offline. |
5091 | */ | 5091 | */ |
5092 | void idle_task_exit(void) | 5092 | void idle_task_exit(void) |
5093 | { | 5093 | { |
5094 | struct mm_struct *mm = current->active_mm; | 5094 | struct mm_struct *mm = current->active_mm; |
5095 | 5095 | ||
5096 | BUG_ON(cpu_online(smp_processor_id())); | 5096 | BUG_ON(cpu_online(smp_processor_id())); |
5097 | 5097 | ||
5098 | if (mm != &init_mm) | 5098 | if (mm != &init_mm) |
5099 | switch_mm(mm, &init_mm, current); | 5099 | switch_mm(mm, &init_mm, current); |
5100 | mmdrop(mm); | 5100 | mmdrop(mm); |
5101 | } | 5101 | } |
5102 | 5102 | ||
5103 | /* called under rq->lock with disabled interrupts */ | 5103 | /* called under rq->lock with disabled interrupts */ |
5104 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) | 5104 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) |
5105 | { | 5105 | { |
5106 | struct rq *rq = cpu_rq(dead_cpu); | 5106 | struct rq *rq = cpu_rq(dead_cpu); |
5107 | 5107 | ||
5108 | /* Must be exiting, otherwise would be on tasklist. */ | 5108 | /* Must be exiting, otherwise would be on tasklist. */ |
5109 | BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); | 5109 | BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); |
5110 | 5110 | ||
5111 | /* Cannot have done final schedule yet: would have vanished. */ | 5111 | /* Cannot have done final schedule yet: would have vanished. */ |
5112 | BUG_ON(p->state == TASK_DEAD); | 5112 | BUG_ON(p->state == TASK_DEAD); |
5113 | 5113 | ||
5114 | get_task_struct(p); | 5114 | get_task_struct(p); |
5115 | 5115 | ||
5116 | /* | 5116 | /* |
5117 | * Drop lock around migration; if someone else moves it, | 5117 | * Drop lock around migration; if someone else moves it, |
5118 | * that's OK. No task can be added to this CPU, so iteration is | 5118 | * that's OK. No task can be added to this CPU, so iteration is |
5119 | * fine. | 5119 | * fine. |
5120 | * NOTE: interrupts should be left disabled --dev@ | 5120 | * NOTE: interrupts should be left disabled --dev@ |
5121 | */ | 5121 | */ |
5122 | spin_unlock(&rq->lock); | 5122 | spin_unlock(&rq->lock); |
5123 | move_task_off_dead_cpu(dead_cpu, p); | 5123 | move_task_off_dead_cpu(dead_cpu, p); |
5124 | spin_lock(&rq->lock); | 5124 | spin_lock(&rq->lock); |
5125 | 5125 | ||
5126 | put_task_struct(p); | 5126 | put_task_struct(p); |
5127 | } | 5127 | } |
5128 | 5128 | ||
5129 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ | 5129 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ |
5130 | static void migrate_dead_tasks(unsigned int dead_cpu) | 5130 | static void migrate_dead_tasks(unsigned int dead_cpu) |
5131 | { | 5131 | { |
5132 | struct rq *rq = cpu_rq(dead_cpu); | 5132 | struct rq *rq = cpu_rq(dead_cpu); |
5133 | unsigned int arr, i; | 5133 | unsigned int arr, i; |
5134 | 5134 | ||
5135 | for (arr = 0; arr < 2; arr++) { | 5135 | for (arr = 0; arr < 2; arr++) { |
5136 | for (i = 0; i < MAX_PRIO; i++) { | 5136 | for (i = 0; i < MAX_PRIO; i++) { |
5137 | struct list_head *list = &rq->arrays[arr].queue[i]; | 5137 | struct list_head *list = &rq->arrays[arr].queue[i]; |
5138 | 5138 | ||
5139 | while (!list_empty(list)) | 5139 | while (!list_empty(list)) |
5140 | migrate_dead(dead_cpu, list_entry(list->next, | 5140 | migrate_dead(dead_cpu, list_entry(list->next, |
5141 | struct task_struct, run_list)); | 5141 | struct task_struct, run_list)); |
5142 | } | 5142 | } |
5143 | } | 5143 | } |
5144 | } | 5144 | } |
5145 | #endif /* CONFIG_HOTPLUG_CPU */ | 5145 | #endif /* CONFIG_HOTPLUG_CPU */ |
5146 | 5146 | ||
5147 | /* | 5147 | /* |
5148 | * migration_call - callback that gets triggered when a CPU is added. | 5148 | * migration_call - callback that gets triggered when a CPU is added. |
5149 | * Here we can start up the necessary migration thread for the new CPU. | 5149 | * Here we can start up the necessary migration thread for the new CPU. |
5150 | */ | 5150 | */ |
5151 | static int __cpuinit | 5151 | static int __cpuinit |
5152 | migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | 5152 | migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) |
5153 | { | 5153 | { |
5154 | struct task_struct *p; | 5154 | struct task_struct *p; |
5155 | int cpu = (long)hcpu; | 5155 | int cpu = (long)hcpu; |
5156 | unsigned long flags; | 5156 | unsigned long flags; |
5157 | struct rq *rq; | 5157 | struct rq *rq; |
5158 | 5158 | ||
5159 | switch (action) { | 5159 | switch (action) { |
5160 | case CPU_UP_PREPARE: | 5160 | case CPU_UP_PREPARE: |
5161 | p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); | 5161 | p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); |
5162 | if (IS_ERR(p)) | 5162 | if (IS_ERR(p)) |
5163 | return NOTIFY_BAD; | 5163 | return NOTIFY_BAD; |
5164 | p->flags |= PF_NOFREEZE; | 5164 | p->flags |= PF_NOFREEZE; |
5165 | kthread_bind(p, cpu); | 5165 | kthread_bind(p, cpu); |
5166 | /* Must be high prio: stop_machine expects to yield to it. */ | 5166 | /* Must be high prio: stop_machine expects to yield to it. */ |
5167 | rq = task_rq_lock(p, &flags); | 5167 | rq = task_rq_lock(p, &flags); |
5168 | __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); | 5168 | __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); |
5169 | task_rq_unlock(rq, &flags); | 5169 | task_rq_unlock(rq, &flags); |
5170 | cpu_rq(cpu)->migration_thread = p; | 5170 | cpu_rq(cpu)->migration_thread = p; |
5171 | break; | 5171 | break; |
5172 | 5172 | ||
5173 | case CPU_ONLINE: | 5173 | case CPU_ONLINE: |
5174 | /* Strictly unnecessary, as first user will wake it. */ | 5174 | /* Strictly unnecessary, as first user will wake it. */ |
5175 | wake_up_process(cpu_rq(cpu)->migration_thread); | 5175 | wake_up_process(cpu_rq(cpu)->migration_thread); |
5176 | break; | 5176 | break; |
5177 | 5177 | ||
5178 | #ifdef CONFIG_HOTPLUG_CPU | 5178 | #ifdef CONFIG_HOTPLUG_CPU |
5179 | case CPU_UP_CANCELED: | 5179 | case CPU_UP_CANCELED: |
5180 | if (!cpu_rq(cpu)->migration_thread) | 5180 | if (!cpu_rq(cpu)->migration_thread) |
5181 | break; | 5181 | break; |
5182 | /* Unbind it from offline cpu so it can run. Fall thru. */ | 5182 | /* Unbind it from offline cpu so it can run. Fall thru. */ |
5183 | kthread_bind(cpu_rq(cpu)->migration_thread, | 5183 | kthread_bind(cpu_rq(cpu)->migration_thread, |
5184 | any_online_cpu(cpu_online_map)); | 5184 | any_online_cpu(cpu_online_map)); |
5185 | kthread_stop(cpu_rq(cpu)->migration_thread); | 5185 | kthread_stop(cpu_rq(cpu)->migration_thread); |
5186 | cpu_rq(cpu)->migration_thread = NULL; | 5186 | cpu_rq(cpu)->migration_thread = NULL; |
5187 | break; | 5187 | break; |
5188 | 5188 | ||
5189 | case CPU_DEAD: | 5189 | case CPU_DEAD: |
5190 | migrate_live_tasks(cpu); | 5190 | migrate_live_tasks(cpu); |
5191 | rq = cpu_rq(cpu); | 5191 | rq = cpu_rq(cpu); |
5192 | kthread_stop(rq->migration_thread); | 5192 | kthread_stop(rq->migration_thread); |
5193 | rq->migration_thread = NULL; | 5193 | rq->migration_thread = NULL; |
5194 | /* Idle task back to normal (off runqueue, low prio) */ | 5194 | /* Idle task back to normal (off runqueue, low prio) */ |
5195 | rq = task_rq_lock(rq->idle, &flags); | 5195 | rq = task_rq_lock(rq->idle, &flags); |
5196 | deactivate_task(rq->idle, rq); | 5196 | deactivate_task(rq->idle, rq); |
5197 | rq->idle->static_prio = MAX_PRIO; | 5197 | rq->idle->static_prio = MAX_PRIO; |
5198 | __setscheduler(rq->idle, SCHED_NORMAL, 0); | 5198 | __setscheduler(rq->idle, SCHED_NORMAL, 0); |
5199 | migrate_dead_tasks(cpu); | 5199 | migrate_dead_tasks(cpu); |
5200 | task_rq_unlock(rq, &flags); | 5200 | task_rq_unlock(rq, &flags); |
5201 | migrate_nr_uninterruptible(rq); | 5201 | migrate_nr_uninterruptible(rq); |
5202 | BUG_ON(rq->nr_running != 0); | 5202 | BUG_ON(rq->nr_running != 0); |
5203 | 5203 | ||
5204 | /* No need to migrate the tasks: it was best-effort if | 5204 | /* No need to migrate the tasks: it was best-effort if |
5205 | * they didn't do lock_cpu_hotplug(). Just wake up | 5205 | * they didn't do lock_cpu_hotplug(). Just wake up |
5206 | * the requestors. */ | 5206 | * the requestors. */ |
5207 | spin_lock_irq(&rq->lock); | 5207 | spin_lock_irq(&rq->lock); |
5208 | while (!list_empty(&rq->migration_queue)) { | 5208 | while (!list_empty(&rq->migration_queue)) { |
5209 | struct migration_req *req; | 5209 | struct migration_req *req; |
5210 | 5210 | ||
5211 | req = list_entry(rq->migration_queue.next, | 5211 | req = list_entry(rq->migration_queue.next, |
5212 | struct migration_req, list); | 5212 | struct migration_req, list); |
5213 | list_del_init(&req->list); | 5213 | list_del_init(&req->list); |
5214 | complete(&req->done); | 5214 | complete(&req->done); |
5215 | } | 5215 | } |
5216 | spin_unlock_irq(&rq->lock); | 5216 | spin_unlock_irq(&rq->lock); |
5217 | break; | 5217 | break; |
5218 | #endif | 5218 | #endif |
5219 | } | 5219 | } |
5220 | return NOTIFY_OK; | 5220 | return NOTIFY_OK; |
5221 | } | 5221 | } |
5222 | 5222 | ||
5223 | /* Register at highest priority so that task migration (migrate_all_tasks) | 5223 | /* Register at highest priority so that task migration (migrate_all_tasks) |
5224 | * happens before everything else. | 5224 | * happens before everything else. |
5225 | */ | 5225 | */ |
5226 | static struct notifier_block __cpuinitdata migration_notifier = { | 5226 | static struct notifier_block __cpuinitdata migration_notifier = { |
5227 | .notifier_call = migration_call, | 5227 | .notifier_call = migration_call, |
5228 | .priority = 10 | 5228 | .priority = 10 |
5229 | }; | 5229 | }; |
5230 | 5230 | ||
5231 | int __init migration_init(void) | 5231 | int __init migration_init(void) |
5232 | { | 5232 | { |
5233 | void *cpu = (void *)(long)smp_processor_id(); | 5233 | void *cpu = (void *)(long)smp_processor_id(); |
5234 | int err; | 5234 | int err; |
5235 | 5235 | ||
5236 | /* Start one for the boot CPU: */ | 5236 | /* Start one for the boot CPU: */ |
5237 | err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); | 5237 | err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); |
5238 | BUG_ON(err == NOTIFY_BAD); | 5238 | BUG_ON(err == NOTIFY_BAD); |
5239 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | 5239 | migration_call(&migration_notifier, CPU_ONLINE, cpu); |
5240 | register_cpu_notifier(&migration_notifier); | 5240 | register_cpu_notifier(&migration_notifier); |
5241 | 5241 | ||
5242 | return 0; | 5242 | return 0; |
5243 | } | 5243 | } |
5244 | #endif | 5244 | #endif |
5245 | 5245 | ||
5246 | #ifdef CONFIG_SMP | 5246 | #ifdef CONFIG_SMP |
5247 | |||
5248 | /* Number of possible processor ids */ | ||
5249 | int nr_cpu_ids __read_mostly = NR_CPUS; | ||
5250 | EXPORT_SYMBOL(nr_cpu_ids); | ||
5251 | |||
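A minimal sketch of the kind of early-boot user the commit message has in mind: because nr_cpu_ids now starts at NR_CPUS and is only narrowed later in sched_init(), code that runs before smp_init() can size a dynamically allocated per-cpu array without risking a too-small (previously 0-sized) allocation. The array and allocation site below are hypothetical, not part of this patch:

	/* hypothetical early-boot consumer; names are illustrative only */
	struct foo_stat *foo_stats;

	foo_stats = alloc_bootmem(nr_cpu_ids * sizeof(*foo_stats));
	/* at worst this over-allocates by (NR_CPUS - actual CPUs) entries,
	 * but it is never undersized */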
5247 | #undef SCHED_DOMAIN_DEBUG | 5252 | #undef SCHED_DOMAIN_DEBUG |
5248 | #ifdef SCHED_DOMAIN_DEBUG | 5253 | #ifdef SCHED_DOMAIN_DEBUG |
5249 | static void sched_domain_debug(struct sched_domain *sd, int cpu) | 5254 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
5250 | { | 5255 | { |
5251 | int level = 0; | 5256 | int level = 0; |
5252 | 5257 | ||
5253 | if (!sd) { | 5258 | if (!sd) { |
5254 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); | 5259 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); |
5255 | return; | 5260 | return; |
5256 | } | 5261 | } |
5257 | 5262 | ||
5258 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); | 5263 | printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); |
5259 | 5264 | ||
5260 | do { | 5265 | do { |
5261 | int i; | 5266 | int i; |
5262 | char str[NR_CPUS]; | 5267 | char str[NR_CPUS]; |
5263 | struct sched_group *group = sd->groups; | 5268 | struct sched_group *group = sd->groups; |
5264 | cpumask_t groupmask; | 5269 | cpumask_t groupmask; |
5265 | 5270 | ||
5266 | cpumask_scnprintf(str, NR_CPUS, sd->span); | 5271 | cpumask_scnprintf(str, NR_CPUS, sd->span); |
5267 | cpus_clear(groupmask); | 5272 | cpus_clear(groupmask); |
5268 | 5273 | ||
5269 | printk(KERN_DEBUG); | 5274 | printk(KERN_DEBUG); |
5270 | for (i = 0; i < level + 1; i++) | 5275 | for (i = 0; i < level + 1; i++) |
5271 | printk(" "); | 5276 | printk(" "); |
5272 | printk("domain %d: ", level); | 5277 | printk("domain %d: ", level); |
5273 | 5278 | ||
5274 | if (!(sd->flags & SD_LOAD_BALANCE)) { | 5279 | if (!(sd->flags & SD_LOAD_BALANCE)) { |
5275 | printk("does not load-balance\n"); | 5280 | printk("does not load-balance\n"); |
5276 | if (sd->parent) | 5281 | if (sd->parent) |
5277 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" | 5282 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" |
5278 | " has parent"); | 5283 | " has parent"); |
5279 | break; | 5284 | break; |
5280 | } | 5285 | } |
5281 | 5286 | ||
5282 | printk("span %s\n", str); | 5287 | printk("span %s\n", str); |
5283 | 5288 | ||
5284 | if (!cpu_isset(cpu, sd->span)) | 5289 | if (!cpu_isset(cpu, sd->span)) |
5285 | printk(KERN_ERR "ERROR: domain->span does not contain " | 5290 | printk(KERN_ERR "ERROR: domain->span does not contain " |
5286 | "CPU%d\n", cpu); | 5291 | "CPU%d\n", cpu); |
5287 | if (!cpu_isset(cpu, group->cpumask)) | 5292 | if (!cpu_isset(cpu, group->cpumask)) |
5288 | printk(KERN_ERR "ERROR: domain->groups does not contain" | 5293 | printk(KERN_ERR "ERROR: domain->groups does not contain" |
5289 | " CPU%d\n", cpu); | 5294 | " CPU%d\n", cpu); |
5290 | 5295 | ||
5291 | printk(KERN_DEBUG); | 5296 | printk(KERN_DEBUG); |
5292 | for (i = 0; i < level + 2; i++) | 5297 | for (i = 0; i < level + 2; i++) |
5293 | printk(" "); | 5298 | printk(" "); |
5294 | printk("groups:"); | 5299 | printk("groups:"); |
5295 | do { | 5300 | do { |
5296 | if (!group) { | 5301 | if (!group) { |
5297 | printk("\n"); | 5302 | printk("\n"); |
5298 | printk(KERN_ERR "ERROR: group is NULL\n"); | 5303 | printk(KERN_ERR "ERROR: group is NULL\n"); |
5299 | break; | 5304 | break; |
5300 | } | 5305 | } |
5301 | 5306 | ||
5302 | if (!group->cpu_power) { | 5307 | if (!group->cpu_power) { |
5303 | printk("\n"); | 5308 | printk("\n"); |
5304 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 5309 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
5305 | "set\n"); | 5310 | "set\n"); |
5306 | } | 5311 | } |
5307 | 5312 | ||
5308 | if (!cpus_weight(group->cpumask)) { | 5313 | if (!cpus_weight(group->cpumask)) { |
5309 | printk("\n"); | 5314 | printk("\n"); |
5310 | printk(KERN_ERR "ERROR: empty group\n"); | 5315 | printk(KERN_ERR "ERROR: empty group\n"); |
5311 | } | 5316 | } |
5312 | 5317 | ||
5313 | if (cpus_intersects(groupmask, group->cpumask)) { | 5318 | if (cpus_intersects(groupmask, group->cpumask)) { |
5314 | printk("\n"); | 5319 | printk("\n"); |
5315 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | 5320 | printk(KERN_ERR "ERROR: repeated CPUs\n"); |
5316 | } | 5321 | } |
5317 | 5322 | ||
5318 | cpus_or(groupmask, groupmask, group->cpumask); | 5323 | cpus_or(groupmask, groupmask, group->cpumask); |
5319 | 5324 | ||
5320 | cpumask_scnprintf(str, NR_CPUS, group->cpumask); | 5325 | cpumask_scnprintf(str, NR_CPUS, group->cpumask); |
5321 | printk(" %s", str); | 5326 | printk(" %s", str); |
5322 | 5327 | ||
5323 | group = group->next; | 5328 | group = group->next; |
5324 | } while (group != sd->groups); | 5329 | } while (group != sd->groups); |
5325 | printk("\n"); | 5330 | printk("\n"); |
5326 | 5331 | ||
5327 | if (!cpus_equal(sd->span, groupmask)) | 5332 | if (!cpus_equal(sd->span, groupmask)) |
5328 | printk(KERN_ERR "ERROR: groups don't span " | 5333 | printk(KERN_ERR "ERROR: groups don't span " |
5329 | "domain->span\n"); | 5334 | "domain->span\n"); |
5330 | 5335 | ||
5331 | level++; | 5336 | level++; |
5332 | sd = sd->parent; | 5337 | sd = sd->parent; |
5333 | if (!sd) | 5338 | if (!sd) |
5334 | continue; | 5339 | continue; |
5335 | 5340 | ||
5336 | if (!cpus_subset(groupmask, sd->span)) | 5341 | if (!cpus_subset(groupmask, sd->span)) |
5337 | printk(KERN_ERR "ERROR: parent span is not a superset " | 5342 | printk(KERN_ERR "ERROR: parent span is not a superset " |
5338 | "of domain->span\n"); | 5343 | "of domain->span\n"); |
5339 | 5344 | ||
5340 | } while (sd); | 5345 | } while (sd); |
5341 | } | 5346 | } |
5342 | #else | 5347 | #else |
5343 | # define sched_domain_debug(sd, cpu) do { } while (0) | 5348 | # define sched_domain_debug(sd, cpu) do { } while (0) |
5344 | #endif | 5349 | #endif |
5345 | 5350 | ||
5346 | static int sd_degenerate(struct sched_domain *sd) | 5351 | static int sd_degenerate(struct sched_domain *sd) |
5347 | { | 5352 | { |
5348 | if (cpus_weight(sd->span) == 1) | 5353 | if (cpus_weight(sd->span) == 1) |
5349 | return 1; | 5354 | return 1; |
5350 | 5355 | ||
5351 | /* Following flags need at least 2 groups */ | 5356 | /* Following flags need at least 2 groups */ |
5352 | if (sd->flags & (SD_LOAD_BALANCE | | 5357 | if (sd->flags & (SD_LOAD_BALANCE | |
5353 | SD_BALANCE_NEWIDLE | | 5358 | SD_BALANCE_NEWIDLE | |
5354 | SD_BALANCE_FORK | | 5359 | SD_BALANCE_FORK | |
5355 | SD_BALANCE_EXEC | | 5360 | SD_BALANCE_EXEC | |
5356 | SD_SHARE_CPUPOWER | | 5361 | SD_SHARE_CPUPOWER | |
5357 | SD_SHARE_PKG_RESOURCES)) { | 5362 | SD_SHARE_PKG_RESOURCES)) { |
5358 | if (sd->groups != sd->groups->next) | 5363 | if (sd->groups != sd->groups->next) |
5359 | return 0; | 5364 | return 0; |
5360 | } | 5365 | } |
5361 | 5366 | ||
5362 | /* Following flags don't use groups */ | 5367 | /* Following flags don't use groups */ |
5363 | if (sd->flags & (SD_WAKE_IDLE | | 5368 | if (sd->flags & (SD_WAKE_IDLE | |
5364 | SD_WAKE_AFFINE | | 5369 | SD_WAKE_AFFINE | |
5365 | SD_WAKE_BALANCE)) | 5370 | SD_WAKE_BALANCE)) |
5366 | return 0; | 5371 | return 0; |
5367 | 5372 | ||
5368 | return 1; | 5373 | return 1; |
5369 | } | 5374 | } |
5370 | 5375 | ||
5371 | static int | 5376 | static int |
5372 | sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | 5377 | sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) |
5373 | { | 5378 | { |
5374 | unsigned long cflags = sd->flags, pflags = parent->flags; | 5379 | unsigned long cflags = sd->flags, pflags = parent->flags; |
5375 | 5380 | ||
5376 | if (sd_degenerate(parent)) | 5381 | if (sd_degenerate(parent)) |
5377 | return 1; | 5382 | return 1; |
5378 | 5383 | ||
5379 | if (!cpus_equal(sd->span, parent->span)) | 5384 | if (!cpus_equal(sd->span, parent->span)) |
5380 | return 0; | 5385 | return 0; |
5381 | 5386 | ||
5382 | /* Does parent contain flags not in child? */ | 5387 | /* Does parent contain flags not in child? */ |
5383 | /* WAKE_BALANCE is a subset of WAKE_AFFINE */ | 5388 | /* WAKE_BALANCE is a subset of WAKE_AFFINE */ |
5384 | if (cflags & SD_WAKE_AFFINE) | 5389 | if (cflags & SD_WAKE_AFFINE) |
5385 | pflags &= ~SD_WAKE_BALANCE; | 5390 | pflags &= ~SD_WAKE_BALANCE; |
5386 | /* Flags needing groups don't count if only 1 group in parent */ | 5391 | /* Flags needing groups don't count if only 1 group in parent */ |
5387 | if (parent->groups == parent->groups->next) { | 5392 | if (parent->groups == parent->groups->next) { |
5388 | pflags &= ~(SD_LOAD_BALANCE | | 5393 | pflags &= ~(SD_LOAD_BALANCE | |
5389 | SD_BALANCE_NEWIDLE | | 5394 | SD_BALANCE_NEWIDLE | |
5390 | SD_BALANCE_FORK | | 5395 | SD_BALANCE_FORK | |
5391 | SD_BALANCE_EXEC | | 5396 | SD_BALANCE_EXEC | |
5392 | SD_SHARE_CPUPOWER | | 5397 | SD_SHARE_CPUPOWER | |
5393 | SD_SHARE_PKG_RESOURCES); | 5398 | SD_SHARE_PKG_RESOURCES); |
5394 | } | 5399 | } |
5395 | if (~cflags & pflags) | 5400 | if (~cflags & pflags) |
5396 | return 0; | 5401 | return 0; |
5397 | 5402 | ||
5398 | return 1; | 5403 | return 1; |
5399 | } | 5404 | } |
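A small worked example of the final flag test (flag values chosen purely for illustration, and assuming the spans match and the parent is not itself degenerate): if the child has cflags = SD_LOAD_BALANCE | SD_WAKE_AFFINE and the parent has pflags = SD_LOAD_BALANCE | SD_WAKE_BALANCE, the child's SD_WAKE_AFFINE bit first strips SD_WAKE_BALANCE from pflags; ~cflags & pflags is then 0, meaning the parent adds nothing the child does not already provide, so it is reported as degenerate.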
5400 | 5405 | ||
5401 | /* | 5406 | /* |
5402 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 5407 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
5403 | * hold the hotplug lock. | 5408 | * hold the hotplug lock. |
5404 | */ | 5409 | */ |
5405 | static void cpu_attach_domain(struct sched_domain *sd, int cpu) | 5410 | static void cpu_attach_domain(struct sched_domain *sd, int cpu) |
5406 | { | 5411 | { |
5407 | struct rq *rq = cpu_rq(cpu); | 5412 | struct rq *rq = cpu_rq(cpu); |
5408 | struct sched_domain *tmp; | 5413 | struct sched_domain *tmp; |
5409 | 5414 | ||
5410 | /* Remove the sched domains which do not contribute to scheduling. */ | 5415 | /* Remove the sched domains which do not contribute to scheduling. */ |
5411 | for (tmp = sd; tmp; tmp = tmp->parent) { | 5416 | for (tmp = sd; tmp; tmp = tmp->parent) { |
5412 | struct sched_domain *parent = tmp->parent; | 5417 | struct sched_domain *parent = tmp->parent; |
5413 | if (!parent) | 5418 | if (!parent) |
5414 | break; | 5419 | break; |
5415 | if (sd_parent_degenerate(tmp, parent)) { | 5420 | if (sd_parent_degenerate(tmp, parent)) { |
5416 | tmp->parent = parent->parent; | 5421 | tmp->parent = parent->parent; |
5417 | if (parent->parent) | 5422 | if (parent->parent) |
5418 | parent->parent->child = tmp; | 5423 | parent->parent->child = tmp; |
5419 | } | 5424 | } |
5420 | } | 5425 | } |
5421 | 5426 | ||
5422 | if (sd && sd_degenerate(sd)) { | 5427 | if (sd && sd_degenerate(sd)) { |
5423 | sd = sd->parent; | 5428 | sd = sd->parent; |
5424 | if (sd) | 5429 | if (sd) |
5425 | sd->child = NULL; | 5430 | sd->child = NULL; |
5426 | } | 5431 | } |
5427 | 5432 | ||
5428 | sched_domain_debug(sd, cpu); | 5433 | sched_domain_debug(sd, cpu); |
5429 | 5434 | ||
5430 | rcu_assign_pointer(rq->sd, sd); | 5435 | rcu_assign_pointer(rq->sd, sd); |
5431 | } | 5436 | } |
5432 | 5437 | ||
5433 | /* cpus with isolated domains */ | 5438 | /* cpus with isolated domains */ |
5434 | static cpumask_t cpu_isolated_map = CPU_MASK_NONE; | 5439 | static cpumask_t cpu_isolated_map = CPU_MASK_NONE; |
5435 | 5440 | ||
5436 | /* Setup the mask of cpus configured for isolated domains */ | 5441 | /* Setup the mask of cpus configured for isolated domains */ |
5437 | static int __init isolated_cpu_setup(char *str) | 5442 | static int __init isolated_cpu_setup(char *str) |
5438 | { | 5443 | { |
5439 | int ints[NR_CPUS], i; | 5444 | int ints[NR_CPUS], i; |
5440 | 5445 | ||
5441 | str = get_options(str, ARRAY_SIZE(ints), ints); | 5446 | str = get_options(str, ARRAY_SIZE(ints), ints); |
5442 | cpus_clear(cpu_isolated_map); | 5447 | cpus_clear(cpu_isolated_map); |
5443 | for (i = 1; i <= ints[0]; i++) | 5448 | for (i = 1; i <= ints[0]; i++) |
5444 | if (ints[i] < NR_CPUS) | 5449 | if (ints[i] < NR_CPUS) |
5445 | cpu_set(ints[i], cpu_isolated_map); | 5450 | cpu_set(ints[i], cpu_isolated_map); |
5446 | return 1; | 5451 | return 1; |
5447 | } | 5452 | } |
5448 | 5453 | ||
5449 | __setup ("isolcpus=", isolated_cpu_setup); | 5454 | __setup ("isolcpus=", isolated_cpu_setup); |
5450 | 5455 | ||
5451 | /* | 5456 | /* |
5452 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer | 5457 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer |
5453 | * to a function which identifies what group (along with sched group) a CPU | 5458 | * to a function which identifies what group (along with sched group) a CPU |
5454 | * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS | 5459 | * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS |
5455 | * (due to the fact that we keep track of groups covered with a cpumask_t). | 5460 | * (due to the fact that we keep track of groups covered with a cpumask_t). |
5456 | * | 5461 | * |
5457 | * init_sched_build_groups will build a circular linked list of the groups | 5462 | * init_sched_build_groups will build a circular linked list of the groups |
5458 | * covered by the given span, and will set each group's ->cpumask correctly, | 5463 | * covered by the given span, and will set each group's ->cpumask correctly, |
5459 | * and ->cpu_power to 0. | 5464 | * and ->cpu_power to 0. |
5460 | */ | 5465 | */ |
5461 | static void | 5466 | static void |
5462 | init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, | 5467 | init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, |
5463 | int (*group_fn)(int cpu, const cpumask_t *cpu_map, | 5468 | int (*group_fn)(int cpu, const cpumask_t *cpu_map, |
5464 | struct sched_group **sg)) | 5469 | struct sched_group **sg)) |
5465 | { | 5470 | { |
5466 | struct sched_group *first = NULL, *last = NULL; | 5471 | struct sched_group *first = NULL, *last = NULL; |
5467 | cpumask_t covered = CPU_MASK_NONE; | 5472 | cpumask_t covered = CPU_MASK_NONE; |
5468 | int i; | 5473 | int i; |
5469 | 5474 | ||
5470 | for_each_cpu_mask(i, span) { | 5475 | for_each_cpu_mask(i, span) { |
5471 | struct sched_group *sg; | 5476 | struct sched_group *sg; |
5472 | int group = group_fn(i, cpu_map, &sg); | 5477 | int group = group_fn(i, cpu_map, &sg); |
5473 | int j; | 5478 | int j; |
5474 | 5479 | ||
5475 | if (cpu_isset(i, covered)) | 5480 | if (cpu_isset(i, covered)) |
5476 | continue; | 5481 | continue; |
5477 | 5482 | ||
5478 | sg->cpumask = CPU_MASK_NONE; | 5483 | sg->cpumask = CPU_MASK_NONE; |
5479 | sg->cpu_power = 0; | 5484 | sg->cpu_power = 0; |
5480 | 5485 | ||
5481 | for_each_cpu_mask(j, span) { | 5486 | for_each_cpu_mask(j, span) { |
5482 | if (group_fn(j, cpu_map, NULL) != group) | 5487 | if (group_fn(j, cpu_map, NULL) != group) |
5483 | continue; | 5488 | continue; |
5484 | 5489 | ||
5485 | cpu_set(j, covered); | 5490 | cpu_set(j, covered); |
5486 | cpu_set(j, sg->cpumask); | 5491 | cpu_set(j, sg->cpumask); |
5487 | } | 5492 | } |
5488 | if (!first) | 5493 | if (!first) |
5489 | first = sg; | 5494 | first = sg; |
5490 | if (last) | 5495 | if (last) |
5491 | last->next = sg; | 5496 | last->next = sg; |
5492 | last = sg; | 5497 | last = sg; |
5493 | } | 5498 | } |
5494 | last->next = first; | 5499 | last->next = first; |
5495 | } | 5500 | } |
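A minimal sketch of the group_fn contract described in the comment above (the identity mapping used later by the SMT case is the simplest real instance); the per-cpu group array named here is illustrative, not an existing symbol:

	static DEFINE_PER_CPU(struct sched_group, example_groups);	/* hypothetical */

	static int example_group_fn(int cpu, const cpumask_t *cpu_map,
				    struct sched_group **sg)
	{
		if (sg)
			*sg = &per_cpu(example_groups, cpu);
		return cpu;		/* group id: >= 0 and < NR_CPUS */
	}

init_sched_build_groups(span, cpu_map, example_group_fn) would then create one single-CPU group per CPU in span and link them into a circular list.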
5496 | 5501 | ||
5497 | #define SD_NODES_PER_DOMAIN 16 | 5502 | #define SD_NODES_PER_DOMAIN 16 |
5498 | 5503 | ||
5499 | /* | 5504 | /* |
5500 | * Self-tuning task migration cost measurement between source and target CPUs. | 5505 | * Self-tuning task migration cost measurement between source and target CPUs. |
5501 | * | 5506 | * |
5502 | * This is done by measuring the cost of manipulating buffers of varying | 5507 | * This is done by measuring the cost of manipulating buffers of varying |
5503 | * sizes. For a given buffer-size here are the steps that are taken: | 5508 | * sizes. For a given buffer-size here are the steps that are taken: |
5504 | * | 5509 | * |
5505 | * 1) the source CPU reads+dirties a shared buffer | 5510 | * 1) the source CPU reads+dirties a shared buffer |
5506 | * 2) the target CPU reads+dirties the same shared buffer | 5511 | * 2) the target CPU reads+dirties the same shared buffer |
5507 | * | 5512 | * |
5508 | * We measure how long they take, in the following 4 scenarios: | 5513 | * We measure how long they take, in the following 4 scenarios: |
5509 | * | 5514 | * |
5510 | * - source: CPU1, target: CPU2 | cost1 | 5515 | * - source: CPU1, target: CPU2 | cost1 |
5511 | * - source: CPU2, target: CPU1 | cost2 | 5516 | * - source: CPU2, target: CPU1 | cost2 |
5512 | * - source: CPU1, target: CPU1 | cost3 | 5517 | * - source: CPU1, target: CPU1 | cost3 |
5513 | * - source: CPU2, target: CPU2 | cost4 | 5518 | * - source: CPU2, target: CPU2 | cost4 |
5514 | * | 5519 | * |
5515 | * We then calculate the cost1+cost2-cost3-cost4 difference - this is | 5520 | * We then calculate the cost1+cost2-cost3-cost4 difference - this is |
5516 | * the cost of migration. | 5521 | * the cost of migration. |
5517 | * | 5522 | * |
5518 | * We then start off from a small buffer-size and iterate up to larger | 5523 | * We then start off from a small buffer-size and iterate up to larger |
5519 | * buffer sizes, in 5% steps - measuring each buffer-size separately, and | 5524 | * buffer sizes, in 5% steps - measuring each buffer-size separately, and |
5520 | * doing a maximum search for the cost. (The maximum cost for a migration | 5525 | * doing a maximum search for the cost. (The maximum cost for a migration |
5521 | * normally occurs when the working set size is around the effective cache | 5526 | * normally occurs when the working set size is around the effective cache |
5522 | * size.) | 5527 | * size.) |
5523 | */ | 5528 | */ |
5524 | #define SEARCH_SCOPE 2 | 5529 | #define SEARCH_SCOPE 2 |
5525 | #define MIN_CACHE_SIZE (64*1024U) | 5530 | #define MIN_CACHE_SIZE (64*1024U) |
5526 | #define DEFAULT_CACHE_SIZE (5*1024*1024U) | 5531 | #define DEFAULT_CACHE_SIZE (5*1024*1024U) |
5527 | #define ITERATIONS 1 | 5532 | #define ITERATIONS 1 |
5528 | #define SIZE_THRESH 130 | 5533 | #define SIZE_THRESH 130 |
5529 | #define COST_THRESH 130 | 5534 | #define COST_THRESH 130 |
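Written out as it is implemented in measure_cost() further below (one iteration, plus a dry run whose result is discarded), the quantity maximized over buffer sizes is roughly

	migration cost(size) ~= ((t(1->2) + t(2->1)) - (t(1->1) + t(2->2))) / 2

i.e. the cross-CPU cost minus the same-CPU cost, halved to give a per-migration figure.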
5530 | 5535 | ||
5531 | /* | 5536 | /* |
5532 | * The migration cost is a function of 'domain distance'. Domain | 5537 | * The migration cost is a function of 'domain distance'. Domain |
5533 | * distance is the number of steps a CPU has to iterate down its | 5538 | * distance is the number of steps a CPU has to iterate down its |
5534 | * domain tree to share a domain with the other CPU. The farther | 5539 | * domain tree to share a domain with the other CPU. The farther |
5535 | * two CPUs are from each other, the larger the distance gets. | 5540 | * two CPUs are from each other, the larger the distance gets. |
5536 | * | 5541 | * |
5537 | * Note that we use the distance only to cache measurement results, | 5542 | * Note that we use the distance only to cache measurement results, |
5538 | * the distance value is not used numerically otherwise. When two | 5543 | * the distance value is not used numerically otherwise. When two |
5539 | * CPUs have the same distance it is assumed that the migration | 5544 | * CPUs have the same distance it is assumed that the migration |
5540 | * cost is the same. (this is a simplification but quite practical) | 5545 | * cost is the same. (this is a simplification but quite practical) |
5541 | */ | 5546 | */ |
5542 | #define MAX_DOMAIN_DISTANCE 32 | 5547 | #define MAX_DOMAIN_DISTANCE 32 |
5543 | 5548 | ||
5544 | static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = | 5549 | static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = |
5545 | { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = | 5550 | { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = |
5546 | /* | 5551 | /* |
5547 | * Architectures may override the migration cost and thus avoid | 5552 | * Architectures may override the migration cost and thus avoid |
5548 | * boot-time calibration. Unit is nanoseconds. Mostly useful for | 5553 | * boot-time calibration. Unit is nanoseconds. Mostly useful for |
5549 | * virtualized hardware: | 5554 | * virtualized hardware: |
5550 | */ | 5555 | */ |
5551 | #ifdef CONFIG_DEFAULT_MIGRATION_COST | 5556 | #ifdef CONFIG_DEFAULT_MIGRATION_COST |
5552 | CONFIG_DEFAULT_MIGRATION_COST | 5557 | CONFIG_DEFAULT_MIGRATION_COST |
5553 | #else | 5558 | #else |
5554 | -1LL | 5559 | -1LL |
5555 | #endif | 5560 | #endif |
5556 | }; | 5561 | }; |
5557 | 5562 | ||
5558 | /* | 5563 | /* |
5559 | * Allow override of migration cost - in units of microseconds. | 5564 | * Allow override of migration cost - in units of microseconds. |
5560 | * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost | 5565 | * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost |
5561 | * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs: | 5566 | * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs: |
5562 | */ | 5567 | */ |
5563 | static int __init migration_cost_setup(char *str) | 5568 | static int __init migration_cost_setup(char *str) |
5564 | { | 5569 | { |
5565 | int ints[MAX_DOMAIN_DISTANCE+1], i; | 5570 | int ints[MAX_DOMAIN_DISTANCE+1], i; |
5566 | 5571 | ||
5567 | str = get_options(str, ARRAY_SIZE(ints), ints); | 5572 | str = get_options(str, ARRAY_SIZE(ints), ints); |
5568 | 5573 | ||
5569 | printk("#ints: %d\n", ints[0]); | 5574 | printk("#ints: %d\n", ints[0]); |
5570 | for (i = 1; i <= ints[0]; i++) { | 5575 | for (i = 1; i <= ints[0]; i++) { |
5571 | migration_cost[i-1] = (unsigned long long)ints[i]*1000; | 5576 | migration_cost[i-1] = (unsigned long long)ints[i]*1000; |
5572 | printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]); | 5577 | printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]); |
5573 | } | 5578 | } |
5574 | return 1; | 5579 | return 1; |
5575 | } | 5580 | } |
5576 | 5581 | ||
5577 | __setup ("migration_cost=", migration_cost_setup); | 5582 | __setup ("migration_cost=", migration_cost_setup); |
5578 | 5583 | ||
5579 | /* | 5584 | /* |
5580 | * Global multiplier (divisor) for migration-cutoff values, | 5585 | * Global multiplier (divisor) for migration-cutoff values, |
5581 | * in percentiles. E.g. use a value of 150 to get 1.5 times | 5586 | * in percentiles. E.g. use a value of 150 to get 1.5 times |
5582 | * longer cache-hot cutoff times. | 5587 | * longer cache-hot cutoff times. |
5583 | * | 5588 | * |
5584 | * (We scale it from 100 to 128 to make long long handling easier.) | 5589 | * (We scale it from 100 to 128 to make long long handling easier.) |
5585 | */ | 5590 | */ |
5586 | 5591 | ||
5587 | #define MIGRATION_FACTOR_SCALE 128 | 5592 | #define MIGRATION_FACTOR_SCALE 128 |
5588 | 5593 | ||
5589 | static unsigned int migration_factor = MIGRATION_FACTOR_SCALE; | 5594 | static unsigned int migration_factor = MIGRATION_FACTOR_SCALE; |
5590 | 5595 | ||
5591 | static int __init setup_migration_factor(char *str) | 5596 | static int __init setup_migration_factor(char *str) |
5592 | { | 5597 | { |
5593 | get_option(&str, &migration_factor); | 5598 | get_option(&str, &migration_factor); |
5594 | migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100; | 5599 | migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100; |
5595 | return 1; | 5600 | return 1; |
5596 | } | 5601 | } |
5597 | 5602 | ||
5598 | __setup("migration_factor=", setup_migration_factor); | 5603 | __setup("migration_factor=", setup_migration_factor); |
5599 | 5604 | ||
5600 | /* | 5605 | /* |
5601 | * Estimated distance of two CPUs, measured via the number of domains | 5606 | * Estimated distance of two CPUs, measured via the number of domains |
5602 | * we have to pass for the two CPUs to be in the same span: | 5607 | * we have to pass for the two CPUs to be in the same span: |
5603 | */ | 5608 | */ |
5604 | static unsigned long domain_distance(int cpu1, int cpu2) | 5609 | static unsigned long domain_distance(int cpu1, int cpu2) |
5605 | { | 5610 | { |
5606 | unsigned long distance = 0; | 5611 | unsigned long distance = 0; |
5607 | struct sched_domain *sd; | 5612 | struct sched_domain *sd; |
5608 | 5613 | ||
5609 | for_each_domain(cpu1, sd) { | 5614 | for_each_domain(cpu1, sd) { |
5610 | WARN_ON(!cpu_isset(cpu1, sd->span)); | 5615 | WARN_ON(!cpu_isset(cpu1, sd->span)); |
5611 | if (cpu_isset(cpu2, sd->span)) | 5616 | if (cpu_isset(cpu2, sd->span)) |
5612 | return distance; | 5617 | return distance; |
5613 | distance++; | 5618 | distance++; |
5614 | } | 5619 | } |
5615 | if (distance >= MAX_DOMAIN_DISTANCE) { | 5620 | if (distance >= MAX_DOMAIN_DISTANCE) { |
5616 | WARN_ON(1); | 5621 | WARN_ON(1); |
5617 | distance = MAX_DOMAIN_DISTANCE-1; | 5622 | distance = MAX_DOMAIN_DISTANCE-1; |
5618 | } | 5623 | } |
5619 | 5624 | ||
5620 | return distance; | 5625 | return distance; |
5621 | } | 5626 | } |
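For example, assuming a hypothetical hierarchy of SMT-sibling, per-package and all-CPU domains: two siblings already share cpu1's lowest span and get distance 0, two cores in the same package get distance 1, and cores in different packages get distance 2; each distinct distance indexes its own slot in migration_cost[], and anything deeper than MAX_DOMAIN_DISTANCE-1 is clamped (with a warning).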
5622 | 5627 | ||
5623 | static unsigned int migration_debug; | 5628 | static unsigned int migration_debug; |
5624 | 5629 | ||
5625 | static int __init setup_migration_debug(char *str) | 5630 | static int __init setup_migration_debug(char *str) |
5626 | { | 5631 | { |
5627 | get_option(&str, &migration_debug); | 5632 | get_option(&str, &migration_debug); |
5628 | return 1; | 5633 | return 1; |
5629 | } | 5634 | } |
5630 | 5635 | ||
5631 | __setup("migration_debug=", setup_migration_debug); | 5636 | __setup("migration_debug=", setup_migration_debug); |
5632 | 5637 | ||
5633 | /* | 5638 | /* |
5634 | * Maximum cache-size that the scheduler should try to measure. | 5639 | * Maximum cache-size that the scheduler should try to measure. |
5635 | * Architectures with larger caches should tune this up during | 5640 | * Architectures with larger caches should tune this up during |
5636 | * bootup. Gets used in the domain-setup code (i.e. during SMP | 5641 | * bootup. Gets used in the domain-setup code (i.e. during SMP |
5637 | * bootup). | 5642 | * bootup). |
5638 | */ | 5643 | */ |
5639 | unsigned int max_cache_size; | 5644 | unsigned int max_cache_size; |
5640 | 5645 | ||
5641 | static int __init setup_max_cache_size(char *str) | 5646 | static int __init setup_max_cache_size(char *str) |
5642 | { | 5647 | { |
5643 | get_option(&str, &max_cache_size); | 5648 | get_option(&str, &max_cache_size); |
5644 | return 1; | 5649 | return 1; |
5645 | } | 5650 | } |
5646 | 5651 | ||
5647 | __setup("max_cache_size=", setup_max_cache_size); | 5652 | __setup("max_cache_size=", setup_max_cache_size); |
5648 | 5653 | ||
5649 | /* | 5654 | /* |
5650 | * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This | 5655 | * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This |
5651 | * is the operation that is timed, so we try to generate unpredictable | 5656 | * is the operation that is timed, so we try to generate unpredictable |
5652 | * cachemisses that still end up filling the L2 cache: | 5657 | * cachemisses that still end up filling the L2 cache: |
5653 | */ | 5658 | */ |
5654 | static void touch_cache(void *__cache, unsigned long __size) | 5659 | static void touch_cache(void *__cache, unsigned long __size) |
5655 | { | 5660 | { |
5656 | unsigned long size = __size / sizeof(long); | 5661 | unsigned long size = __size / sizeof(long); |
5657 | unsigned long chunk1 = size / 3; | 5662 | unsigned long chunk1 = size / 3; |
5658 | unsigned long chunk2 = 2 * size / 3; | 5663 | unsigned long chunk2 = 2 * size / 3; |
5659 | unsigned long *cache = __cache; | 5664 | unsigned long *cache = __cache; |
5660 | int i; | 5665 | int i; |
5661 | 5666 | ||
5662 | for (i = 0; i < size/6; i += 8) { | 5667 | for (i = 0; i < size/6; i += 8) { |
5663 | switch (i % 6) { | 5668 | switch (i % 6) { |
5664 | case 0: cache[i]++; | 5669 | case 0: cache[i]++; |
5665 | case 1: cache[size-1-i]++; | 5670 | case 1: cache[size-1-i]++; |
5666 | case 2: cache[chunk1-i]++; | 5671 | case 2: cache[chunk1-i]++; |
5667 | case 3: cache[chunk1+i]++; | 5672 | case 3: cache[chunk1+i]++; |
5668 | case 4: cache[chunk2-i]++; | 5673 | case 4: cache[chunk2-i]++; |
5669 | case 5: cache[chunk2+i]++; | 5674 | case 5: cache[chunk2+i]++; |
5670 | } | 5675 | } |
5671 | } | 5676 | } |
5672 | } | 5677 | } |
5673 | 5678 | ||
5674 | /* | 5679 | /* |
5675 | * Measure the cache-cost of one task migration. Returns in units of nsec. | 5680 | * Measure the cache-cost of one task migration. Returns in units of nsec. |
5676 | */ | 5681 | */ |
5677 | static unsigned long long | 5682 | static unsigned long long |
5678 | measure_one(void *cache, unsigned long size, int source, int target) | 5683 | measure_one(void *cache, unsigned long size, int source, int target) |
5679 | { | 5684 | { |
5680 | cpumask_t mask, saved_mask; | 5685 | cpumask_t mask, saved_mask; |
5681 | unsigned long long t0, t1, t2, t3, cost; | 5686 | unsigned long long t0, t1, t2, t3, cost; |
5682 | 5687 | ||
5683 | saved_mask = current->cpus_allowed; | 5688 | saved_mask = current->cpus_allowed; |
5684 | 5689 | ||
5685 | /* | 5690 | /* |
5686 | * Flush source caches to RAM and invalidate them: | 5691 | * Flush source caches to RAM and invalidate them: |
5687 | */ | 5692 | */ |
5688 | sched_cacheflush(); | 5693 | sched_cacheflush(); |
5689 | 5694 | ||
5690 | /* | 5695 | /* |
5691 | * Migrate to the source CPU: | 5696 | * Migrate to the source CPU: |
5692 | */ | 5697 | */ |
5693 | mask = cpumask_of_cpu(source); | 5698 | mask = cpumask_of_cpu(source); |
5694 | set_cpus_allowed(current, mask); | 5699 | set_cpus_allowed(current, mask); |
5695 | WARN_ON(smp_processor_id() != source); | 5700 | WARN_ON(smp_processor_id() != source); |
5696 | 5701 | ||
5697 | /* | 5702 | /* |
5698 | * Dirty the working set: | 5703 | * Dirty the working set: |
5699 | */ | 5704 | */ |
5700 | t0 = sched_clock(); | 5705 | t0 = sched_clock(); |
5701 | touch_cache(cache, size); | 5706 | touch_cache(cache, size); |
5702 | t1 = sched_clock(); | 5707 | t1 = sched_clock(); |
5703 | 5708 | ||
5704 | /* | 5709 | /* |
5705 | * Migrate to the target CPU, dirty the L2 cache and access | 5710 | * Migrate to the target CPU, dirty the L2 cache and access |
5706 | * the shared buffer. (which represents the working set | 5711 | * the shared buffer. (which represents the working set |
5707 | * of a migrated task.) | 5712 | * of a migrated task.) |
5708 | */ | 5713 | */ |
5709 | mask = cpumask_of_cpu(target); | 5714 | mask = cpumask_of_cpu(target); |
5710 | set_cpus_allowed(current, mask); | 5715 | set_cpus_allowed(current, mask); |
5711 | WARN_ON(smp_processor_id() != target); | 5716 | WARN_ON(smp_processor_id() != target); |
5712 | 5717 | ||
5713 | t2 = sched_clock(); | 5718 | t2 = sched_clock(); |
5714 | touch_cache(cache, size); | 5719 | touch_cache(cache, size); |
5715 | t3 = sched_clock(); | 5720 | t3 = sched_clock(); |
5716 | 5721 | ||
5717 | cost = t1-t0 + t3-t2; | 5722 | cost = t1-t0 + t3-t2; |
5718 | 5723 | ||
5719 | if (migration_debug >= 2) | 5724 | if (migration_debug >= 2) |
5720 | printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n", | 5725 | printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n", |
5721 | source, target, t1-t0, t1-t0, t3-t2, cost); | 5726 | source, target, t1-t0, t1-t0, t3-t2, cost); |
5722 | /* | 5727 | /* |
5723 | * Flush target caches to RAM and invalidate them: | 5728 | * Flush target caches to RAM and invalidate them: |
5724 | */ | 5729 | */ |
5725 | sched_cacheflush(); | 5730 | sched_cacheflush(); |
5726 | 5731 | ||
5727 | set_cpus_allowed(current, saved_mask); | 5732 | set_cpus_allowed(current, saved_mask); |
5728 | 5733 | ||
5729 | return cost; | 5734 | return cost; |
5730 | } | 5735 | } |
5731 | 5736 | ||
5732 | /* | 5737 | /* |
5733 | * Measure a series of task migrations and return the average | 5738 | * Measure a series of task migrations and return the average |
5734 | * result. Since this code runs early during bootup the system | 5739 | * result. Since this code runs early during bootup the system |
5735 | * is 'undisturbed' and the average latency makes sense. | 5740 | * is 'undisturbed' and the average latency makes sense. |
5736 | * | 5741 | * |
5737 | * The algorithm in essence auto-detects the relevant cache-size, | 5742 | * The algorithm in essence auto-detects the relevant cache-size, |
5738 | * so it will properly detect different cachesizes for different | 5743 | * so it will properly detect different cachesizes for different |
5739 | * cache-hierarchies, depending on how the CPUs are connected. | 5744 | * cache-hierarchies, depending on how the CPUs are connected. |
5740 | * | 5745 | * |
5741 | * Architectures can prime the upper limit of the search range via | 5746 | * Architectures can prime the upper limit of the search range via |
5742 | * max_cache_size, otherwise the search range defaults to 20MB...64K. | 5747 | * max_cache_size, otherwise the search range defaults to 20MB...64K. |
5743 | */ | 5748 | */ |
5744 | static unsigned long long | 5749 | static unsigned long long |
5745 | measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) | 5750 | measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) |
5746 | { | 5751 | { |
5747 | unsigned long long cost1, cost2; | 5752 | unsigned long long cost1, cost2; |
5748 | int i; | 5753 | int i; |
5749 | 5754 | ||
5750 | /* | 5755 | /* |
5751 | * Measure the migration cost of 'size' bytes, over an | 5756 | * Measure the migration cost of 'size' bytes, over an |
5752 | * average of 10 runs: | 5757 | * average of 10 runs: |
5753 | * | 5758 | * |
5754 | * (We perturb the cache size by a small (0..4k) | 5759 | * (We perturb the cache size by a small (0..4k) |
5755 | * value to compensate for size/alignment related artifacts. | 5760 | * value to compensate for size/alignment related artifacts. |
5756 | * We also subtract the cost of the operation done on | 5761 | * We also subtract the cost of the operation done on |
5757 | * the same CPU.) | 5762 | * the same CPU.) |
5758 | */ | 5763 | */ |
5759 | cost1 = 0; | 5764 | cost1 = 0; |
5760 | 5765 | ||
5761 | /* | 5766 | /* |
5762 | * dry run, to make sure we start off cache-cold on cpu1, | 5767 | * dry run, to make sure we start off cache-cold on cpu1, |
5763 | * and to get any vmalloc pagefaults in advance: | 5768 | * and to get any vmalloc pagefaults in advance: |
5764 | */ | 5769 | */ |
5765 | measure_one(cache, size, cpu1, cpu2); | 5770 | measure_one(cache, size, cpu1, cpu2); |
5766 | for (i = 0; i < ITERATIONS; i++) | 5771 | for (i = 0; i < ITERATIONS; i++) |
5767 | cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2); | 5772 | cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2); |
5768 | 5773 | ||
5769 | measure_one(cache, size, cpu2, cpu1); | 5774 | measure_one(cache, size, cpu2, cpu1); |
5770 | for (i = 0; i < ITERATIONS; i++) | 5775 | for (i = 0; i < ITERATIONS; i++) |
5771 | cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1); | 5776 | cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1); |
5772 | 5777 | ||
5773 | /* | 5778 | /* |
5774 | * (We measure the non-migrating [cached] cost on both | 5779 | * (We measure the non-migrating [cached] cost on both |
5775 | * cpu1 and cpu2, to handle CPUs with different speeds) | 5780 | * cpu1 and cpu2, to handle CPUs with different speeds) |
5776 | */ | 5781 | */ |
5777 | cost2 = 0; | 5782 | cost2 = 0; |
5778 | 5783 | ||
5779 | measure_one(cache, size, cpu1, cpu1); | 5784 | measure_one(cache, size, cpu1, cpu1); |
5780 | for (i = 0; i < ITERATIONS; i++) | 5785 | for (i = 0; i < ITERATIONS; i++) |
5781 | cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1); | 5786 | cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1); |
5782 | 5787 | ||
5783 | measure_one(cache, size, cpu2, cpu2); | 5788 | measure_one(cache, size, cpu2, cpu2); |
5784 | for (i = 0; i < ITERATIONS; i++) | 5789 | for (i = 0; i < ITERATIONS; i++) |
5785 | cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2); | 5790 | cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2); |
5786 | 5791 | ||
5787 | /* | 5792 | /* |
5788 | * Get the per-iteration migration cost: | 5793 | * Get the per-iteration migration cost: |
5789 | */ | 5794 | */ |
5790 | do_div(cost1, 2 * ITERATIONS); | 5795 | do_div(cost1, 2 * ITERATIONS); |
5791 | do_div(cost2, 2 * ITERATIONS); | 5796 | do_div(cost2, 2 * ITERATIONS); |
5792 | 5797 | ||
5793 | return cost1 - cost2; | 5798 | return cost1 - cost2; |
5794 | } | 5799 | } |
5795 | 5800 | ||
5796 | static unsigned long long measure_migration_cost(int cpu1, int cpu2) | 5801 | static unsigned long long measure_migration_cost(int cpu1, int cpu2) |
5797 | { | 5802 | { |
5798 | unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0; | 5803 | unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0; |
5799 | unsigned int max_size, size, size_found = 0; | 5804 | unsigned int max_size, size, size_found = 0; |
5800 | long long cost = 0, prev_cost; | 5805 | long long cost = 0, prev_cost; |
5801 | void *cache; | 5806 | void *cache; |
5802 | 5807 | ||
5803 | /* | 5808 | /* |
5804 | * Search from max_cache_size*5 down to 64K - the real relevant | 5809 | * Search from max_cache_size*5 down to 64K - the real relevant |
5805 | * cachesize has to lie somewhere inbetween. | 5810 | * cachesize has to lie somewhere inbetween. |
5806 | */ | 5811 | */ |
5807 | if (max_cache_size) { | 5812 | if (max_cache_size) { |
5808 | max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE); | 5813 | max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE); |
5809 | size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE); | 5814 | size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE); |
5810 | } else { | 5815 | } else { |
5811 | /* | 5816 | /* |
5812 | * Since we have no estimation about the relevant | 5817 | * Since we have no estimation about the relevant |
5813 | * search range | 5818 | * search range |
5814 | */ | 5819 | */ |
5815 | max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE; | 5820 | max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE; |
5816 | size = MIN_CACHE_SIZE; | 5821 | size = MIN_CACHE_SIZE; |
5817 | } | 5822 | } |
5818 | 5823 | ||
5819 | if (!cpu_online(cpu1) || !cpu_online(cpu2)) { | 5824 | if (!cpu_online(cpu1) || !cpu_online(cpu2)) { |
5820 | printk("cpu %d and %d not both online!\n", cpu1, cpu2); | 5825 | printk("cpu %d and %d not both online!\n", cpu1, cpu2); |
5821 | return 0; | 5826 | return 0; |
5822 | } | 5827 | } |
5823 | 5828 | ||
5824 | /* | 5829 | /* |
5825 | * Allocate the working set: | 5830 | * Allocate the working set: |
5826 | */ | 5831 | */ |
5827 | cache = vmalloc(max_size); | 5832 | cache = vmalloc(max_size); |
5828 | if (!cache) { | 5833 | if (!cache) { |
5829 | printk("could not vmalloc %d bytes for cache!\n", 2 * max_size); | 5834 | printk("could not vmalloc %d bytes for cache!\n", 2 * max_size); |
5830 | return 1000000; /* return 1 msec on very small boxen */ | 5835 | return 1000000; /* return 1 msec on very small boxen */ |
5831 | } | 5836 | } |
5832 | 5837 | ||
5833 | while (size <= max_size) { | 5838 | while (size <= max_size) { |
5834 | prev_cost = cost; | 5839 | prev_cost = cost; |
5835 | cost = measure_cost(cpu1, cpu2, cache, size); | 5840 | cost = measure_cost(cpu1, cpu2, cache, size); |
5836 | 5841 | ||
5837 | /* | 5842 | /* |
5838 | * Update the max: | 5843 | * Update the max: |
5839 | */ | 5844 | */ |
5840 | if (cost > 0) { | 5845 | if (cost > 0) { |
5841 | if (max_cost < cost) { | 5846 | if (max_cost < cost) { |
5842 | max_cost = cost; | 5847 | max_cost = cost; |
5843 | size_found = size; | 5848 | size_found = size; |
5844 | } | 5849 | } |
5845 | } | 5850 | } |
5846 | /* | 5851 | /* |
5847 | * Calculate average fluctuation, we use this to prevent | 5852 | * Calculate average fluctuation, we use this to prevent |
5848 | * noise from triggering an early break out of the loop: | 5853 | * noise from triggering an early break out of the loop: |
5849 | */ | 5854 | */ |
5850 | fluct = abs(cost - prev_cost); | 5855 | fluct = abs(cost - prev_cost); |
5851 | avg_fluct = (avg_fluct + fluct)/2; | 5856 | avg_fluct = (avg_fluct + fluct)/2; |
5852 | 5857 | ||
5853 | if (migration_debug) | 5858 | if (migration_debug) |
5854 | printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): " | 5859 | printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): " |
5855 | "(%8Ld %8Ld)\n", | 5860 | "(%8Ld %8Ld)\n", |
5856 | cpu1, cpu2, size, | 5861 | cpu1, cpu2, size, |
5857 | (long)cost / 1000000, | 5862 | (long)cost / 1000000, |
5858 | ((long)cost / 100000) % 10, | 5863 | ((long)cost / 100000) % 10, |
5859 | (long)max_cost / 1000000, | 5864 | (long)max_cost / 1000000, |
5860 | ((long)max_cost / 100000) % 10, | 5865 | ((long)max_cost / 100000) % 10, |
5861 | domain_distance(cpu1, cpu2), | 5866 | domain_distance(cpu1, cpu2), |
5862 | cost, avg_fluct); | 5867 | cost, avg_fluct); |
5863 | 5868 | ||
5864 | /* | 5869 | /* |
5865 | * If we iterated at least 20% past the previous maximum, | 5870 | * If we iterated at least 20% past the previous maximum, |
5866 | * and the cost has dropped by more than 20% already, | 5871 | * and the cost has dropped by more than 20% already, |
5867 | * (taking fluctuations into account) then we assume to | 5872 | * (taking fluctuations into account) then we assume to |
5868 | * have found the maximum and break out of the loop early: | 5873 | * have found the maximum and break out of the loop early: |
5869 | */ | 5874 | */ |
5870 | if (size_found && (size*100 > size_found*SIZE_THRESH)) | 5875 | if (size_found && (size*100 > size_found*SIZE_THRESH)) |
5871 | if (cost+avg_fluct <= 0 || | 5876 | if (cost+avg_fluct <= 0 || |
5872 | max_cost*100 > (cost+avg_fluct)*COST_THRESH) { | 5877 | max_cost*100 > (cost+avg_fluct)*COST_THRESH) { |
5873 | 5878 | ||
5874 | if (migration_debug) | 5879 | if (migration_debug) |
5875 | printk("-> found max.\n"); | 5880 | printk("-> found max.\n"); |
5876 | break; | 5881 | break; |
5877 | } | 5882 | } |
5878 | /* | 5883 | /* |
5879 | * Increase the cachesize in 10% steps: | 5884 | * Increase the cachesize in 10% steps: |
5880 | */ | 5885 | */ |
5881 | size = size * 10 / 9; | 5886 | size = size * 10 / 9; |
5882 | } | 5887 | } |
5883 | 5888 | ||
5884 | if (migration_debug) | 5889 | if (migration_debug) |
5885 | printk("[%d][%d] working set size found: %d, cost: %Ld\n", | 5890 | printk("[%d][%d] working set size found: %d, cost: %Ld\n", |
5886 | cpu1, cpu2, size_found, max_cost); | 5891 | cpu1, cpu2, size_found, max_cost); |
5887 | 5892 | ||
5888 | vfree(cache); | 5893 | vfree(cache); |
5889 | 5894 | ||
5890 | /* | 5895 | /* |
5891 | * A task is considered 'cache cold' if at least 2 times | 5896 | * A task is considered 'cache cold' if at least 2 times |
5892 | * the worst-case cost of migration has passed. | 5897 | * the worst-case cost of migration has passed. |
5893 | * | 5898 | * |
5894 | * (this limit is only listened to if the load-balancing | 5899 | * (this limit is only listened to if the load-balancing |
5895 | * situation is 'nice' - if there is a large imbalance we | 5900 | * situation is 'nice' - if there is a large imbalance we |
5896 | * ignore it for the sake of CPU utilization and | 5901 | * ignore it for the sake of CPU utilization and |
5897 | * processing fairness.) | 5902 | * processing fairness.) |
5898 | */ | 5903 | */ |
5899 | return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE; | 5904 | return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE; |
5900 | } | 5905 | } |
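Reading the early-exit test with the constants defined above (SIZE_THRESH = COST_THRESH = 130): the loop only considers breaking once the current buffer size is more than 30% past the size at which the maximum was found, and then only if the fluctuation-adjusted cost has either gone non-positive or fallen below roughly 77% of that maximum (max_cost * 100 > (cost + avg_fluct) * 130).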
5901 | 5906 | ||
5902 | static void calibrate_migration_costs(const cpumask_t *cpu_map) | 5907 | static void calibrate_migration_costs(const cpumask_t *cpu_map) |
5903 | { | 5908 | { |
5904 | int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id(); | 5909 | int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id(); |
5905 | unsigned long j0, j1, distance, max_distance = 0; | 5910 | unsigned long j0, j1, distance, max_distance = 0; |
5906 | struct sched_domain *sd; | 5911 | struct sched_domain *sd; |
5907 | 5912 | ||
5908 | j0 = jiffies; | 5913 | j0 = jiffies; |
5909 | 5914 | ||
5910 | /* | 5915 | /* |
5911 | * First pass - calculate the cacheflush times: | 5916 | * First pass - calculate the cacheflush times: |
5912 | */ | 5917 | */ |
5913 | for_each_cpu_mask(cpu1, *cpu_map) { | 5918 | for_each_cpu_mask(cpu1, *cpu_map) { |
5914 | for_each_cpu_mask(cpu2, *cpu_map) { | 5919 | for_each_cpu_mask(cpu2, *cpu_map) { |
5915 | if (cpu1 == cpu2) | 5920 | if (cpu1 == cpu2) |
5916 | continue; | 5921 | continue; |
5917 | distance = domain_distance(cpu1, cpu2); | 5922 | distance = domain_distance(cpu1, cpu2); |
5918 | max_distance = max(max_distance, distance); | 5923 | max_distance = max(max_distance, distance); |
5919 | /* | 5924 | /* |
5920 | * No result cached yet? | 5925 | * No result cached yet? |
5921 | */ | 5926 | */ |
5922 | if (migration_cost[distance] == -1LL) | 5927 | if (migration_cost[distance] == -1LL) |
5923 | migration_cost[distance] = | 5928 | migration_cost[distance] = |
5924 | measure_migration_cost(cpu1, cpu2); | 5929 | measure_migration_cost(cpu1, cpu2); |
5925 | } | 5930 | } |
5926 | } | 5931 | } |
5927 | /* | 5932 | /* |
5928 | * Second pass - update the sched domain hierarchy with | 5933 | * Second pass - update the sched domain hierarchy with |
5929 | * the new cache-hot-time estimations: | 5934 | * the new cache-hot-time estimations: |
5930 | */ | 5935 | */ |
5931 | for_each_cpu_mask(cpu, *cpu_map) { | 5936 | for_each_cpu_mask(cpu, *cpu_map) { |
5932 | distance = 0; | 5937 | distance = 0; |
5933 | for_each_domain(cpu, sd) { | 5938 | for_each_domain(cpu, sd) { |
5934 | sd->cache_hot_time = migration_cost[distance]; | 5939 | sd->cache_hot_time = migration_cost[distance]; |
5935 | distance++; | 5940 | distance++; |
5936 | } | 5941 | } |
5937 | } | 5942 | } |
5938 | /* | 5943 | /* |
5939 | * Print the matrix: | 5944 | * Print the matrix: |
5940 | */ | 5945 | */ |
5941 | if (migration_debug) | 5946 | if (migration_debug) |
5942 | printk("migration: max_cache_size: %d, cpu: %d MHz:\n", | 5947 | printk("migration: max_cache_size: %d, cpu: %d MHz:\n", |
5943 | max_cache_size, | 5948 | max_cache_size, |
5944 | #ifdef CONFIG_X86 | 5949 | #ifdef CONFIG_X86 |
5945 | cpu_khz/1000 | 5950 | cpu_khz/1000 |
5946 | #else | 5951 | #else |
5947 | -1 | 5952 | -1 |
5948 | #endif | 5953 | #endif |
5949 | ); | 5954 | ); |
5950 | if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) { | 5955 | if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) { |
5951 | printk("migration_cost="); | 5956 | printk("migration_cost="); |
5952 | for (distance = 0; distance <= max_distance; distance++) { | 5957 | for (distance = 0; distance <= max_distance; distance++) { |
5953 | if (distance) | 5958 | if (distance) |
5954 | printk(","); | 5959 | printk(","); |
5955 | printk("%ld", (long)migration_cost[distance] / 1000); | 5960 | printk("%ld", (long)migration_cost[distance] / 1000); |
5956 | } | 5961 | } |
5957 | printk("\n"); | 5962 | printk("\n"); |
5958 | } | 5963 | } |
5959 | j1 = jiffies; | 5964 | j1 = jiffies; |
5960 | if (migration_debug) | 5965 | if (migration_debug) |
5961 | printk("migration: %ld seconds\n", (j1-j0) / HZ); | 5966 | printk("migration: %ld seconds\n", (j1-j0) / HZ); |
5962 | 5967 | ||
5963 | /* | 5968 | /* |
5964 | * Move back to the original CPU. NUMA-Q gets confused | 5969 | * Move back to the original CPU. NUMA-Q gets confused |
5965 | * if we migrate to another quad during bootup. | 5970 | * if we migrate to another quad during bootup. |
5966 | */ | 5971 | */ |
5967 | if (raw_smp_processor_id() != orig_cpu) { | 5972 | if (raw_smp_processor_id() != orig_cpu) { |
5968 | cpumask_t mask = cpumask_of_cpu(orig_cpu), | 5973 | cpumask_t mask = cpumask_of_cpu(orig_cpu), |
5969 | saved_mask = current->cpus_allowed; | 5974 | saved_mask = current->cpus_allowed; |
5970 | 5975 | ||
5971 | set_cpus_allowed(current, mask); | 5976 | set_cpus_allowed(current, mask); |
5972 | set_cpus_allowed(current, saved_mask); | 5977 | set_cpus_allowed(current, saved_mask); |
5973 | } | 5978 | } |
5974 | } | 5979 | } |
5975 | 5980 | ||
5976 | #ifdef CONFIG_NUMA | 5981 | #ifdef CONFIG_NUMA |
5977 | 5982 | ||
5978 | /** | 5983 | /** |
5979 | * find_next_best_node - find the next node to include in a sched_domain | 5984 | * find_next_best_node - find the next node to include in a sched_domain |
5980 | * @node: node whose sched_domain we're building | 5985 | * @node: node whose sched_domain we're building |
5981 | * @used_nodes: nodes already in the sched_domain | 5986 | * @used_nodes: nodes already in the sched_domain |
5982 | * | 5987 | * |
5983 | * Find the next node to include in a given scheduling domain. Simply | 5988 | * Find the next node to include in a given scheduling domain. Simply |
5984 | * finds the closest node not already in the @used_nodes map. | 5989 | * finds the closest node not already in the @used_nodes map. |
5985 | * | 5990 | * |
5986 | * Should use nodemask_t. | 5991 | * Should use nodemask_t. |
5987 | */ | 5992 | */ |
5988 | static int find_next_best_node(int node, unsigned long *used_nodes) | 5993 | static int find_next_best_node(int node, unsigned long *used_nodes) |
5989 | { | 5994 | { |
5990 | int i, n, val, min_val, best_node = 0; | 5995 | int i, n, val, min_val, best_node = 0; |
5991 | 5996 | ||
5992 | min_val = INT_MAX; | 5997 | min_val = INT_MAX; |
5993 | 5998 | ||
5994 | for (i = 0; i < MAX_NUMNODES; i++) { | 5999 | for (i = 0; i < MAX_NUMNODES; i++) { |
5995 | /* Start at @node */ | 6000 | /* Start at @node */ |
5996 | n = (node + i) % MAX_NUMNODES; | 6001 | n = (node + i) % MAX_NUMNODES; |
5997 | 6002 | ||
5998 | if (!nr_cpus_node(n)) | 6003 | if (!nr_cpus_node(n)) |
5999 | continue; | 6004 | continue; |
6000 | 6005 | ||
6001 | /* Skip already used nodes */ | 6006 | /* Skip already used nodes */ |
6002 | if (test_bit(n, used_nodes)) | 6007 | if (test_bit(n, used_nodes)) |
6003 | continue; | 6008 | continue; |
6004 | 6009 | ||
6005 | /* Simple min distance search */ | 6010 | /* Simple min distance search */ |
6006 | val = node_distance(node, n); | 6011 | val = node_distance(node, n); |
6007 | 6012 | ||
6008 | if (val < min_val) { | 6013 | if (val < min_val) { |
6009 | min_val = val; | 6014 | min_val = val; |
6010 | best_node = n; | 6015 | best_node = n; |
6011 | } | 6016 | } |
6012 | } | 6017 | } |
6013 | 6018 | ||
6014 | set_bit(best_node, used_nodes); | 6019 | set_bit(best_node, used_nodes); |
6015 | return best_node; | 6020 | return best_node; |
6016 | } | 6021 | } |
6017 | 6022 | ||
6018 | /** | 6023 | /** |
6019 | * sched_domain_node_span - get a cpumask for a node's sched_domain | 6024 | * sched_domain_node_span - get a cpumask for a node's sched_domain |
6020 | * @node: node whose cpumask we're constructing | 6025 | * @node: node whose cpumask we're constructing |
6021 | * @size: number of nodes to include in this span | 6026 | * @size: number of nodes to include in this span |
6022 | * | 6027 | * |
6023 | * Given a node, construct a good cpumask for its sched_domain to span. It | 6028 | * Given a node, construct a good cpumask for its sched_domain to span. It |
6024 | * should be one that prevents unnecessary balancing, but also spreads tasks | 6029 | * should be one that prevents unnecessary balancing, but also spreads tasks |
6025 | * out optimally. | 6030 | * out optimally. |
6026 | */ | 6031 | */ |
6027 | static cpumask_t sched_domain_node_span(int node) | 6032 | static cpumask_t sched_domain_node_span(int node) |
6028 | { | 6033 | { |
6029 | DECLARE_BITMAP(used_nodes, MAX_NUMNODES); | 6034 | DECLARE_BITMAP(used_nodes, MAX_NUMNODES); |
6030 | cpumask_t span, nodemask; | 6035 | cpumask_t span, nodemask; |
6031 | int i; | 6036 | int i; |
6032 | 6037 | ||
6033 | cpus_clear(span); | 6038 | cpus_clear(span); |
6034 | bitmap_zero(used_nodes, MAX_NUMNODES); | 6039 | bitmap_zero(used_nodes, MAX_NUMNODES); |
6035 | 6040 | ||
6036 | nodemask = node_to_cpumask(node); | 6041 | nodemask = node_to_cpumask(node); |
6037 | cpus_or(span, span, nodemask); | 6042 | cpus_or(span, span, nodemask); |
6038 | set_bit(node, used_nodes); | 6043 | set_bit(node, used_nodes); |
6039 | 6044 | ||
6040 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | 6045 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { |
6041 | int next_node = find_next_best_node(node, used_nodes); | 6046 | int next_node = find_next_best_node(node, used_nodes); |
6042 | 6047 | ||
6043 | nodemask = node_to_cpumask(next_node); | 6048 | nodemask = node_to_cpumask(next_node); |
6044 | cpus_or(span, span, nodemask); | 6049 | cpus_or(span, span, nodemask); |
6045 | } | 6050 | } |
6046 | 6051 | ||
6047 | return span; | 6052 | return span; |
6048 | } | 6053 | } |
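Illustrative walk-through (topology hypothetical): on a machine with four NUMA nodes, sched_domain_node_span(0) seeds the span with node 0's CPUs and then calls find_next_best_node() up to SD_NODES_PER_DOMAIN-1 more times; once every node has been marked used, the search simply keeps returning an already-included node, so on any system with 16 or fewer nodes the node-level span ends up covering all nodes' CPUs, and the cap only limits the span on larger machines.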
6049 | #endif | 6054 | #endif |
6050 | 6055 | ||
6051 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | 6056 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; |
6052 | 6057 | ||
6053 | /* | 6058 | /* |
6054 | * SMT sched-domains: | 6059 | * SMT sched-domains: |
6055 | */ | 6060 | */ |
6056 | #ifdef CONFIG_SCHED_SMT | 6061 | #ifdef CONFIG_SCHED_SMT |
6057 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | 6062 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); |
6058 | static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); | 6063 | static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); |
6059 | 6064 | ||
6060 | static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, | 6065 | static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, |
6061 | struct sched_group **sg) | 6066 | struct sched_group **sg) |
6062 | { | 6067 | { |
6063 | if (sg) | 6068 | if (sg) |
6064 | *sg = &per_cpu(sched_group_cpus, cpu); | 6069 | *sg = &per_cpu(sched_group_cpus, cpu); |
6065 | return cpu; | 6070 | return cpu; |
6066 | } | 6071 | } |
6067 | #endif | 6072 | #endif |
6068 | 6073 | ||
6069 | /* | 6074 | /* |
6070 | * multi-core sched-domains: | 6075 | * multi-core sched-domains: |
6071 | */ | 6076 | */ |
6072 | #ifdef CONFIG_SCHED_MC | 6077 | #ifdef CONFIG_SCHED_MC |
6073 | static DEFINE_PER_CPU(struct sched_domain, core_domains); | 6078 | static DEFINE_PER_CPU(struct sched_domain, core_domains); |
6074 | static DEFINE_PER_CPU(struct sched_group, sched_group_core); | 6079 | static DEFINE_PER_CPU(struct sched_group, sched_group_core); |
6075 | #endif | 6080 | #endif |
6076 | 6081 | ||
6077 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 6082 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) |
6078 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, | 6083 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, |
6079 | struct sched_group **sg) | 6084 | struct sched_group **sg) |
6080 | { | 6085 | { |
6081 | int group; | 6086 | int group; |
6082 | cpumask_t mask = cpu_sibling_map[cpu]; | 6087 | cpumask_t mask = cpu_sibling_map[cpu]; |
6083 | cpus_and(mask, mask, *cpu_map); | 6088 | cpus_and(mask, mask, *cpu_map); |
6084 | group = first_cpu(mask); | 6089 | group = first_cpu(mask); |
6085 | if (sg) | 6090 | if (sg) |
6086 | *sg = &per_cpu(sched_group_core, group); | 6091 | *sg = &per_cpu(sched_group_core, group); |
6087 | return group; | 6092 | return group; |
6088 | } | 6093 | } |
6089 | #elif defined(CONFIG_SCHED_MC) | 6094 | #elif defined(CONFIG_SCHED_MC) |
6090 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, | 6095 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, |
6091 | struct sched_group **sg) | 6096 | struct sched_group **sg) |
6092 | { | 6097 | { |
6093 | if (sg) | 6098 | if (sg) |
6094 | *sg = &per_cpu(sched_group_core, cpu); | 6099 | *sg = &per_cpu(sched_group_core, cpu); |
6095 | return cpu; | 6100 | return cpu; |
6096 | } | 6101 | } |
6097 | #endif | 6102 | #endif |
6098 | 6103 | ||
6099 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); | 6104 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); |
6100 | static DEFINE_PER_CPU(struct sched_group, sched_group_phys); | 6105 | static DEFINE_PER_CPU(struct sched_group, sched_group_phys); |
6101 | 6106 | ||
6102 | static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, | 6107 | static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, |
6103 | struct sched_group **sg) | 6108 | struct sched_group **sg) |
6104 | { | 6109 | { |
6105 | int group; | 6110 | int group; |
6106 | #ifdef CONFIG_SCHED_MC | 6111 | #ifdef CONFIG_SCHED_MC |
6107 | cpumask_t mask = cpu_coregroup_map(cpu); | 6112 | cpumask_t mask = cpu_coregroup_map(cpu); |
6108 | cpus_and(mask, mask, *cpu_map); | 6113 | cpus_and(mask, mask, *cpu_map); |
6109 | group = first_cpu(mask); | 6114 | group = first_cpu(mask); |
6110 | #elif defined(CONFIG_SCHED_SMT) | 6115 | #elif defined(CONFIG_SCHED_SMT) |
6111 | cpumask_t mask = cpu_sibling_map[cpu]; | 6116 | cpumask_t mask = cpu_sibling_map[cpu]; |
6112 | cpus_and(mask, mask, *cpu_map); | 6117 | cpus_and(mask, mask, *cpu_map); |
6113 | group = first_cpu(mask); | 6118 | group = first_cpu(mask); |
6114 | #else | 6119 | #else |
6115 | group = cpu; | 6120 | group = cpu; |
6116 | #endif | 6121 | #endif |
6117 | if (sg) | 6122 | if (sg) |
6118 | *sg = &per_cpu(sched_group_phys, group); | 6123 | *sg = &per_cpu(sched_group_phys, group); |
6119 | return group; | 6124 | return group; |
6120 | } | 6125 | } |
6121 | 6126 | ||
6122 | #ifdef CONFIG_NUMA | 6127 | #ifdef CONFIG_NUMA |
6123 | /* | 6128 | /* |
6124 | * The init_sched_build_groups can't handle what we want to do with node | 6129 | * The init_sched_build_groups can't handle what we want to do with node |
6125 | * groups, so roll our own. Now each node has its own list of groups which | 6130 | * groups, so roll our own. Now each node has its own list of groups which |
6126 | * gets dynamically allocated. | 6131 | * gets dynamically allocated. |
6127 | */ | 6132 | */ |
6128 | static DEFINE_PER_CPU(struct sched_domain, node_domains); | 6133 | static DEFINE_PER_CPU(struct sched_domain, node_domains); |
6129 | static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; | 6134 | static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; |
6130 | 6135 | ||
6131 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); | 6136 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); |
6132 | static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); | 6137 | static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); |
6133 | 6138 | ||
6134 | static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, | 6139 | static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, |
6135 | struct sched_group **sg) | 6140 | struct sched_group **sg) |
6136 | { | 6141 | { |
6137 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); | 6142 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); |
6138 | int group; | 6143 | int group; |
6139 | 6144 | ||
6140 | cpus_and(nodemask, nodemask, *cpu_map); | 6145 | cpus_and(nodemask, nodemask, *cpu_map); |
6141 | group = first_cpu(nodemask); | 6146 | group = first_cpu(nodemask); |
6142 | 6147 | ||
6143 | if (sg) | 6148 | if (sg) |
6144 | *sg = &per_cpu(sched_group_allnodes, group); | 6149 | *sg = &per_cpu(sched_group_allnodes, group); |
6145 | return group; | 6150 | return group; |
6146 | } | 6151 | } |
6147 | 6152 | ||
6148 | static void init_numa_sched_groups_power(struct sched_group *group_head) | 6153 | static void init_numa_sched_groups_power(struct sched_group *group_head) |
6149 | { | 6154 | { |
6150 | struct sched_group *sg = group_head; | 6155 | struct sched_group *sg = group_head; |
6151 | int j; | 6156 | int j; |
6152 | 6157 | ||
6153 | if (!sg) | 6158 | if (!sg) |
6154 | return; | 6159 | return; |
6155 | next_sg: | 6160 | next_sg: |
6156 | for_each_cpu_mask(j, sg->cpumask) { | 6161 | for_each_cpu_mask(j, sg->cpumask) { |
6157 | struct sched_domain *sd; | 6162 | struct sched_domain *sd; |
6158 | 6163 | ||
6159 | sd = &per_cpu(phys_domains, j); | 6164 | sd = &per_cpu(phys_domains, j); |
6160 | if (j != first_cpu(sd->groups->cpumask)) { | 6165 | if (j != first_cpu(sd->groups->cpumask)) { |
6161 | /* | 6166 | /* |
6162 | * Only add "power" once for each | 6167 | * Only add "power" once for each |
6163 | * physical package. | 6168 | * physical package. |
6164 | */ | 6169 | */ |
6165 | continue; | 6170 | continue; |
6166 | } | 6171 | } |
6167 | 6172 | ||
6168 | sg->cpu_power += sd->groups->cpu_power; | 6173 | sg->cpu_power += sd->groups->cpu_power; |
6169 | } | 6174 | } |
6170 | sg = sg->next; | 6175 | sg = sg->next; |
6171 | if (sg != group_head) | 6176 | if (sg != group_head) |
6172 | goto next_sg; | 6177 | goto next_sg; |
6173 | } | 6178 | } |
6174 | #endif | 6179 | #endif |
6175 | 6180 | ||
6176 | #ifdef CONFIG_NUMA | 6181 | #ifdef CONFIG_NUMA |
6177 | /* Free memory allocated for various sched_group structures */ | 6182 | /* Free memory allocated for various sched_group structures */ |
6178 | static void free_sched_groups(const cpumask_t *cpu_map) | 6183 | static void free_sched_groups(const cpumask_t *cpu_map) |
6179 | { | 6184 | { |
6180 | int cpu, i; | 6185 | int cpu, i; |
6181 | 6186 | ||
6182 | for_each_cpu_mask(cpu, *cpu_map) { | 6187 | for_each_cpu_mask(cpu, *cpu_map) { |
6183 | struct sched_group **sched_group_nodes | 6188 | struct sched_group **sched_group_nodes |
6184 | = sched_group_nodes_bycpu[cpu]; | 6189 | = sched_group_nodes_bycpu[cpu]; |
6185 | 6190 | ||
6186 | if (!sched_group_nodes) | 6191 | if (!sched_group_nodes) |
6187 | continue; | 6192 | continue; |
6188 | 6193 | ||
6189 | for (i = 0; i < MAX_NUMNODES; i++) { | 6194 | for (i = 0; i < MAX_NUMNODES; i++) { |
6190 | cpumask_t nodemask = node_to_cpumask(i); | 6195 | cpumask_t nodemask = node_to_cpumask(i); |
6191 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | 6196 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; |
6192 | 6197 | ||
6193 | cpus_and(nodemask, nodemask, *cpu_map); | 6198 | cpus_and(nodemask, nodemask, *cpu_map); |
6194 | if (cpus_empty(nodemask)) | 6199 | if (cpus_empty(nodemask)) |
6195 | continue; | 6200 | continue; |
6196 | 6201 | ||
6197 | if (sg == NULL) | 6202 | if (sg == NULL) |
6198 | continue; | 6203 | continue; |
6199 | sg = sg->next; | 6204 | sg = sg->next; |
6200 | next_sg: | 6205 | next_sg: |
6201 | oldsg = sg; | 6206 | oldsg = sg; |
6202 | sg = sg->next; | 6207 | sg = sg->next; |
6203 | kfree(oldsg); | 6208 | kfree(oldsg); |
6204 | if (oldsg != sched_group_nodes[i]) | 6209 | if (oldsg != sched_group_nodes[i]) |
6205 | goto next_sg; | 6210 | goto next_sg; |
6206 | } | 6211 | } |
6207 | kfree(sched_group_nodes); | 6212 | kfree(sched_group_nodes); |
6208 | sched_group_nodes_bycpu[cpu] = NULL; | 6213 | sched_group_nodes_bycpu[cpu] = NULL; |
6209 | } | 6214 | } |
6210 | } | 6215 | } |
6211 | #else | 6216 | #else |
6212 | static void free_sched_groups(const cpumask_t *cpu_map) | 6217 | static void free_sched_groups(const cpumask_t *cpu_map) |
6213 | { | 6218 | { |
6214 | } | 6219 | } |
6215 | #endif | 6220 | #endif |
6216 | 6221 | ||
6217 | /* | 6222 | /* |
6218 | * Initialize sched groups cpu_power. | 6223 | * Initialize sched groups cpu_power. |
6219 | * | 6224 | * |
6220 | * cpu_power indicates the capacity of sched group, which is used while | 6225 | * cpu_power indicates the capacity of sched group, which is used while |
6221 | * distributing the load between different sched groups in a sched domain. | 6226 | * distributing the load between different sched groups in a sched domain. |
6222 | * Typically cpu_power for all the groups in a sched domain will be the same unless | 6227 | * Typically cpu_power for all the groups in a sched domain will be the same unless |
6223 | * there are asymmetries in the topology. If there are asymmetries, the group | 6228 | * there are asymmetries in the topology. If there are asymmetries, the group |
6224 | * having more cpu_power will pick up more load compared to the group having | 6229 | * having more cpu_power will pick up more load compared to the group having |
6225 | * less cpu_power. | 6230 | * less cpu_power. |
6226 | * | 6231 | * |
6227 | * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents | 6232 | * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents |
6228 | * the maximum number of tasks a group can handle in the presence of other idle | 6233 | * the maximum number of tasks a group can handle in the presence of other idle |
6229 | * or lightly loaded groups in the same sched domain. | 6234 | * or lightly loaded groups in the same sched domain. |
6230 | */ | 6235 | */ |
6231 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) | 6236 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) |
6232 | { | 6237 | { |
6233 | struct sched_domain *child; | 6238 | struct sched_domain *child; |
6234 | struct sched_group *group; | 6239 | struct sched_group *group; |
6235 | 6240 | ||
6236 | WARN_ON(!sd || !sd->groups); | 6241 | WARN_ON(!sd || !sd->groups); |
6237 | 6242 | ||
6238 | if (cpu != first_cpu(sd->groups->cpumask)) | 6243 | if (cpu != first_cpu(sd->groups->cpumask)) |
6239 | return; | 6244 | return; |
6240 | 6245 | ||
6241 | child = sd->child; | 6246 | child = sd->child; |
6242 | 6247 | ||
6243 | /* | 6248 | /* |
6244 | * For perf policy, if the groups in child domain share resources | 6249 | * For perf policy, if the groups in child domain share resources |
6245 | * (for example cores sharing some portions of the cache hierarchy | 6250 | * (for example cores sharing some portions of the cache hierarchy |
6246 | * or SMT), then set this domain's groups' cpu_power such that each group | 6251 | * or SMT), then set this domain's groups' cpu_power such that each group |
6247 | * can handle only one task, when there are other idle groups in the | 6252 | * can handle only one task, when there are other idle groups in the |
6248 | * same sched domain. | 6253 | * same sched domain. |
6249 | */ | 6254 | */ |
6250 | if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && | 6255 | if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && |
6251 | (child->flags & | 6256 | (child->flags & |
6252 | (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { | 6257 | (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { |
6253 | sd->groups->cpu_power = SCHED_LOAD_SCALE; | 6258 | sd->groups->cpu_power = SCHED_LOAD_SCALE; |
6254 | return; | 6259 | return; |
6255 | } | 6260 | } |
6256 | 6261 | ||
6257 | sd->groups->cpu_power = 0; | 6262 | sd->groups->cpu_power = 0; |
6258 | 6263 | ||
6259 | /* | 6264 | /* |
6260 | * add cpu_power of each child group to this group's cpu_power | 6265 | * add cpu_power of each child group to this group's cpu_power |
6261 | */ | 6266 | */ |
6262 | group = child->groups; | 6267 | group = child->groups; |
6263 | do { | 6268 | do { |
6264 | sd->groups->cpu_power += group->cpu_power; | 6269 | sd->groups->cpu_power += group->cpu_power; |
6265 | group = group->next; | 6270 | group = group->next; |
6266 | } while (group != child->groups); | 6271 | } while (group != child->groups); |
6267 | } | 6272 | } |
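When power-savings balancing is off and the child level shares no CPU resources, the parent group's cpu_power is simply the sum of its child groups' cpu_power, so a physical-package group covering two independent cores ends up with twice the per-core capacity. A rough user-space illustration of that accumulation loop, using 1024 as a stand-in value for SCHED_LOAD_SCALE (the value and structure here are illustrative only):

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL		/* illustrative value, not quoted from this tree */

struct group { unsigned long cpu_power; struct group *next; };

int main(void)
{
	/* two core-level child groups forming a circular list */
	struct group core0 = { SCHED_LOAD_SCALE, NULL };
	struct group core1 = { SCHED_LOAD_SCALE, &core0 };
	struct group pkg   = { 0, NULL };
	struct group *g = &core0;

	core0.next = &core1;

	do {				/* same walk as init_sched_groups_power() */
		pkg.cpu_power += g->cpu_power;
		g = g->next;
	} while (g != &core0);

	printf("package cpu_power = %lu\n", pkg.cpu_power);	/* 2048 */
	return 0;
}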
6268 | 6273 | ||
6269 | /* | 6274 | /* |
6270 | * Build sched domains for a given set of cpus and attach the sched domains | 6275 | * Build sched domains for a given set of cpus and attach the sched domains |
6271 | * to the individual cpus | 6276 | * to the individual cpus |
6272 | */ | 6277 | */ |
6273 | static int build_sched_domains(const cpumask_t *cpu_map) | 6278 | static int build_sched_domains(const cpumask_t *cpu_map) |
6274 | { | 6279 | { |
6275 | int i; | 6280 | int i; |
6276 | struct sched_domain *sd; | 6281 | struct sched_domain *sd; |
6277 | #ifdef CONFIG_NUMA | 6282 | #ifdef CONFIG_NUMA |
6278 | struct sched_group **sched_group_nodes = NULL; | 6283 | struct sched_group **sched_group_nodes = NULL; |
6279 | int sd_allnodes = 0; | 6284 | int sd_allnodes = 0; |
6280 | 6285 | ||
6281 | /* | 6286 | /* |
6282 | * Allocate the per-node list of sched groups | 6287 | * Allocate the per-node list of sched groups |
6283 | */ | 6288 | */ |
6284 | sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, | 6289 | sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, |
6285 | GFP_KERNEL); | 6290 | GFP_KERNEL); |
6286 | if (!sched_group_nodes) { | 6291 | if (!sched_group_nodes) { |
6287 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 6292 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
6288 | return -ENOMEM; | 6293 | return -ENOMEM; |
6289 | } | 6294 | } |
6290 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | 6295 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; |
6291 | #endif | 6296 | #endif |
6292 | 6297 | ||
6293 | /* | 6298 | /* |
6294 | * Set up domains for cpus specified by the cpu_map. | 6299 | * Set up domains for cpus specified by the cpu_map. |
6295 | */ | 6300 | */ |
6296 | for_each_cpu_mask(i, *cpu_map) { | 6301 | for_each_cpu_mask(i, *cpu_map) { |
6297 | struct sched_domain *sd = NULL, *p; | 6302 | struct sched_domain *sd = NULL, *p; |
6298 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); | 6303 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); |
6299 | 6304 | ||
6300 | cpus_and(nodemask, nodemask, *cpu_map); | 6305 | cpus_and(nodemask, nodemask, *cpu_map); |
6301 | 6306 | ||
6302 | #ifdef CONFIG_NUMA | 6307 | #ifdef CONFIG_NUMA |
6303 | if (cpus_weight(*cpu_map) | 6308 | if (cpus_weight(*cpu_map) |
6304 | > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { | 6309 | > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { |
6305 | sd = &per_cpu(allnodes_domains, i); | 6310 | sd = &per_cpu(allnodes_domains, i); |
6306 | *sd = SD_ALLNODES_INIT; | 6311 | *sd = SD_ALLNODES_INIT; |
6307 | sd->span = *cpu_map; | 6312 | sd->span = *cpu_map; |
6308 | cpu_to_allnodes_group(i, cpu_map, &sd->groups); | 6313 | cpu_to_allnodes_group(i, cpu_map, &sd->groups); |
6309 | p = sd; | 6314 | p = sd; |
6310 | sd_allnodes = 1; | 6315 | sd_allnodes = 1; |
6311 | } else | 6316 | } else |
6312 | p = NULL; | 6317 | p = NULL; |
6313 | 6318 | ||
6314 | sd = &per_cpu(node_domains, i); | 6319 | sd = &per_cpu(node_domains, i); |
6315 | *sd = SD_NODE_INIT; | 6320 | *sd = SD_NODE_INIT; |
6316 | sd->span = sched_domain_node_span(cpu_to_node(i)); | 6321 | sd->span = sched_domain_node_span(cpu_to_node(i)); |
6317 | sd->parent = p; | 6322 | sd->parent = p; |
6318 | if (p) | 6323 | if (p) |
6319 | p->child = sd; | 6324 | p->child = sd; |
6320 | cpus_and(sd->span, sd->span, *cpu_map); | 6325 | cpus_and(sd->span, sd->span, *cpu_map); |
6321 | #endif | 6326 | #endif |
6322 | 6327 | ||
6323 | p = sd; | 6328 | p = sd; |
6324 | sd = &per_cpu(phys_domains, i); | 6329 | sd = &per_cpu(phys_domains, i); |
6325 | *sd = SD_CPU_INIT; | 6330 | *sd = SD_CPU_INIT; |
6326 | sd->span = nodemask; | 6331 | sd->span = nodemask; |
6327 | sd->parent = p; | 6332 | sd->parent = p; |
6328 | if (p) | 6333 | if (p) |
6329 | p->child = sd; | 6334 | p->child = sd; |
6330 | cpu_to_phys_group(i, cpu_map, &sd->groups); | 6335 | cpu_to_phys_group(i, cpu_map, &sd->groups); |
6331 | 6336 | ||
6332 | #ifdef CONFIG_SCHED_MC | 6337 | #ifdef CONFIG_SCHED_MC |
6333 | p = sd; | 6338 | p = sd; |
6334 | sd = &per_cpu(core_domains, i); | 6339 | sd = &per_cpu(core_domains, i); |
6335 | *sd = SD_MC_INIT; | 6340 | *sd = SD_MC_INIT; |
6336 | sd->span = cpu_coregroup_map(i); | 6341 | sd->span = cpu_coregroup_map(i); |
6337 | cpus_and(sd->span, sd->span, *cpu_map); | 6342 | cpus_and(sd->span, sd->span, *cpu_map); |
6338 | sd->parent = p; | 6343 | sd->parent = p; |
6339 | p->child = sd; | 6344 | p->child = sd; |
6340 | cpu_to_core_group(i, cpu_map, &sd->groups); | 6345 | cpu_to_core_group(i, cpu_map, &sd->groups); |
6341 | #endif | 6346 | #endif |
6342 | 6347 | ||
6343 | #ifdef CONFIG_SCHED_SMT | 6348 | #ifdef CONFIG_SCHED_SMT |
6344 | p = sd; | 6349 | p = sd; |
6345 | sd = &per_cpu(cpu_domains, i); | 6350 | sd = &per_cpu(cpu_domains, i); |
6346 | *sd = SD_SIBLING_INIT; | 6351 | *sd = SD_SIBLING_INIT; |
6347 | sd->span = cpu_sibling_map[i]; | 6352 | sd->span = cpu_sibling_map[i]; |
6348 | cpus_and(sd->span, sd->span, *cpu_map); | 6353 | cpus_and(sd->span, sd->span, *cpu_map); |
6349 | sd->parent = p; | 6354 | sd->parent = p; |
6350 | p->child = sd; | 6355 | p->child = sd; |
6351 | cpu_to_cpu_group(i, cpu_map, &sd->groups); | 6356 | cpu_to_cpu_group(i, cpu_map, &sd->groups); |
6352 | #endif | 6357 | #endif |
6353 | } | 6358 | } |
6354 | 6359 | ||
6355 | #ifdef CONFIG_SCHED_SMT | 6360 | #ifdef CONFIG_SCHED_SMT |
6356 | /* Set up CPU (sibling) groups */ | 6361 | /* Set up CPU (sibling) groups */ |
6357 | for_each_cpu_mask(i, *cpu_map) { | 6362 | for_each_cpu_mask(i, *cpu_map) { |
6358 | cpumask_t this_sibling_map = cpu_sibling_map[i]; | 6363 | cpumask_t this_sibling_map = cpu_sibling_map[i]; |
6359 | cpus_and(this_sibling_map, this_sibling_map, *cpu_map); | 6364 | cpus_and(this_sibling_map, this_sibling_map, *cpu_map); |
6360 | if (i != first_cpu(this_sibling_map)) | 6365 | if (i != first_cpu(this_sibling_map)) |
6361 | continue; | 6366 | continue; |
6362 | 6367 | ||
6363 | init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); | 6368 | init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); |
6364 | } | 6369 | } |
6365 | #endif | 6370 | #endif |
6366 | 6371 | ||
6367 | #ifdef CONFIG_SCHED_MC | 6372 | #ifdef CONFIG_SCHED_MC |
6368 | /* Set up multi-core groups */ | 6373 | /* Set up multi-core groups */ |
6369 | for_each_cpu_mask(i, *cpu_map) { | 6374 | for_each_cpu_mask(i, *cpu_map) { |
6370 | cpumask_t this_core_map = cpu_coregroup_map(i); | 6375 | cpumask_t this_core_map = cpu_coregroup_map(i); |
6371 | cpus_and(this_core_map, this_core_map, *cpu_map); | 6376 | cpus_and(this_core_map, this_core_map, *cpu_map); |
6372 | if (i != first_cpu(this_core_map)) | 6377 | if (i != first_cpu(this_core_map)) |
6373 | continue; | 6378 | continue; |
6374 | init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); | 6379 | init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); |
6375 | } | 6380 | } |
6376 | #endif | 6381 | #endif |
6377 | 6382 | ||
6378 | 6383 | ||
6379 | /* Set up physical groups */ | 6384 | /* Set up physical groups */ |
6380 | for (i = 0; i < MAX_NUMNODES; i++) { | 6385 | for (i = 0; i < MAX_NUMNODES; i++) { |
6381 | cpumask_t nodemask = node_to_cpumask(i); | 6386 | cpumask_t nodemask = node_to_cpumask(i); |
6382 | 6387 | ||
6383 | cpus_and(nodemask, nodemask, *cpu_map); | 6388 | cpus_and(nodemask, nodemask, *cpu_map); |
6384 | if (cpus_empty(nodemask)) | 6389 | if (cpus_empty(nodemask)) |
6385 | continue; | 6390 | continue; |
6386 | 6391 | ||
6387 | init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); | 6392 | init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); |
6388 | } | 6393 | } |
6389 | 6394 | ||
6390 | #ifdef CONFIG_NUMA | 6395 | #ifdef CONFIG_NUMA |
6391 | /* Set up node groups */ | 6396 | /* Set up node groups */ |
6392 | if (sd_allnodes) | 6397 | if (sd_allnodes) |
6393 | init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); | 6398 | init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); |
6394 | 6399 | ||
6395 | for (i = 0; i < MAX_NUMNODES; i++) { | 6400 | for (i = 0; i < MAX_NUMNODES; i++) { |
6396 | /* Set up node groups */ | 6401 | /* Set up node groups */ |
6397 | struct sched_group *sg, *prev; | 6402 | struct sched_group *sg, *prev; |
6398 | cpumask_t nodemask = node_to_cpumask(i); | 6403 | cpumask_t nodemask = node_to_cpumask(i); |
6399 | cpumask_t domainspan; | 6404 | cpumask_t domainspan; |
6400 | cpumask_t covered = CPU_MASK_NONE; | 6405 | cpumask_t covered = CPU_MASK_NONE; |
6401 | int j; | 6406 | int j; |
6402 | 6407 | ||
6403 | cpus_and(nodemask, nodemask, *cpu_map); | 6408 | cpus_and(nodemask, nodemask, *cpu_map); |
6404 | if (cpus_empty(nodemask)) { | 6409 | if (cpus_empty(nodemask)) { |
6405 | sched_group_nodes[i] = NULL; | 6410 | sched_group_nodes[i] = NULL; |
6406 | continue; | 6411 | continue; |
6407 | } | 6412 | } |
6408 | 6413 | ||
6409 | domainspan = sched_domain_node_span(i); | 6414 | domainspan = sched_domain_node_span(i); |
6410 | cpus_and(domainspan, domainspan, *cpu_map); | 6415 | cpus_and(domainspan, domainspan, *cpu_map); |
6411 | 6416 | ||
6412 | sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); | 6417 | sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); |
6413 | if (!sg) { | 6418 | if (!sg) { |
6414 | printk(KERN_WARNING "Can not alloc domain group for " | 6419 | printk(KERN_WARNING "Can not alloc domain group for " |
6415 | "node %d\n", i); | 6420 | "node %d\n", i); |
6416 | goto error; | 6421 | goto error; |
6417 | } | 6422 | } |
6418 | sched_group_nodes[i] = sg; | 6423 | sched_group_nodes[i] = sg; |
6419 | for_each_cpu_mask(j, nodemask) { | 6424 | for_each_cpu_mask(j, nodemask) { |
6420 | struct sched_domain *sd; | 6425 | struct sched_domain *sd; |
6421 | sd = &per_cpu(node_domains, j); | 6426 | sd = &per_cpu(node_domains, j); |
6422 | sd->groups = sg; | 6427 | sd->groups = sg; |
6423 | } | 6428 | } |
6424 | sg->cpu_power = 0; | 6429 | sg->cpu_power = 0; |
6425 | sg->cpumask = nodemask; | 6430 | sg->cpumask = nodemask; |
6426 | sg->next = sg; | 6431 | sg->next = sg; |
6427 | cpus_or(covered, covered, nodemask); | 6432 | cpus_or(covered, covered, nodemask); |
6428 | prev = sg; | 6433 | prev = sg; |
6429 | 6434 | ||
6430 | for (j = 0; j < MAX_NUMNODES; j++) { | 6435 | for (j = 0; j < MAX_NUMNODES; j++) { |
6431 | cpumask_t tmp, notcovered; | 6436 | cpumask_t tmp, notcovered; |
6432 | int n = (i + j) % MAX_NUMNODES; | 6437 | int n = (i + j) % MAX_NUMNODES; |
6433 | 6438 | ||
6434 | cpus_complement(notcovered, covered); | 6439 | cpus_complement(notcovered, covered); |
6435 | cpus_and(tmp, notcovered, *cpu_map); | 6440 | cpus_and(tmp, notcovered, *cpu_map); |
6436 | cpus_and(tmp, tmp, domainspan); | 6441 | cpus_and(tmp, tmp, domainspan); |
6437 | if (cpus_empty(tmp)) | 6442 | if (cpus_empty(tmp)) |
6438 | break; | 6443 | break; |
6439 | 6444 | ||
6440 | nodemask = node_to_cpumask(n); | 6445 | nodemask = node_to_cpumask(n); |
6441 | cpus_and(tmp, tmp, nodemask); | 6446 | cpus_and(tmp, tmp, nodemask); |
6442 | if (cpus_empty(tmp)) | 6447 | if (cpus_empty(tmp)) |
6443 | continue; | 6448 | continue; |
6444 | 6449 | ||
6445 | sg = kmalloc_node(sizeof(struct sched_group), | 6450 | sg = kmalloc_node(sizeof(struct sched_group), |
6446 | GFP_KERNEL, i); | 6451 | GFP_KERNEL, i); |
6447 | if (!sg) { | 6452 | if (!sg) { |
6448 | printk(KERN_WARNING | 6453 | printk(KERN_WARNING |
6449 | "Can not alloc domain group for node %d\n", j); | 6454 | "Can not alloc domain group for node %d\n", j); |
6450 | goto error; | 6455 | goto error; |
6451 | } | 6456 | } |
6452 | sg->cpu_power = 0; | 6457 | sg->cpu_power = 0; |
6453 | sg->cpumask = tmp; | 6458 | sg->cpumask = tmp; |
6454 | sg->next = prev->next; | 6459 | sg->next = prev->next; |
6455 | cpus_or(covered, covered, tmp); | 6460 | cpus_or(covered, covered, tmp); |
6456 | prev->next = sg; | 6461 | prev->next = sg; |
6457 | prev = sg; | 6462 | prev = sg; |
6458 | } | 6463 | } |
6459 | } | 6464 | } |
6460 | #endif | 6465 | #endif |
6461 | 6466 | ||
6462 | /* Calculate CPU power for physical packages and nodes */ | 6467 | /* Calculate CPU power for physical packages and nodes */ |
6463 | #ifdef CONFIG_SCHED_SMT | 6468 | #ifdef CONFIG_SCHED_SMT |
6464 | for_each_cpu_mask(i, *cpu_map) { | 6469 | for_each_cpu_mask(i, *cpu_map) { |
6465 | sd = &per_cpu(cpu_domains, i); | 6470 | sd = &per_cpu(cpu_domains, i); |
6466 | init_sched_groups_power(i, sd); | 6471 | init_sched_groups_power(i, sd); |
6467 | } | 6472 | } |
6468 | #endif | 6473 | #endif |
6469 | #ifdef CONFIG_SCHED_MC | 6474 | #ifdef CONFIG_SCHED_MC |
6470 | for_each_cpu_mask(i, *cpu_map) { | 6475 | for_each_cpu_mask(i, *cpu_map) { |
6471 | sd = &per_cpu(core_domains, i); | 6476 | sd = &per_cpu(core_domains, i); |
6472 | init_sched_groups_power(i, sd); | 6477 | init_sched_groups_power(i, sd); |
6473 | } | 6478 | } |
6474 | #endif | 6479 | #endif |
6475 | 6480 | ||
6476 | for_each_cpu_mask(i, *cpu_map) { | 6481 | for_each_cpu_mask(i, *cpu_map) { |
6477 | sd = &per_cpu(phys_domains, i); | 6482 | sd = &per_cpu(phys_domains, i); |
6478 | init_sched_groups_power(i, sd); | 6483 | init_sched_groups_power(i, sd); |
6479 | } | 6484 | } |
6480 | 6485 | ||
6481 | #ifdef CONFIG_NUMA | 6486 | #ifdef CONFIG_NUMA |
6482 | for (i = 0; i < MAX_NUMNODES; i++) | 6487 | for (i = 0; i < MAX_NUMNODES; i++) |
6483 | init_numa_sched_groups_power(sched_group_nodes[i]); | 6488 | init_numa_sched_groups_power(sched_group_nodes[i]); |
6484 | 6489 | ||
6485 | if (sd_allnodes) { | 6490 | if (sd_allnodes) { |
6486 | struct sched_group *sg; | 6491 | struct sched_group *sg; |
6487 | 6492 | ||
6488 | cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); | 6493 | cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); |
6489 | init_numa_sched_groups_power(sg); | 6494 | init_numa_sched_groups_power(sg); |
6490 | } | 6495 | } |
6491 | #endif | 6496 | #endif |
6492 | 6497 | ||
6493 | /* Attach the domains */ | 6498 | /* Attach the domains */ |
6494 | for_each_cpu_mask(i, *cpu_map) { | 6499 | for_each_cpu_mask(i, *cpu_map) { |
6495 | struct sched_domain *sd; | 6500 | struct sched_domain *sd; |
6496 | #ifdef CONFIG_SCHED_SMT | 6501 | #ifdef CONFIG_SCHED_SMT |
6497 | sd = &per_cpu(cpu_domains, i); | 6502 | sd = &per_cpu(cpu_domains, i); |
6498 | #elif defined(CONFIG_SCHED_MC) | 6503 | #elif defined(CONFIG_SCHED_MC) |
6499 | sd = &per_cpu(core_domains, i); | 6504 | sd = &per_cpu(core_domains, i); |
6500 | #else | 6505 | #else |
6501 | sd = &per_cpu(phys_domains, i); | 6506 | sd = &per_cpu(phys_domains, i); |
6502 | #endif | 6507 | #endif |
6503 | cpu_attach_domain(sd, i); | 6508 | cpu_attach_domain(sd, i); |
6504 | } | 6509 | } |
6505 | /* | 6510 | /* |
6506 | * Tune cache-hot values: | 6511 | * Tune cache-hot values: |
6507 | */ | 6512 | */ |
6508 | calibrate_migration_costs(cpu_map); | 6513 | calibrate_migration_costs(cpu_map); |
6509 | 6514 | ||
6510 | return 0; | 6515 | return 0; |
6511 | 6516 | ||
6512 | #ifdef CONFIG_NUMA | 6517 | #ifdef CONFIG_NUMA |
6513 | error: | 6518 | error: |
6514 | free_sched_groups(cpu_map); | 6519 | free_sched_groups(cpu_map); |
6515 | return -ENOMEM; | 6520 | return -ENOMEM; |
6516 | #endif | 6521 | #endif |
6517 | } | 6522 | } |
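build_sched_domains() wires the per-cpu domains together bottom-up: each newly initialised level points back at the previous one through ->parent, and the previous level gains it as ->child, so whatever level is built last (SMT, when configured) becomes the leaf that cpu_attach_domain() later installs. A compact sketch of that chaining, with made-up level names standing in for the real sched_domain levels:

#include <stdio.h>
#include <stddef.h>

struct dom { const char *name; struct dom *parent, *child; };

static void link_level(struct dom *sd, struct dom *p)
{
	sd->parent = p;			/* sd->parent = p */
	if (p)
		p->child = sd;		/* p->child = sd  */
}

int main(void)
{
	struct dom node = { "node" }, phys = { "phys" }, smt = { "smt" };
	struct dom *sd;

	link_level(&phys, &node);	/* phys level under the node level */
	link_level(&smt, &phys);	/* smt level under phys: the leaf we attach */

	for (sd = &smt; sd; sd = sd->parent)
		printf("%s%s", sd->name, sd->parent ? " -> " : "\n");
	return 0;
}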
6518 | /* | 6523 | /* |
6519 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 6524 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
6520 | */ | 6525 | */ |
6521 | static int arch_init_sched_domains(const cpumask_t *cpu_map) | 6526 | static int arch_init_sched_domains(const cpumask_t *cpu_map) |
6522 | { | 6527 | { |
6523 | cpumask_t cpu_default_map; | 6528 | cpumask_t cpu_default_map; |
6524 | int err; | 6529 | int err; |
6525 | 6530 | ||
6526 | /* | 6531 | /* |
6527 | * Set up the mask for cpus without special case scheduling requirements. | 6532 | * Set up the mask for cpus without special case scheduling requirements. |
6528 | * For now this just excludes isolated cpus, but could be used to | 6533 | * For now this just excludes isolated cpus, but could be used to |
6529 | * exclude other special cases in the future. | 6534 | * exclude other special cases in the future. |
6530 | */ | 6535 | */ |
6531 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); | 6536 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); |
6532 | 6537 | ||
6533 | err = build_sched_domains(&cpu_default_map); | 6538 | err = build_sched_domains(&cpu_default_map); |
6534 | 6539 | ||
6535 | return err; | 6540 | return err; |
6536 | } | 6541 | } |
6537 | 6542 | ||
6538 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | 6543 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) |
6539 | { | 6544 | { |
6540 | free_sched_groups(cpu_map); | 6545 | free_sched_groups(cpu_map); |
6541 | } | 6546 | } |
6542 | 6547 | ||
6543 | /* | 6548 | /* |
6544 | * Detach sched domains from a group of cpus specified in cpu_map | 6549 | * Detach sched domains from a group of cpus specified in cpu_map |
6545 | * These cpus will now be attached to the NULL domain | 6550 | * These cpus will now be attached to the NULL domain |
6546 | */ | 6551 | */ |
6547 | static void detach_destroy_domains(const cpumask_t *cpu_map) | 6552 | static void detach_destroy_domains(const cpumask_t *cpu_map) |
6548 | { | 6553 | { |
6549 | int i; | 6554 | int i; |
6550 | 6555 | ||
6551 | for_each_cpu_mask(i, *cpu_map) | 6556 | for_each_cpu_mask(i, *cpu_map) |
6552 | cpu_attach_domain(NULL, i); | 6557 | cpu_attach_domain(NULL, i); |
6553 | synchronize_sched(); | 6558 | synchronize_sched(); |
6554 | arch_destroy_sched_domains(cpu_map); | 6559 | arch_destroy_sched_domains(cpu_map); |
6555 | } | 6560 | } |
6556 | 6561 | ||
6557 | /* | 6562 | /* |
6558 | * Partition sched domains as specified by the cpumasks below. | 6563 | * Partition sched domains as specified by the cpumasks below. |
6559 | * This attaches all cpus from the cpumasks to the NULL domain, | 6564 | * This attaches all cpus from the cpumasks to the NULL domain, |
6560 | * waits for an RCU quiescent period, recalculates sched | 6565 | * waits for an RCU quiescent period, recalculates sched |
6561 | * domain information and then attaches them back to the | 6566 | * domain information and then attaches them back to the |
6562 | * correct sched domains | 6567 | * correct sched domains |
6563 | * Call with hotplug lock held | 6568 | * Call with hotplug lock held |
6564 | */ | 6569 | */ |
6565 | int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | 6570 | int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) |
6566 | { | 6571 | { |
6567 | cpumask_t change_map; | 6572 | cpumask_t change_map; |
6568 | int err = 0; | 6573 | int err = 0; |
6569 | 6574 | ||
6570 | cpus_and(*partition1, *partition1, cpu_online_map); | 6575 | cpus_and(*partition1, *partition1, cpu_online_map); |
6571 | cpus_and(*partition2, *partition2, cpu_online_map); | 6576 | cpus_and(*partition2, *partition2, cpu_online_map); |
6572 | cpus_or(change_map, *partition1, *partition2); | 6577 | cpus_or(change_map, *partition1, *partition2); |
6573 | 6578 | ||
6574 | /* Detach sched domains from all of the affected cpus */ | 6579 | /* Detach sched domains from all of the affected cpus */ |
6575 | detach_destroy_domains(&change_map); | 6580 | detach_destroy_domains(&change_map); |
6576 | if (!cpus_empty(*partition1)) | 6581 | if (!cpus_empty(*partition1)) |
6577 | err = build_sched_domains(partition1); | 6582 | err = build_sched_domains(partition1); |
6578 | if (!err && !cpus_empty(*partition2)) | 6583 | if (!err && !cpus_empty(*partition2)) |
6579 | err = build_sched_domains(partition2); | 6584 | err = build_sched_domains(partition2); |
6580 | 6585 | ||
6581 | return err; | 6586 | return err; |
6582 | } | 6587 | } |
6583 | 6588 | ||
6584 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 6589 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
6585 | int arch_reinit_sched_domains(void) | 6590 | int arch_reinit_sched_domains(void) |
6586 | { | 6591 | { |
6587 | int err; | 6592 | int err; |
6588 | 6593 | ||
6589 | lock_cpu_hotplug(); | 6594 | lock_cpu_hotplug(); |
6590 | detach_destroy_domains(&cpu_online_map); | 6595 | detach_destroy_domains(&cpu_online_map); |
6591 | err = arch_init_sched_domains(&cpu_online_map); | 6596 | err = arch_init_sched_domains(&cpu_online_map); |
6592 | unlock_cpu_hotplug(); | 6597 | unlock_cpu_hotplug(); |
6593 | 6598 | ||
6594 | return err; | 6599 | return err; |
6595 | } | 6600 | } |
6596 | 6601 | ||
6597 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | 6602 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) |
6598 | { | 6603 | { |
6599 | int ret; | 6604 | int ret; |
6600 | 6605 | ||
6601 | if (buf[0] != '0' && buf[0] != '1') | 6606 | if (buf[0] != '0' && buf[0] != '1') |
6602 | return -EINVAL; | 6607 | return -EINVAL; |
6603 | 6608 | ||
6604 | if (smt) | 6609 | if (smt) |
6605 | sched_smt_power_savings = (buf[0] == '1'); | 6610 | sched_smt_power_savings = (buf[0] == '1'); |
6606 | else | 6611 | else |
6607 | sched_mc_power_savings = (buf[0] == '1'); | 6612 | sched_mc_power_savings = (buf[0] == '1'); |
6608 | 6613 | ||
6609 | ret = arch_reinit_sched_domains(); | 6614 | ret = arch_reinit_sched_domains(); |
6610 | 6615 | ||
6611 | return ret ? ret : count; | 6616 | return ret ? ret : count; |
6612 | } | 6617 | } |
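These store/show helpers back the sched_mc_power_savings and sched_smt_power_savings attributes registered below: anything other than a leading '0' or '1' is rejected with -EINVAL, and a valid write triggers a full domain rebuild via arch_reinit_sched_domains(). A hedged user-space sketch of toggling the multi-core knob (the sysfs path is assumed from the usual location of cpu sysdev-class attributes and is not quoted from this patch):

#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	/* assumed path for the cpu sysdev-class attribute */
	int fd = open("/sys/devices/system/cpu/sched_mc_power_savings", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "1", 1) != 1)	/* only a leading '0' or '1' is accepted */
		perror("write");
	close(fd);
	return 0;
}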
6613 | 6618 | ||
6614 | int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | 6619 | int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) |
6615 | { | 6620 | { |
6616 | int err = 0; | 6621 | int err = 0; |
6617 | 6622 | ||
6618 | #ifdef CONFIG_SCHED_SMT | 6623 | #ifdef CONFIG_SCHED_SMT |
6619 | if (smt_capable()) | 6624 | if (smt_capable()) |
6620 | err = sysfs_create_file(&cls->kset.kobj, | 6625 | err = sysfs_create_file(&cls->kset.kobj, |
6621 | &attr_sched_smt_power_savings.attr); | 6626 | &attr_sched_smt_power_savings.attr); |
6622 | #endif | 6627 | #endif |
6623 | #ifdef CONFIG_SCHED_MC | 6628 | #ifdef CONFIG_SCHED_MC |
6624 | if (!err && mc_capable()) | 6629 | if (!err && mc_capable()) |
6625 | err = sysfs_create_file(&cls->kset.kobj, | 6630 | err = sysfs_create_file(&cls->kset.kobj, |
6626 | &attr_sched_mc_power_savings.attr); | 6631 | &attr_sched_mc_power_savings.attr); |
6627 | #endif | 6632 | #endif |
6628 | return err; | 6633 | return err; |
6629 | } | 6634 | } |
6630 | #endif | 6635 | #endif |
6631 | 6636 | ||
6632 | #ifdef CONFIG_SCHED_MC | 6637 | #ifdef CONFIG_SCHED_MC |
6633 | static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) | 6638 | static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) |
6634 | { | 6639 | { |
6635 | return sprintf(page, "%u\n", sched_mc_power_savings); | 6640 | return sprintf(page, "%u\n", sched_mc_power_savings); |
6636 | } | 6641 | } |
6637 | static ssize_t sched_mc_power_savings_store(struct sys_device *dev, | 6642 | static ssize_t sched_mc_power_savings_store(struct sys_device *dev, |
6638 | const char *buf, size_t count) | 6643 | const char *buf, size_t count) |
6639 | { | 6644 | { |
6640 | return sched_power_savings_store(buf, count, 0); | 6645 | return sched_power_savings_store(buf, count, 0); |
6641 | } | 6646 | } |
6642 | SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, | 6647 | SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, |
6643 | sched_mc_power_savings_store); | 6648 | sched_mc_power_savings_store); |
6644 | #endif | 6649 | #endif |
6645 | 6650 | ||
6646 | #ifdef CONFIG_SCHED_SMT | 6651 | #ifdef CONFIG_SCHED_SMT |
6647 | static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) | 6652 | static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) |
6648 | { | 6653 | { |
6649 | return sprintf(page, "%u\n", sched_smt_power_savings); | 6654 | return sprintf(page, "%u\n", sched_smt_power_savings); |
6650 | } | 6655 | } |
6651 | static ssize_t sched_smt_power_savings_store(struct sys_device *dev, | 6656 | static ssize_t sched_smt_power_savings_store(struct sys_device *dev, |
6652 | const char *buf, size_t count) | 6657 | const char *buf, size_t count) |
6653 | { | 6658 | { |
6654 | return sched_power_savings_store(buf, count, 1); | 6659 | return sched_power_savings_store(buf, count, 1); |
6655 | } | 6660 | } |
6656 | SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, | 6661 | SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, |
6657 | sched_smt_power_savings_store); | 6662 | sched_smt_power_savings_store); |
6658 | #endif | 6663 | #endif |
6659 | 6664 | ||
6660 | /* | 6665 | /* |
6661 | * Force a reinitialization of the sched domains hierarchy. The domains | 6666 | * Force a reinitialization of the sched domains hierarchy. The domains |
6662 | * and groups cannot be updated in place without racing with the balancing | 6667 | * and groups cannot be updated in place without racing with the balancing |
6663 | * code, so we temporarily attach all running cpus to the NULL domain | 6668 | * code, so we temporarily attach all running cpus to the NULL domain |
6664 | * which will prevent rebalancing while the sched domains are recalculated. | 6669 | * which will prevent rebalancing while the sched domains are recalculated. |
6665 | */ | 6670 | */ |
6666 | static int update_sched_domains(struct notifier_block *nfb, | 6671 | static int update_sched_domains(struct notifier_block *nfb, |
6667 | unsigned long action, void *hcpu) | 6672 | unsigned long action, void *hcpu) |
6668 | { | 6673 | { |
6669 | switch (action) { | 6674 | switch (action) { |
6670 | case CPU_UP_PREPARE: | 6675 | case CPU_UP_PREPARE: |
6671 | case CPU_DOWN_PREPARE: | 6676 | case CPU_DOWN_PREPARE: |
6672 | detach_destroy_domains(&cpu_online_map); | 6677 | detach_destroy_domains(&cpu_online_map); |
6673 | return NOTIFY_OK; | 6678 | return NOTIFY_OK; |
6674 | 6679 | ||
6675 | case CPU_UP_CANCELED: | 6680 | case CPU_UP_CANCELED: |
6676 | case CPU_DOWN_FAILED: | 6681 | case CPU_DOWN_FAILED: |
6677 | case CPU_ONLINE: | 6682 | case CPU_ONLINE: |
6678 | case CPU_DEAD: | 6683 | case CPU_DEAD: |
6679 | /* | 6684 | /* |
6680 | * Fall through and re-initialise the domains. | 6685 | * Fall through and re-initialise the domains. |
6681 | */ | 6686 | */ |
6682 | break; | 6687 | break; |
6683 | default: | 6688 | default: |
6684 | return NOTIFY_DONE; | 6689 | return NOTIFY_DONE; |
6685 | } | 6690 | } |
6686 | 6691 | ||
6687 | /* The hotplug lock is already held by cpu_up/cpu_down */ | 6692 | /* The hotplug lock is already held by cpu_up/cpu_down */ |
6688 | arch_init_sched_domains(&cpu_online_map); | 6693 | arch_init_sched_domains(&cpu_online_map); |
6689 | 6694 | ||
6690 | return NOTIFY_OK; | 6695 | return NOTIFY_OK; |
6691 | } | 6696 | } |
6692 | 6697 | ||
6693 | void __init sched_init_smp(void) | 6698 | void __init sched_init_smp(void) |
6694 | { | 6699 | { |
6695 | cpumask_t non_isolated_cpus; | 6700 | cpumask_t non_isolated_cpus; |
6696 | 6701 | ||
6697 | lock_cpu_hotplug(); | 6702 | lock_cpu_hotplug(); |
6698 | arch_init_sched_domains(&cpu_online_map); | 6703 | arch_init_sched_domains(&cpu_online_map); |
6699 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); | 6704 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); |
6700 | if (cpus_empty(non_isolated_cpus)) | 6705 | if (cpus_empty(non_isolated_cpus)) |
6701 | cpu_set(smp_processor_id(), non_isolated_cpus); | 6706 | cpu_set(smp_processor_id(), non_isolated_cpus); |
6702 | unlock_cpu_hotplug(); | 6707 | unlock_cpu_hotplug(); |
6703 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 6708 | /* XXX: Theoretical race here - CPU may be hotplugged now */ |
6704 | hotcpu_notifier(update_sched_domains, 0); | 6709 | hotcpu_notifier(update_sched_domains, 0); |
6705 | 6710 | ||
6706 | /* Move init over to a non-isolated CPU */ | 6711 | /* Move init over to a non-isolated CPU */ |
6707 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) | 6712 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) |
6708 | BUG(); | 6713 | BUG(); |
6709 | } | 6714 | } |
6710 | #else | 6715 | #else |
6711 | void __init sched_init_smp(void) | 6716 | void __init sched_init_smp(void) |
6712 | { | 6717 | { |
6713 | } | 6718 | } |
6714 | #endif /* CONFIG_SMP */ | 6719 | #endif /* CONFIG_SMP */ |
6715 | 6720 | ||
6716 | int in_sched_functions(unsigned long addr) | 6721 | int in_sched_functions(unsigned long addr) |
6717 | { | 6722 | { |
6718 | /* Linker adds these: start and end of __sched functions */ | 6723 | /* Linker adds these: start and end of __sched functions */ |
6719 | extern char __sched_text_start[], __sched_text_end[]; | 6724 | extern char __sched_text_start[], __sched_text_end[]; |
6720 | 6725 | ||
6721 | return in_lock_functions(addr) || | 6726 | return in_lock_functions(addr) || |
6722 | (addr >= (unsigned long)__sched_text_start | 6727 | (addr >= (unsigned long)__sched_text_start |
6723 | && addr < (unsigned long)__sched_text_end); | 6728 | && addr < (unsigned long)__sched_text_end); |
6724 | } | 6729 | } |
6725 | 6730 | ||
6726 | void __init sched_init(void) | 6731 | void __init sched_init(void) |
6727 | { | 6732 | { |
6728 | int i, j, k; | 6733 | int i, j, k; |
6734 | int highest_cpu = 0; | ||
6729 | 6735 | ||
6730 | for_each_possible_cpu(i) { | 6736 | for_each_possible_cpu(i) { |
6731 | struct prio_array *array; | 6737 | struct prio_array *array; |
6732 | struct rq *rq; | 6738 | struct rq *rq; |
6733 | 6739 | ||
6734 | rq = cpu_rq(i); | 6740 | rq = cpu_rq(i); |
6735 | spin_lock_init(&rq->lock); | 6741 | spin_lock_init(&rq->lock); |
6736 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); | 6742 | lockdep_set_class(&rq->lock, &rq->rq_lock_key); |
6737 | rq->nr_running = 0; | 6743 | rq->nr_running = 0; |
6738 | rq->active = rq->arrays; | 6744 | rq->active = rq->arrays; |
6739 | rq->expired = rq->arrays + 1; | 6745 | rq->expired = rq->arrays + 1; |
6740 | rq->best_expired_prio = MAX_PRIO; | 6746 | rq->best_expired_prio = MAX_PRIO; |
6741 | 6747 | ||
6742 | #ifdef CONFIG_SMP | 6748 | #ifdef CONFIG_SMP |
6743 | rq->sd = NULL; | 6749 | rq->sd = NULL; |
6744 | for (j = 1; j < 3; j++) | 6750 | for (j = 1; j < 3; j++) |
6745 | rq->cpu_load[j] = 0; | 6751 | rq->cpu_load[j] = 0; |
6746 | rq->active_balance = 0; | 6752 | rq->active_balance = 0; |
6747 | rq->push_cpu = 0; | 6753 | rq->push_cpu = 0; |
6748 | rq->cpu = i; | 6754 | rq->cpu = i; |
6749 | rq->migration_thread = NULL; | 6755 | rq->migration_thread = NULL; |
6750 | INIT_LIST_HEAD(&rq->migration_queue); | 6756 | INIT_LIST_HEAD(&rq->migration_queue); |
6751 | #endif | 6757 | #endif |
6752 | atomic_set(&rq->nr_iowait, 0); | 6758 | atomic_set(&rq->nr_iowait, 0); |
6753 | 6759 | ||
6754 | for (j = 0; j < 2; j++) { | 6760 | for (j = 0; j < 2; j++) { |
6755 | array = rq->arrays + j; | 6761 | array = rq->arrays + j; |
6756 | for (k = 0; k < MAX_PRIO; k++) { | 6762 | for (k = 0; k < MAX_PRIO; k++) { |
6757 | INIT_LIST_HEAD(array->queue + k); | 6763 | INIT_LIST_HEAD(array->queue + k); |
6758 | __clear_bit(k, array->bitmap); | 6764 | __clear_bit(k, array->bitmap); |
6759 | } | 6765 | } |
6760 | // delimiter for bitsearch | 6766 | // delimiter for bitsearch |
6761 | __set_bit(MAX_PRIO, array->bitmap); | 6767 | __set_bit(MAX_PRIO, array->bitmap); |
6762 | } | 6768 | } |
6769 | highest_cpu = i; | ||
6763 | } | 6770 | } |
6764 | 6771 | ||
6765 | set_load_weight(&init_task); | 6772 | set_load_weight(&init_task); |
6766 | 6773 | ||
6767 | #ifdef CONFIG_SMP | 6774 | #ifdef CONFIG_SMP |
6775 | nr_cpu_ids = highest_cpu + 1; | ||
6768 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); | 6776 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); |
6769 | #endif | 6777 | #endif |
6770 | 6778 | ||
6771 | #ifdef CONFIG_RT_MUTEXES | 6779 | #ifdef CONFIG_RT_MUTEXES |
6772 | plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); | 6780 | plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); |
6773 | #endif | 6781 | #endif |
6774 | 6782 | ||
6775 | /* | 6783 | /* |
6776 | * The boot idle thread does lazy MMU switching as well: | 6784 | * The boot idle thread does lazy MMU switching as well: |
6777 | */ | 6785 | */ |
6778 | atomic_inc(&init_mm.mm_count); | 6786 | atomic_inc(&init_mm.mm_count); |
6779 | enter_lazy_tlb(&init_mm, current); | 6787 | enter_lazy_tlb(&init_mm, current); |
6780 | 6788 | ||
6781 | /* | 6789 | /* |
6782 | * Make us the idle thread. Technically, schedule() should not be | 6790 | * Make us the idle thread. Technically, schedule() should not be |
6783 | * called from this thread, however somewhere below it might be, | 6791 | * called from this thread, however somewhere below it might be, |
6784 | * but because we are the idle thread, we just pick up running again | 6792 | * but because we are the idle thread, we just pick up running again |
6785 | * when this runqueue becomes "idle". | 6793 | * when this runqueue becomes "idle". |
6786 | */ | 6794 | */ |
6787 | init_idle(current, smp_processor_id()); | 6795 | init_idle(current, smp_processor_id()); |
6788 | } | 6796 | } |
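The highest_cpu bookkeeping added in this hunk piggybacks on the existing for_each_possible_cpu() walk: the loop visits possible CPUs in ascending order, so the last index seen plus one is exactly the value needed for nr_cpu_ids, with no extra pass over the map. A stand-alone sketch of the same calculation over a toy possible-map (the array stands in for cpu_possible_map; names are illustrative):

#include <stdio.h>

#define NR_CPUS 8

/* toy "possible" map: CPUs 0, 1, 2 and 5 exist */
static const int cpu_possible[NR_CPUS] = { 1, 1, 1, 0, 0, 1, 0, 0 };

int main(void)
{
	int i, highest_cpu = 0, nr_cpu_ids;

	for (i = 0; i < NR_CPUS; i++) {
		if (!cpu_possible[i])
			continue;
		/* per-cpu runqueue setup would happen here */
		highest_cpu = i;
	}
	nr_cpu_ids = highest_cpu + 1;
	printf("nr_cpu_ids = %d\n", nr_cpu_ids);	/* 6, not 4: ids, not a count of CPUs */
	return 0;
}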
6789 | 6797 | ||
6790 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 6798 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
6791 | void __might_sleep(char *file, int line) | 6799 | void __might_sleep(char *file, int line) |
6792 | { | 6800 | { |
6793 | #ifdef in_atomic | 6801 | #ifdef in_atomic |
6794 | static unsigned long prev_jiffy; /* ratelimiting */ | 6802 | static unsigned long prev_jiffy; /* ratelimiting */ |
6795 | 6803 | ||
6796 | if ((in_atomic() || irqs_disabled()) && | 6804 | if ((in_atomic() || irqs_disabled()) && |
6797 | system_state == SYSTEM_RUNNING && !oops_in_progress) { | 6805 | system_state == SYSTEM_RUNNING && !oops_in_progress) { |
6798 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) | 6806 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) |
6799 | return; | 6807 | return; |
6800 | prev_jiffy = jiffies; | 6808 | prev_jiffy = jiffies; |
6801 | printk(KERN_ERR "BUG: sleeping function called from invalid" | 6809 | printk(KERN_ERR "BUG: sleeping function called from invalid" |
6802 | " context at %s:%d\n", file, line); | 6810 | " context at %s:%d\n", file, line); |
6803 | printk("in_atomic():%d, irqs_disabled():%d\n", | 6811 | printk("in_atomic():%d, irqs_disabled():%d\n", |
6804 | in_atomic(), irqs_disabled()); | 6812 | in_atomic(), irqs_disabled()); |
6805 | debug_show_held_locks(current); | 6813 | debug_show_held_locks(current); |
6806 | if (irqs_disabled()) | 6814 | if (irqs_disabled()) |
6807 | print_irqtrace_events(current); | 6815 | print_irqtrace_events(current); |
6808 | dump_stack(); | 6816 | dump_stack(); |
6809 | } | 6817 | } |
6810 | #endif | 6818 | #endif |
6811 | } | 6819 | } |
6812 | EXPORT_SYMBOL(__might_sleep); | 6820 | EXPORT_SYMBOL(__might_sleep); |
6813 | #endif | 6821 | #endif |
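The prev_jiffy test above is a one-report-per-second ratelimit: once a warning is printed, further hits are ignored until a full HZ worth of jiffies has passed. The same pattern in miniature user-space form, using whole seconds in place of jiffies (and ignoring wraparound, which time_before() handles in the kernel):

#include <stdio.h>
#include <time.h>

/* report at most once per second, like the prev_jiffy/HZ check above */
static int ratelimited_report(void)
{
	static time_t prev;
	time_t now = time(NULL);

	if (prev && now < prev + 1)
		return 0;		/* suppressed */
	prev = now;
	return 1;
}

int main(void)
{
	int i, printed = 0;

	for (i = 0; i < 1000000; i++)
		if (ratelimited_report())
			printed++;
	printf("printed %d report(s)\n", printed);	/* typically 1 */
	return 0;
}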
6814 | 6822 | ||
6815 | #ifdef CONFIG_MAGIC_SYSRQ | 6823 | #ifdef CONFIG_MAGIC_SYSRQ |
6816 | void normalize_rt_tasks(void) | 6824 | void normalize_rt_tasks(void) |
6817 | { | 6825 | { |
6818 | struct prio_array *array; | 6826 | struct prio_array *array; |
6819 | struct task_struct *p; | 6827 | struct task_struct *p; |
6820 | unsigned long flags; | 6828 | unsigned long flags; |
6821 | struct rq *rq; | 6829 | struct rq *rq; |
6822 | 6830 | ||
6823 | read_lock_irq(&tasklist_lock); | 6831 | read_lock_irq(&tasklist_lock); |
6824 | for_each_process(p) { | 6832 | for_each_process(p) { |
6825 | if (!rt_task(p)) | 6833 | if (!rt_task(p)) |
6826 | continue; | 6834 | continue; |
6827 | 6835 | ||
6828 | spin_lock_irqsave(&p->pi_lock, flags); | 6836 | spin_lock_irqsave(&p->pi_lock, flags); |
6829 | rq = __task_rq_lock(p); | 6837 | rq = __task_rq_lock(p); |
6830 | 6838 | ||
6831 | array = p->array; | 6839 | array = p->array; |
6832 | if (array) | 6840 | if (array) |
6833 | deactivate_task(p, task_rq(p)); | 6841 | deactivate_task(p, task_rq(p)); |
6834 | __setscheduler(p, SCHED_NORMAL, 0); | 6842 | __setscheduler(p, SCHED_NORMAL, 0); |
6835 | if (array) { | 6843 | if (array) { |
6836 | __activate_task(p, task_rq(p)); | 6844 | __activate_task(p, task_rq(p)); |
6837 | resched_task(rq->curr); | 6845 | resched_task(rq->curr); |
6838 | } | 6846 | } |
6839 | 6847 | ||
6840 | __task_rq_unlock(rq); | 6848 | __task_rq_unlock(rq); |
6841 | spin_unlock_irqrestore(&p->pi_lock, flags); | 6849 | spin_unlock_irqrestore(&p->pi_lock, flags); |
6842 | } | 6850 | } |
6843 | read_unlock_irq(&tasklist_lock); | 6851 | read_unlock_irq(&tasklist_lock); |
6844 | } | 6852 | } |
6845 | 6853 | ||
6846 | #endif /* CONFIG_MAGIC_SYSRQ */ | 6854 | #endif /* CONFIG_MAGIC_SYSRQ */ |
6847 | 6855 | ||
6848 | #ifdef CONFIG_IA64 | 6856 | #ifdef CONFIG_IA64 |
6849 | /* | 6857 | /* |
6850 | * These functions are only useful for the IA64 MCA handling. | 6858 | * These functions are only useful for the IA64 MCA handling. |
6851 | * | 6859 | * |
6852 | * They can only be called when the whole system has been | 6860 | * They can only be called when the whole system has been |
6853 | * stopped - every CPU needs to be quiescent, and no scheduling | 6861 | * stopped - every CPU needs to be quiescent, and no scheduling |
6854 | * activity can take place. Using them for anything else would | 6862 | * activity can take place. Using them for anything else would |
6855 | * be a serious bug, and as a result, they aren't even visible | 6863 | * be a serious bug, and as a result, they aren't even visible |
6856 | * under any other configuration. | 6864 | * under any other configuration. |
6857 | */ | 6865 | */ |
6858 | 6866 | ||
6859 | /** | 6867 | /** |
6860 | * curr_task - return the current task for a given cpu. | 6868 | * curr_task - return the current task for a given cpu. |
6861 | * @cpu: the processor in question. | 6869 | * @cpu: the processor in question. |
6862 | * | 6870 | * |
6863 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 6871 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
6864 | */ | 6872 | */ |
6865 | struct task_struct *curr_task(int cpu) | 6873 | struct task_struct *curr_task(int cpu) |
6866 | { | 6874 | { |
6867 | return cpu_curr(cpu); | 6875 | return cpu_curr(cpu); |
6868 | } | 6876 | } |
6869 | 6877 | ||
6870 | /** | 6878 | /** |
6871 | * set_curr_task - set the current task for a given cpu. | 6879 | * set_curr_task - set the current task for a given cpu. |
6872 | * @cpu: the processor in question. | 6880 | * @cpu: the processor in question. |
6873 | * @p: the task pointer to set. | 6881 | * @p: the task pointer to set. |
6874 | * | 6882 | * |
6875 | * Description: This function must only be used when non-maskable interrupts | 6883 | * Description: This function must only be used when non-maskable interrupts |
6876 | * are serviced on a separate stack. It allows the architecture to switch the | 6884 | * are serviced on a separate stack. It allows the architecture to switch the |
6877 | * notion of the current task on a cpu in a non-blocking manner. This function | 6885 | * notion of the current task on a cpu in a non-blocking manner. This function |
6878 | * must be called with all CPUs synchronized and interrupts disabled, and | 6886 | * must be called with all CPUs synchronized and interrupts disabled, and |
6879 | * the caller must save the original value of the current task (see | 6887 | * the caller must save the original value of the current task (see |
6880 | * curr_task() above) and restore that value before reenabling interrupts and | 6888 | * curr_task() above) and restore that value before reenabling interrupts and |
6881 | * re-starting the system. | 6889 | * re-starting the system. |
6882 | * | 6890 | * |
6883 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | 6891 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! |
6884 | */ | 6892 | */ |
6885 | void set_curr_task(int cpu, struct task_struct *p) | 6893 | void set_curr_task(int cpu, struct task_struct *p) |
6886 | { | 6894 | { |
6887 | cpu_curr(cpu) = p; | 6895 | cpu_curr(cpu) = p; |
6888 | } | 6896 | } |
6889 | 6897 | ||
6890 | #endif | 6898 | #endif |
6891 | 6899 |
lib/cpumask.c
1 | #include <linux/kernel.h> | 1 | #include <linux/kernel.h> |
2 | #include <linux/bitops.h> | 2 | #include <linux/bitops.h> |
3 | #include <linux/cpumask.h> | 3 | #include <linux/cpumask.h> |
4 | #include <linux/module.h> | 4 | #include <linux/module.h> |
5 | 5 | ||
6 | int __first_cpu(const cpumask_t *srcp) | 6 | int __first_cpu(const cpumask_t *srcp) |
7 | { | 7 | { |
8 | return min_t(int, NR_CPUS, find_first_bit(srcp->bits, NR_CPUS)); | 8 | return min_t(int, NR_CPUS, find_first_bit(srcp->bits, NR_CPUS)); |
9 | } | 9 | } |
10 | EXPORT_SYMBOL(__first_cpu); | 10 | EXPORT_SYMBOL(__first_cpu); |
11 | 11 | ||
12 | int __next_cpu(int n, const cpumask_t *srcp) | 12 | int __next_cpu(int n, const cpumask_t *srcp) |
13 | { | 13 | { |
14 | return min_t(int, NR_CPUS, find_next_bit(srcp->bits, NR_CPUS, n+1)); | 14 | return min_t(int, NR_CPUS, find_next_bit(srcp->bits, NR_CPUS, n+1)); |
15 | } | 15 | } |
16 | EXPORT_SYMBOL(__next_cpu); | 16 | EXPORT_SYMBOL(__next_cpu); |
17 | 17 | ||
18 | int nr_cpu_ids; | ||
19 | EXPORT_SYMBOL(nr_cpu_ids); | ||
20 | |||
21 | int __any_online_cpu(const cpumask_t *mask) | 18 | int __any_online_cpu(const cpumask_t *mask) |
22 | { | 19 | { |
23 | int cpu; | 20 | int cpu; |
24 | 21 | ||
25 | for_each_cpu_mask(cpu, *mask) { | 22 | for_each_cpu_mask(cpu, *mask) { |
26 | if (cpu_online(cpu)) | 23 | if (cpu_online(cpu)) |
27 | break; | 24 | break; |
28 | } | 25 | } |
29 | return cpu; | 26 | return cpu; |
30 | } | 27 | } |
31 | EXPORT_SYMBOL(__any_online_cpu); | 28 | EXPORT_SYMBOL(__any_online_cpu); |
32 | 29 |
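Besides dropping the nr_cpu_ids definition and export from this file, the helpers left here only clamp the bitmap search so that callers get NR_CPUS back when no further bit is set, which is what lets for_each_cpu_mask() terminate. A small stand-alone model of that behaviour, with a plain unsigned long standing in for cpumask_t (names are illustrative):

#include <stdio.h>

#define NR_CPUS 8

/* clamp to NR_CPUS when no set bit remains, like __first_cpu()/__next_cpu() */
static int next_cpu(int n, unsigned long mask)
{
	int i;

	for (i = n + 1; i < NR_CPUS; i++)
		if (mask & (1UL << i))
			return i;
	return NR_CPUS;
}

int main(void)
{
	unsigned long mask = 0x26;	/* CPUs 1, 2 and 5 */
	int cpu;

	/* equivalent of for_each_cpu_mask(cpu, mask) */
	for (cpu = next_cpu(-1, mask); cpu < NR_CPUS; cpu = next_cpu(cpu, mask))
		printf("cpu %d\n", cpu);
	return 0;
}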
mm/page_alloc.c
1 | /* | 1 | /* |
2 | * linux/mm/page_alloc.c | 2 | * linux/mm/page_alloc.c |
3 | * | 3 | * |
4 | * Manages the free list, the system allocates free pages here. | 4 | * Manages the free list, the system allocates free pages here. |
5 | * Note that kmalloc() lives in slab.c | 5 | * Note that kmalloc() lives in slab.c |
6 | * | 6 | * |
7 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | 7 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds |
8 | * Swap reorganised 29.12.95, Stephen Tweedie | 8 | * Swap reorganised 29.12.95, Stephen Tweedie |
9 | * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 | 9 | * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 |
10 | * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 | 10 | * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 |
11 | * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 | 11 | * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 |
12 | * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 | 12 | * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 |
13 | * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 | 13 | * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 |
14 | * (lots of bits borrowed from Ingo Molnar & Andrew Morton) | 14 | * (lots of bits borrowed from Ingo Molnar & Andrew Morton) |
15 | */ | 15 | */ |
16 | 16 | ||
17 | #include <linux/stddef.h> | 17 | #include <linux/stddef.h> |
18 | #include <linux/mm.h> | 18 | #include <linux/mm.h> |
19 | #include <linux/swap.h> | 19 | #include <linux/swap.h> |
20 | #include <linux/interrupt.h> | 20 | #include <linux/interrupt.h> |
21 | #include <linux/pagemap.h> | 21 | #include <linux/pagemap.h> |
22 | #include <linux/bootmem.h> | 22 | #include <linux/bootmem.h> |
23 | #include <linux/compiler.h> | 23 | #include <linux/compiler.h> |
24 | #include <linux/kernel.h> | 24 | #include <linux/kernel.h> |
25 | #include <linux/module.h> | 25 | #include <linux/module.h> |
26 | #include <linux/suspend.h> | 26 | #include <linux/suspend.h> |
27 | #include <linux/pagevec.h> | 27 | #include <linux/pagevec.h> |
28 | #include <linux/blkdev.h> | 28 | #include <linux/blkdev.h> |
29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
30 | #include <linux/notifier.h> | 30 | #include <linux/notifier.h> |
31 | #include <linux/topology.h> | 31 | #include <linux/topology.h> |
32 | #include <linux/sysctl.h> | 32 | #include <linux/sysctl.h> |
33 | #include <linux/cpu.h> | 33 | #include <linux/cpu.h> |
34 | #include <linux/cpuset.h> | 34 | #include <linux/cpuset.h> |
35 | #include <linux/memory_hotplug.h> | 35 | #include <linux/memory_hotplug.h> |
36 | #include <linux/nodemask.h> | 36 | #include <linux/nodemask.h> |
37 | #include <linux/vmalloc.h> | 37 | #include <linux/vmalloc.h> |
38 | #include <linux/mempolicy.h> | 38 | #include <linux/mempolicy.h> |
39 | #include <linux/stop_machine.h> | 39 | #include <linux/stop_machine.h> |
40 | #include <linux/sort.h> | 40 | #include <linux/sort.h> |
41 | #include <linux/pfn.h> | 41 | #include <linux/pfn.h> |
42 | #include <linux/backing-dev.h> | 42 | #include <linux/backing-dev.h> |
43 | #include <linux/fault-inject.h> | 43 | #include <linux/fault-inject.h> |
44 | 44 | ||
45 | #include <asm/tlbflush.h> | 45 | #include <asm/tlbflush.h> |
46 | #include <asm/div64.h> | 46 | #include <asm/div64.h> |
47 | #include "internal.h" | 47 | #include "internal.h" |
48 | 48 | ||
49 | /* | 49 | /* |
50 | * MCD - HACK: Find somewhere to initialize this EARLY, or make this | 50 | * MCD - HACK: Find somewhere to initialize this EARLY, or make this |
51 | * initializer cleaner | 51 | * initializer cleaner |
52 | */ | 52 | */ |
53 | nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; | 53 | nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; |
54 | EXPORT_SYMBOL(node_online_map); | 54 | EXPORT_SYMBOL(node_online_map); |
55 | nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; | 55 | nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; |
56 | EXPORT_SYMBOL(node_possible_map); | 56 | EXPORT_SYMBOL(node_possible_map); |
57 | unsigned long totalram_pages __read_mostly; | 57 | unsigned long totalram_pages __read_mostly; |
58 | unsigned long totalreserve_pages __read_mostly; | 58 | unsigned long totalreserve_pages __read_mostly; |
59 | long nr_swap_pages; | 59 | long nr_swap_pages; |
60 | int percpu_pagelist_fraction; | 60 | int percpu_pagelist_fraction; |
61 | 61 | ||
62 | static void __free_pages_ok(struct page *page, unsigned int order); | 62 | static void __free_pages_ok(struct page *page, unsigned int order); |
63 | 63 | ||
64 | /* | 64 | /* |
65 | * results with 256, 32 in the lowmem_reserve sysctl: | 65 | * results with 256, 32 in the lowmem_reserve sysctl: |
66 | * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) | 66 | * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) |
67 | * 1G machine -> (16M dma, 784M normal, 224M high) | 67 | * 1G machine -> (16M dma, 784M normal, 224M high) |
68 | * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA | 68 | * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA |
69 | * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL | 69 | * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL |
70 | * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA | 70 | * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA |
71 | * | 71 | * |
72 | * TBD: should special case ZONE_DMA32 machines here - in those we normally | 72 | * TBD: should special case ZONE_DMA32 machines here - in those we normally |
73 | * don't need any ZONE_NORMAL reservation | 73 | * don't need any ZONE_NORMAL reservation |
74 | */ | 74 | */ |
75 | int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { | 75 | int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { |
76 | #ifdef CONFIG_ZONE_DMA | 76 | #ifdef CONFIG_ZONE_DMA |
77 | 256, | 77 | 256, |
78 | #endif | 78 | #endif |
79 | #ifdef CONFIG_ZONE_DMA32 | 79 | #ifdef CONFIG_ZONE_DMA32 |
80 | 256, | 80 | 256, |
81 | #endif | 81 | #endif |
82 | #ifdef CONFIG_HIGHMEM | 82 | #ifdef CONFIG_HIGHMEM |
83 | 32 | 83 | 32 |
84 | #endif | 84 | #endif |
85 | }; | 85 | }; |
86 | 86 | ||
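To make the ratios in the comment above concrete: with the default lowmem_reserve ratio of 256, the 1G example reserves roughly 784M/256 ≈ 3M of ZONE_DMA against a NORMAL allocation and (224M+784M)/256 ≈ 4M against a HIGHMEM allocation, while the HIGHMEM case additionally reserves 224M/32 = 7M of ZONE_NORMAL.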
87 | EXPORT_SYMBOL(totalram_pages); | 87 | EXPORT_SYMBOL(totalram_pages); |
88 | 88 | ||
89 | static char * const zone_names[MAX_NR_ZONES] = { | 89 | static char * const zone_names[MAX_NR_ZONES] = { |
90 | #ifdef CONFIG_ZONE_DMA | 90 | #ifdef CONFIG_ZONE_DMA |
91 | "DMA", | 91 | "DMA", |
92 | #endif | 92 | #endif |
93 | #ifdef CONFIG_ZONE_DMA32 | 93 | #ifdef CONFIG_ZONE_DMA32 |
94 | "DMA32", | 94 | "DMA32", |
95 | #endif | 95 | #endif |
96 | "Normal", | 96 | "Normal", |
97 | #ifdef CONFIG_HIGHMEM | 97 | #ifdef CONFIG_HIGHMEM |
98 | "HighMem" | 98 | "HighMem" |
99 | #endif | 99 | #endif |
100 | }; | 100 | }; |
101 | 101 | ||
102 | int min_free_kbytes = 1024; | 102 | int min_free_kbytes = 1024; |
103 | 103 | ||
104 | unsigned long __meminitdata nr_kernel_pages; | 104 | unsigned long __meminitdata nr_kernel_pages; |
105 | unsigned long __meminitdata nr_all_pages; | 105 | unsigned long __meminitdata nr_all_pages; |
106 | static unsigned long __initdata dma_reserve; | 106 | static unsigned long __initdata dma_reserve; |
107 | 107 | ||
108 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | 108 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP |
109 | /* | 109 | /* |
110 | * MAX_ACTIVE_REGIONS determines the maximum number of distinct | 110 | * MAX_ACTIVE_REGIONS determines the maximum number of distinct |
111 | * ranges of memory (RAM) that may be registered with add_active_range(). | 111 | * ranges of memory (RAM) that may be registered with add_active_range(). |
112 | * Ranges passed to add_active_range() will be merged if possible | 112 | * Ranges passed to add_active_range() will be merged if possible |
113 | * so the number of times add_active_range() can be called is | 113 | * so the number of times add_active_range() can be called is |
114 | * related to the number of nodes and the number of holes | 114 | * related to the number of nodes and the number of holes |
115 | */ | 115 | */ |
116 | #ifdef CONFIG_MAX_ACTIVE_REGIONS | 116 | #ifdef CONFIG_MAX_ACTIVE_REGIONS |
117 | /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ | 117 | /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ |
118 | #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS | 118 | #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS |
119 | #else | 119 | #else |
120 | #if MAX_NUMNODES >= 32 | 120 | #if MAX_NUMNODES >= 32 |
121 | /* If there can be many nodes, allow up to 50 holes per node */ | 121 | /* If there can be many nodes, allow up to 50 holes per node */ |
122 | #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) | 122 | #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) |
123 | #else | 123 | #else |
124 | /* By default, allow up to 256 distinct regions */ | 124 | /* By default, allow up to 256 distinct regions */ |
125 | #define MAX_ACTIVE_REGIONS 256 | 125 | #define MAX_ACTIVE_REGIONS 256 |
126 | #endif | 126 | #endif |
127 | #endif | 127 | #endif |
128 | 128 | ||
129 | struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS]; | 129 | struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS]; |
130 | int __initdata nr_nodemap_entries; | 130 | int __initdata nr_nodemap_entries; |
131 | unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; | 131 | unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; |
132 | unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; | 132 | unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; |
133 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | 133 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE |
134 | unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES]; | 134 | unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES]; |
135 | unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES]; | 135 | unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES]; |
136 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | 136 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ |
137 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | 137 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ |
138 | 138 | ||
139 | #ifdef CONFIG_DEBUG_VM | 139 | #ifdef CONFIG_DEBUG_VM |
140 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 140 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
141 | { | 141 | { |
142 | int ret = 0; | 142 | int ret = 0; |
143 | unsigned seq; | 143 | unsigned seq; |
144 | unsigned long pfn = page_to_pfn(page); | 144 | unsigned long pfn = page_to_pfn(page); |
145 | 145 | ||
146 | do { | 146 | do { |
147 | seq = zone_span_seqbegin(zone); | 147 | seq = zone_span_seqbegin(zone); |
148 | if (pfn >= zone->zone_start_pfn + zone->spanned_pages) | 148 | if (pfn >= zone->zone_start_pfn + zone->spanned_pages) |
149 | ret = 1; | 149 | ret = 1; |
150 | else if (pfn < zone->zone_start_pfn) | 150 | else if (pfn < zone->zone_start_pfn) |
151 | ret = 1; | 151 | ret = 1; |
152 | } while (zone_span_seqretry(zone, seq)); | 152 | } while (zone_span_seqretry(zone, seq)); |
153 | 153 | ||
154 | return ret; | 154 | return ret; |
155 | } | 155 | } |
156 | 156 | ||
157 | static int page_is_consistent(struct zone *zone, struct page *page) | 157 | static int page_is_consistent(struct zone *zone, struct page *page) |
158 | { | 158 | { |
159 | #ifdef CONFIG_HOLES_IN_ZONE | 159 | #ifdef CONFIG_HOLES_IN_ZONE |
160 | if (!pfn_valid(page_to_pfn(page))) | 160 | if (!pfn_valid(page_to_pfn(page))) |
161 | return 0; | 161 | return 0; |
162 | #endif | 162 | #endif |
163 | if (zone != page_zone(page)) | 163 | if (zone != page_zone(page)) |
164 | return 0; | 164 | return 0; |
165 | 165 | ||
166 | return 1; | 166 | return 1; |
167 | } | 167 | } |
168 | /* | 168 | /* |
169 | * Temporary debugging check for pages not lying within a given zone. | 169 | * Temporary debugging check for pages not lying within a given zone. |
170 | */ | 170 | */ |
171 | static int bad_range(struct zone *zone, struct page *page) | 171 | static int bad_range(struct zone *zone, struct page *page) |
172 | { | 172 | { |
173 | if (page_outside_zone_boundaries(zone, page)) | 173 | if (page_outside_zone_boundaries(zone, page)) |
174 | return 1; | 174 | return 1; |
175 | if (!page_is_consistent(zone, page)) | 175 | if (!page_is_consistent(zone, page)) |
176 | return 1; | 176 | return 1; |
177 | 177 | ||
178 | return 0; | 178 | return 0; |
179 | } | 179 | } |
180 | #else | 180 | #else |
181 | static inline int bad_range(struct zone *zone, struct page *page) | 181 | static inline int bad_range(struct zone *zone, struct page *page) |
182 | { | 182 | { |
183 | return 0; | 183 | return 0; |
184 | } | 184 | } |
185 | #endif | 185 | #endif |
186 | 186 | ||
187 | static void bad_page(struct page *page) | 187 | static void bad_page(struct page *page) |
188 | { | 188 | { |
189 | printk(KERN_EMERG "Bad page state in process '%s'\n" | 189 | printk(KERN_EMERG "Bad page state in process '%s'\n" |
190 | KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" | 190 | KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" |
191 | KERN_EMERG "Trying to fix it up, but a reboot is needed\n" | 191 | KERN_EMERG "Trying to fix it up, but a reboot is needed\n" |
192 | KERN_EMERG "Backtrace:\n", | 192 | KERN_EMERG "Backtrace:\n", |
193 | current->comm, page, (int)(2*sizeof(unsigned long)), | 193 | current->comm, page, (int)(2*sizeof(unsigned long)), |
194 | (unsigned long)page->flags, page->mapping, | 194 | (unsigned long)page->flags, page->mapping, |
195 | page_mapcount(page), page_count(page)); | 195 | page_mapcount(page), page_count(page)); |
196 | dump_stack(); | 196 | dump_stack(); |
197 | page->flags &= ~(1 << PG_lru | | 197 | page->flags &= ~(1 << PG_lru | |
198 | 1 << PG_private | | 198 | 1 << PG_private | |
199 | 1 << PG_locked | | 199 | 1 << PG_locked | |
200 | 1 << PG_active | | 200 | 1 << PG_active | |
201 | 1 << PG_dirty | | 201 | 1 << PG_dirty | |
202 | 1 << PG_reclaim | | 202 | 1 << PG_reclaim | |
203 | 1 << PG_slab | | 203 | 1 << PG_slab | |
204 | 1 << PG_swapcache | | 204 | 1 << PG_swapcache | |
205 | 1 << PG_writeback | | 205 | 1 << PG_writeback | |
206 | 1 << PG_buddy ); | 206 | 1 << PG_buddy ); |
207 | set_page_count(page, 0); | 207 | set_page_count(page, 0); |
208 | reset_page_mapcount(page); | 208 | reset_page_mapcount(page); |
209 | page->mapping = NULL; | 209 | page->mapping = NULL; |
210 | add_taint(TAINT_BAD_PAGE); | 210 | add_taint(TAINT_BAD_PAGE); |
211 | } | 211 | } |
212 | 212 | ||
213 | /* | 213 | /* |
214 | * Higher-order pages are called "compound pages". They are structured thusly: | 214 | * Higher-order pages are called "compound pages". They are structured thusly: |
215 | * | 215 | * |
216 | * The first PAGE_SIZE page is called the "head page". | 216 | * The first PAGE_SIZE page is called the "head page". |
217 | * | 217 | * |
218 | * The remaining PAGE_SIZE pages are called "tail pages". | 218 | * The remaining PAGE_SIZE pages are called "tail pages". |
219 | * | 219 | * |
220 | * All pages have PG_compound set. All pages have their ->private pointing at | 220 | * All pages have PG_compound set. All pages have their ->private pointing at |
221 | * the head page (even the head page has this). | 221 | * the head page (even the head page has this). |
222 | * | 222 | * |
223 | * The first tail page's ->lru.next holds the address of the compound page's | 223 | * The first tail page's ->lru.next holds the address of the compound page's |
224 | * put_page() function. Its ->lru.prev holds the order of allocation. | 224 | * put_page() function. Its ->lru.prev holds the order of allocation. |
225 | * This usage means that zero-order pages may not be compound. | 225 | * This usage means that zero-order pages may not be compound. |
226 | */ | 226 | */ |
227 | 227 | ||
228 | static void free_compound_page(struct page *page) | 228 | static void free_compound_page(struct page *page) |
229 | { | 229 | { |
230 | __free_pages_ok(page, (unsigned long)page[1].lru.prev); | 230 | __free_pages_ok(page, (unsigned long)page[1].lru.prev); |
231 | } | 231 | } |
232 | 232 | ||
233 | static void prep_compound_page(struct page *page, unsigned long order) | 233 | static void prep_compound_page(struct page *page, unsigned long order) |
234 | { | 234 | { |
235 | int i; | 235 | int i; |
236 | int nr_pages = 1 << order; | 236 | int nr_pages = 1 << order; |
237 | 237 | ||
238 | set_compound_page_dtor(page, free_compound_page); | 238 | set_compound_page_dtor(page, free_compound_page); |
239 | page[1].lru.prev = (void *)order; | 239 | page[1].lru.prev = (void *)order; |
240 | for (i = 0; i < nr_pages; i++) { | 240 | for (i = 0; i < nr_pages; i++) { |
241 | struct page *p = page + i; | 241 | struct page *p = page + i; |
242 | 242 | ||
243 | __SetPageCompound(p); | 243 | __SetPageCompound(p); |
244 | set_page_private(p, (unsigned long)page); | 244 | set_page_private(p, (unsigned long)page); |
245 | } | 245 | } |
246 | } | 246 | } |
247 | 247 | ||
248 | static void destroy_compound_page(struct page *page, unsigned long order) | 248 | static void destroy_compound_page(struct page *page, unsigned long order) |
249 | { | 249 | { |
250 | int i; | 250 | int i; |
251 | int nr_pages = 1 << order; | 251 | int nr_pages = 1 << order; |
252 | 252 | ||
253 | if (unlikely((unsigned long)page[1].lru.prev != order)) | 253 | if (unlikely((unsigned long)page[1].lru.prev != order)) |
254 | bad_page(page); | 254 | bad_page(page); |
255 | 255 | ||
256 | for (i = 0; i < nr_pages; i++) { | 256 | for (i = 0; i < nr_pages; i++) { |
257 | struct page *p = page + i; | 257 | struct page *p = page + i; |
258 | 258 | ||
259 | if (unlikely(!PageCompound(p) | | 259 | if (unlikely(!PageCompound(p) | |
260 | (page_private(p) != (unsigned long)page))) | 260 | (page_private(p) != (unsigned long)page))) |
261 | bad_page(page); | 261 | bad_page(page); |
262 | __ClearPageCompound(p); | 262 | __ClearPageCompound(p); |
263 | } | 263 | } |
264 | } | 264 | } |
265 | 265 | ||
266 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | 266 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) |
267 | { | 267 | { |
268 | int i; | 268 | int i; |
269 | 269 | ||
270 | VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); | 270 | VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); |
271 | /* | 271 | /* |
272 | * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO | 272 | * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO |
273 | * and __GFP_HIGHMEM from hard or soft interrupt context. | 273 | * and __GFP_HIGHMEM from hard or soft interrupt context. |
274 | */ | 274 | */ |
275 | VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); | 275 | VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); |
276 | for (i = 0; i < (1 << order); i++) | 276 | for (i = 0; i < (1 << order); i++) |
277 | clear_highpage(page + i); | 277 | clear_highpage(page + i); |
278 | } | 278 | } |
279 | 279 | ||
280 | /* | 280 | /* |
281 | * function for dealing with page's order in buddy system. | 281 | * function for dealing with page's order in buddy system. |
282 | * zone->lock is already acquired when we use these. | 282 | * zone->lock is already acquired when we use these. |
283 | * So, we don't need atomic page->flags operations here. | 283 | * So, we don't need atomic page->flags operations here. |
284 | */ | 284 | */ |
285 | static inline unsigned long page_order(struct page *page) | 285 | static inline unsigned long page_order(struct page *page) |
286 | { | 286 | { |
287 | return page_private(page); | 287 | return page_private(page); |
288 | } | 288 | } |
289 | 289 | ||
290 | static inline void set_page_order(struct page *page, int order) | 290 | static inline void set_page_order(struct page *page, int order) |
291 | { | 291 | { |
292 | set_page_private(page, order); | 292 | set_page_private(page, order); |
293 | __SetPageBuddy(page); | 293 | __SetPageBuddy(page); |
294 | } | 294 | } |
295 | 295 | ||
296 | static inline void rmv_page_order(struct page *page) | 296 | static inline void rmv_page_order(struct page *page) |
297 | { | 297 | { |
298 | __ClearPageBuddy(page); | 298 | __ClearPageBuddy(page); |
299 | set_page_private(page, 0); | 299 | set_page_private(page, 0); |
300 | } | 300 | } |
301 | 301 | ||
302 | /* | 302 | /* |
303 | * Locate the struct page for both the matching buddy in our | 303 | * Locate the struct page for both the matching buddy in our |
304 | * pair (buddy1) and the combined O(n+1) page they form (page). | 304 | * pair (buddy1) and the combined O(n+1) page they form (page). |
305 | * | 305 | * |
306 | * 1) Any buddy B1 will have an order O twin B2 which satisfies | 306 | * 1) Any buddy B1 will have an order O twin B2 which satisfies |
307 | * the following equation: | 307 | * the following equation: |
308 | * B2 = B1 ^ (1 << O) | 308 | * B2 = B1 ^ (1 << O) |
309 | * For example, if the starting buddy (buddy2) is #8 its order | 309 | * For example, if the starting buddy (buddy2) is #8 its order |
310 | * 1 buddy is #10: | 310 | * 1 buddy is #10: |
311 | * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 | 311 | * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 |
312 | * | 312 | * |
313 | * 2) Any buddy B will have an order O+1 parent P which | 313 | * 2) Any buddy B will have an order O+1 parent P which |
314 | * satisfies the following equation: | 314 | * satisfies the following equation: |
315 | * P = B & ~(1 << O) | 315 | * P = B & ~(1 << O) |
316 | * | 316 | * |
317 | * Assumption: *_mem_map is contiguous at least up to MAX_ORDER | 317 | * Assumption: *_mem_map is contiguous at least up to MAX_ORDER |
318 | */ | 318 | */ |
319 | static inline struct page * | 319 | static inline struct page * |
320 | __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) | 320 | __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) |
321 | { | 321 | { |
322 | unsigned long buddy_idx = page_idx ^ (1 << order); | 322 | unsigned long buddy_idx = page_idx ^ (1 << order); |
323 | 323 | ||
324 | return page + (buddy_idx - page_idx); | 324 | return page + (buddy_idx - page_idx); |
325 | } | 325 | } |
326 | 326 | ||
327 | static inline unsigned long | 327 | static inline unsigned long |
328 | __find_combined_index(unsigned long page_idx, unsigned int order) | 328 | __find_combined_index(unsigned long page_idx, unsigned int order) |
329 | { | 329 | { |
330 | return (page_idx & ~(1 << order)); | 330 | return (page_idx & ~(1 << order)); |
331 | } | 331 | } |
332 | 332 | ||
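The two helpers above are pure index arithmetic, so they can be checked in isolation. A standalone, hypothetical illustration (not kernel code) of the example from the comment, extended to the combined index computed by __find_combined_index():

	#include <assert.h>

	int main(void)
	{
		unsigned long page_idx = 8, order = 1;

		/* buddy of block 8 at order 1: 8 ^ (1 << 1) = 10 */
		unsigned long buddy_idx = page_idx ^ (1UL << order);
		/* order-2 parent of either buddy: 8 & ~(1 << 1) = 8 */
		unsigned long combined_idx = page_idx & ~(1UL << order);

		assert(buddy_idx == 10);
		assert(combined_idx == 8);
		return 0;
	}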
333 | /* | 333 | /* |
334 | * This function checks whether a page is free && is the buddy | 334 | * This function checks whether a page is free && is the buddy |
335 | * we can coalesce a page and its buddy if | 335 | * we can coalesce a page and its buddy if |
336 | * (a) the buddy is not in a hole && | 336 | * (a) the buddy is not in a hole && |
337 | * (b) the buddy is in the buddy system && | 337 | * (b) the buddy is in the buddy system && |
338 | * (c) a page and its buddy have the same order && | 338 | * (c) a page and its buddy have the same order && |
339 | * (d) a page and its buddy are in the same zone. | 339 | * (d) a page and its buddy are in the same zone. |
340 | * | 340 | * |
341 | * For recording whether a page is in the buddy system, we use PG_buddy. | 341 | * For recording whether a page is in the buddy system, we use PG_buddy. |
342 | * Setting, clearing, and testing PG_buddy is serialized by zone->lock. | 342 | * Setting, clearing, and testing PG_buddy is serialized by zone->lock. |
343 | * | 343 | * |
344 | * For recording page's order, we use page_private(page). | 344 | * For recording page's order, we use page_private(page). |
345 | */ | 345 | */ |
346 | static inline int page_is_buddy(struct page *page, struct page *buddy, | 346 | static inline int page_is_buddy(struct page *page, struct page *buddy, |
347 | int order) | 347 | int order) |
348 | { | 348 | { |
349 | #ifdef CONFIG_HOLES_IN_ZONE | 349 | #ifdef CONFIG_HOLES_IN_ZONE |
350 | if (!pfn_valid(page_to_pfn(buddy))) | 350 | if (!pfn_valid(page_to_pfn(buddy))) |
351 | return 0; | 351 | return 0; |
352 | #endif | 352 | #endif |
353 | 353 | ||
354 | if (page_zone_id(page) != page_zone_id(buddy)) | 354 | if (page_zone_id(page) != page_zone_id(buddy)) |
355 | return 0; | 355 | return 0; |
356 | 356 | ||
357 | if (PageBuddy(buddy) && page_order(buddy) == order) { | 357 | if (PageBuddy(buddy) && page_order(buddy) == order) { |
358 | BUG_ON(page_count(buddy) != 0); | 358 | BUG_ON(page_count(buddy) != 0); |
359 | return 1; | 359 | return 1; |
360 | } | 360 | } |
361 | return 0; | 361 | return 0; |
362 | } | 362 | } |
363 | 363 | ||
364 | /* | 364 | /* |
365 | * Freeing function for a buddy system allocator. | 365 | * Freeing function for a buddy system allocator. |
366 | * | 366 | * |
367 | * The concept of a buddy system is to maintain direct-mapped table | 367 | * The concept of a buddy system is to maintain direct-mapped table |
368 | * (containing bit values) for memory blocks of various "orders". | 368 | * (containing bit values) for memory blocks of various "orders". |
369 | * The bottom level table contains the map for the smallest allocatable | 369 | * The bottom level table contains the map for the smallest allocatable |
370 | * units of memory (here, pages), and each level above it describes | 370 | * units of memory (here, pages), and each level above it describes |
371 | * pairs of units from the levels below, hence, "buddies". | 371 | * pairs of units from the levels below, hence, "buddies". |
372 | * At a high level, all that happens here is marking the table entry | 372 | * At a high level, all that happens here is marking the table entry |
373 | * at the bottom level available, and propagating the changes upward | 373 | * at the bottom level available, and propagating the changes upward |
374 | * as necessary, plus some accounting needed to play nicely with other | 374 | * as necessary, plus some accounting needed to play nicely with other |
375 | * parts of the VM system. | 375 | * parts of the VM system. |
376 | * At each level, we keep a list of pages, which are heads of contiguous | 376 | * At each level, we keep a list of pages, which are heads of contiguous |
377 | * free pages of length (1 << order) and marked with PG_buddy. Page's | 377 | * free pages of length (1 << order) and marked with PG_buddy. Page's |
378 | * order is recorded in page_private(page) field. | 378 | * order is recorded in page_private(page) field. |
379 | * So when we are allocating or freeing one, we can derive the state of the | 379 | * So when we are allocating or freeing one, we can derive the state of the |
380 | * other. That is, if we allocate a small block, and both were | 380 | * other. That is, if we allocate a small block, and both were |
381 | * free, the remainder of the region must be split into blocks. | 381 | * free, the remainder of the region must be split into blocks. |
382 | * If a block is freed, and its buddy is also free, then this | 382 | * If a block is freed, and its buddy is also free, then this |
383 | * triggers coalescing into a block of larger size. | 383 | * triggers coalescing into a block of larger size. |
384 | * | 384 | * |
385 | * -- wli | 385 | * -- wli |
386 | */ | 386 | */ |
387 | 387 | ||
388 | static inline void __free_one_page(struct page *page, | 388 | static inline void __free_one_page(struct page *page, |
389 | struct zone *zone, unsigned int order) | 389 | struct zone *zone, unsigned int order) |
390 | { | 390 | { |
391 | unsigned long page_idx; | 391 | unsigned long page_idx; |
392 | int order_size = 1 << order; | 392 | int order_size = 1 << order; |
393 | 393 | ||
394 | if (unlikely(PageCompound(page))) | 394 | if (unlikely(PageCompound(page))) |
395 | destroy_compound_page(page, order); | 395 | destroy_compound_page(page, order); |
396 | 396 | ||
397 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 397 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); |
398 | 398 | ||
399 | VM_BUG_ON(page_idx & (order_size - 1)); | 399 | VM_BUG_ON(page_idx & (order_size - 1)); |
400 | VM_BUG_ON(bad_range(zone, page)); | 400 | VM_BUG_ON(bad_range(zone, page)); |
401 | 401 | ||
402 | __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); | 402 | __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); |
403 | while (order < MAX_ORDER-1) { | 403 | while (order < MAX_ORDER-1) { |
404 | unsigned long combined_idx; | 404 | unsigned long combined_idx; |
405 | struct free_area *area; | 405 | struct free_area *area; |
406 | struct page *buddy; | 406 | struct page *buddy; |
407 | 407 | ||
408 | buddy = __page_find_buddy(page, page_idx, order); | 408 | buddy = __page_find_buddy(page, page_idx, order); |
409 | if (!page_is_buddy(page, buddy, order)) | 409 | if (!page_is_buddy(page, buddy, order)) |
410 | break; /* Move the buddy up one level. */ | 410 | break; /* Move the buddy up one level. */ |
411 | 411 | ||
412 | list_del(&buddy->lru); | 412 | list_del(&buddy->lru); |
413 | area = zone->free_area + order; | 413 | area = zone->free_area + order; |
414 | area->nr_free--; | 414 | area->nr_free--; |
415 | rmv_page_order(buddy); | 415 | rmv_page_order(buddy); |
416 | combined_idx = __find_combined_index(page_idx, order); | 416 | combined_idx = __find_combined_index(page_idx, order); |
417 | page = page + (combined_idx - page_idx); | 417 | page = page + (combined_idx - page_idx); |
418 | page_idx = combined_idx; | 418 | page_idx = combined_idx; |
419 | order++; | 419 | order++; |
420 | } | 420 | } |
421 | set_page_order(page, order); | 421 | set_page_order(page, order); |
422 | list_add(&page->lru, &zone->free_area[order].free_list); | 422 | list_add(&page->lru, &zone->free_area[order].free_list); |
423 | zone->free_area[order].nr_free++; | 423 | zone->free_area[order].nr_free++; |
424 | } | 424 | } |
425 | 425 | ||
426 | static inline int free_pages_check(struct page *page) | 426 | static inline int free_pages_check(struct page *page) |
427 | { | 427 | { |
428 | if (unlikely(page_mapcount(page) | | 428 | if (unlikely(page_mapcount(page) | |
429 | (page->mapping != NULL) | | 429 | (page->mapping != NULL) | |
430 | (page_count(page) != 0) | | 430 | (page_count(page) != 0) | |
431 | (page->flags & ( | 431 | (page->flags & ( |
432 | 1 << PG_lru | | 432 | 1 << PG_lru | |
433 | 1 << PG_private | | 433 | 1 << PG_private | |
434 | 1 << PG_locked | | 434 | 1 << PG_locked | |
435 | 1 << PG_active | | 435 | 1 << PG_active | |
436 | 1 << PG_reclaim | | 436 | 1 << PG_reclaim | |
437 | 1 << PG_slab | | 437 | 1 << PG_slab | |
438 | 1 << PG_swapcache | | 438 | 1 << PG_swapcache | |
439 | 1 << PG_writeback | | 439 | 1 << PG_writeback | |
440 | 1 << PG_reserved | | 440 | 1 << PG_reserved | |
441 | 1 << PG_buddy )))) | 441 | 1 << PG_buddy )))) |
442 | bad_page(page); | 442 | bad_page(page); |
443 | if (PageDirty(page)) | 443 | if (PageDirty(page)) |
444 | __ClearPageDirty(page); | 444 | __ClearPageDirty(page); |
445 | /* | 445 | /* |
446 | * For now, we report if PG_reserved was found set, but do not | 446 | * For now, we report if PG_reserved was found set, but do not |
447 | * clear it, and do not free the page. But we shall soon need | 447 | * clear it, and do not free the page. But we shall soon need |
448 | * to do more, for when the ZERO_PAGE count wraps negative. | 448 | * to do more, for when the ZERO_PAGE count wraps negative. |
449 | */ | 449 | */ |
450 | return PageReserved(page); | 450 | return PageReserved(page); |
451 | } | 451 | } |
452 | 452 | ||
453 | /* | 453 | /* |
454 | * Frees a list of pages. | 454 | * Frees a list of pages. |
455 | * Assumes all pages on list are in same zone, and of same order. | 455 | * Assumes all pages on list are in same zone, and of same order. |
456 | * count is the number of pages to free. | 456 | * count is the number of pages to free. |
457 | * | 457 | * |
458 | * If the zone was previously in an "all pages pinned" state then look to | 458 | * If the zone was previously in an "all pages pinned" state then look to |
459 | * see if this freeing clears that state. | 459 | * see if this freeing clears that state. |
460 | * | 460 | * |
461 | * And clear the zone's pages_scanned counter, to hold off the "all pages are | 461 | * And clear the zone's pages_scanned counter, to hold off the "all pages are |
462 | * pinned" detection logic. | 462 | * pinned" detection logic. |
463 | */ | 463 | */ |
464 | static void free_pages_bulk(struct zone *zone, int count, | 464 | static void free_pages_bulk(struct zone *zone, int count, |
465 | struct list_head *list, int order) | 465 | struct list_head *list, int order) |
466 | { | 466 | { |
467 | spin_lock(&zone->lock); | 467 | spin_lock(&zone->lock); |
468 | zone->all_unreclaimable = 0; | 468 | zone->all_unreclaimable = 0; |
469 | zone->pages_scanned = 0; | 469 | zone->pages_scanned = 0; |
470 | while (count--) { | 470 | while (count--) { |
471 | struct page *page; | 471 | struct page *page; |
472 | 472 | ||
473 | VM_BUG_ON(list_empty(list)); | 473 | VM_BUG_ON(list_empty(list)); |
474 | page = list_entry(list->prev, struct page, lru); | 474 | page = list_entry(list->prev, struct page, lru); |
475 | /* have to delete it as __free_one_page list manipulates */ | 475 | /* have to delete it as __free_one_page list manipulates */ |
476 | list_del(&page->lru); | 476 | list_del(&page->lru); |
477 | __free_one_page(page, zone, order); | 477 | __free_one_page(page, zone, order); |
478 | } | 478 | } |
479 | spin_unlock(&zone->lock); | 479 | spin_unlock(&zone->lock); |
480 | } | 480 | } |
481 | 481 | ||
482 | static void free_one_page(struct zone *zone, struct page *page, int order) | 482 | static void free_one_page(struct zone *zone, struct page *page, int order) |
483 | { | 483 | { |
484 | spin_lock(&zone->lock); | 484 | spin_lock(&zone->lock); |
485 | zone->all_unreclaimable = 0; | 485 | zone->all_unreclaimable = 0; |
486 | zone->pages_scanned = 0; | 486 | zone->pages_scanned = 0; |
487 | __free_one_page(page, zone, order); | 487 | __free_one_page(page, zone, order); |
488 | spin_unlock(&zone->lock); | 488 | spin_unlock(&zone->lock); |
489 | } | 489 | } |
490 | 490 | ||
491 | static void __free_pages_ok(struct page *page, unsigned int order) | 491 | static void __free_pages_ok(struct page *page, unsigned int order) |
492 | { | 492 | { |
493 | unsigned long flags; | 493 | unsigned long flags; |
494 | int i; | 494 | int i; |
495 | int reserved = 0; | 495 | int reserved = 0; |
496 | 496 | ||
497 | for (i = 0 ; i < (1 << order) ; ++i) | 497 | for (i = 0 ; i < (1 << order) ; ++i) |
498 | reserved += free_pages_check(page + i); | 498 | reserved += free_pages_check(page + i); |
499 | if (reserved) | 499 | if (reserved) |
500 | return; | 500 | return; |
501 | 501 | ||
502 | if (!PageHighMem(page)) | 502 | if (!PageHighMem(page)) |
503 | debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); | 503 | debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); |
504 | arch_free_page(page, order); | 504 | arch_free_page(page, order); |
505 | kernel_map_pages(page, 1 << order, 0); | 505 | kernel_map_pages(page, 1 << order, 0); |
506 | 506 | ||
507 | local_irq_save(flags); | 507 | local_irq_save(flags); |
508 | __count_vm_events(PGFREE, 1 << order); | 508 | __count_vm_events(PGFREE, 1 << order); |
509 | free_one_page(page_zone(page), page, order); | 509 | free_one_page(page_zone(page), page, order); |
510 | local_irq_restore(flags); | 510 | local_irq_restore(flags); |
511 | } | 511 | } |
512 | 512 | ||
513 | /* | 513 | /* |
514 | * permit the bootmem allocator to evade page validation on high-order frees | 514 | * permit the bootmem allocator to evade page validation on high-order frees |
515 | */ | 515 | */ |
516 | void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) | 516 | void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) |
517 | { | 517 | { |
518 | if (order == 0) { | 518 | if (order == 0) { |
519 | __ClearPageReserved(page); | 519 | __ClearPageReserved(page); |
520 | set_page_count(page, 0); | 520 | set_page_count(page, 0); |
521 | set_page_refcounted(page); | 521 | set_page_refcounted(page); |
522 | __free_page(page); | 522 | __free_page(page); |
523 | } else { | 523 | } else { |
524 | int loop; | 524 | int loop; |
525 | 525 | ||
526 | prefetchw(page); | 526 | prefetchw(page); |
527 | for (loop = 0; loop < BITS_PER_LONG; loop++) { | 527 | for (loop = 0; loop < BITS_PER_LONG; loop++) { |
528 | struct page *p = &page[loop]; | 528 | struct page *p = &page[loop]; |
529 | 529 | ||
530 | if (loop + 1 < BITS_PER_LONG) | 530 | if (loop + 1 < BITS_PER_LONG) |
531 | prefetchw(p + 1); | 531 | prefetchw(p + 1); |
532 | __ClearPageReserved(p); | 532 | __ClearPageReserved(p); |
533 | set_page_count(p, 0); | 533 | set_page_count(p, 0); |
534 | } | 534 | } |
535 | 535 | ||
536 | set_page_refcounted(page); | 536 | set_page_refcounted(page); |
537 | __free_pages(page, order); | 537 | __free_pages(page, order); |
538 | } | 538 | } |
539 | } | 539 | } |
540 | 540 | ||
541 | 541 | ||
542 | /* | 542 | /* |
543 | * The order of subdivision here is critical for the IO subsystem. | 543 | * The order of subdivision here is critical for the IO subsystem. |
544 | * Please do not alter this order without good reasons and regression | 544 | * Please do not alter this order without good reasons and regression |
545 | * testing. Specifically, as large blocks of memory are subdivided, | 545 | * testing. Specifically, as large blocks of memory are subdivided, |
546 | * the order in which smaller blocks are delivered depends on the order | 546 | * the order in which smaller blocks are delivered depends on the order |
547 | * they're subdivided in this function. This is the primary factor | 547 | * they're subdivided in this function. This is the primary factor |
548 | * influencing the order in which pages are delivered to the IO | 548 | * influencing the order in which pages are delivered to the IO |
549 | * subsystem according to empirical testing, and this is also justified | 549 | * subsystem according to empirical testing, and this is also justified |
550 | * by considering the behavior of a buddy system containing a single | 550 | * by considering the behavior of a buddy system containing a single |
551 | * large block of memory acted on by a series of small allocations. | 551 | * large block of memory acted on by a series of small allocations. |
552 | * This behavior is a critical factor in sglist merging's success. | 552 | * This behavior is a critical factor in sglist merging's success. |
553 | * | 553 | * |
554 | * -- wli | 554 | * -- wli |
555 | */ | 555 | */ |
556 | static inline void expand(struct zone *zone, struct page *page, | 556 | static inline void expand(struct zone *zone, struct page *page, |
557 | int low, int high, struct free_area *area) | 557 | int low, int high, struct free_area *area) |
558 | { | 558 | { |
559 | unsigned long size = 1 << high; | 559 | unsigned long size = 1 << high; |
560 | 560 | ||
561 | while (high > low) { | 561 | while (high > low) { |
562 | area--; | 562 | area--; |
563 | high--; | 563 | high--; |
564 | size >>= 1; | 564 | size >>= 1; |
565 | VM_BUG_ON(bad_range(zone, &page[size])); | 565 | VM_BUG_ON(bad_range(zone, &page[size])); |
566 | list_add(&page[size].lru, &area->free_list); | 566 | list_add(&page[size].lru, &area->free_list); |
567 | area->nr_free++; | 567 | area->nr_free++; |
568 | set_page_order(&page[size], high); | 568 | set_page_order(&page[size], high); |
569 | } | 569 | } |
570 | } | 570 | } |
571 | 571 | ||
572 | /* | 572 | /* |
573 | * This page is about to be returned from the page allocator | 573 | * This page is about to be returned from the page allocator |
574 | */ | 574 | */ |
575 | static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | 575 | static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) |
576 | { | 576 | { |
577 | if (unlikely(page_mapcount(page) | | 577 | if (unlikely(page_mapcount(page) | |
578 | (page->mapping != NULL) | | 578 | (page->mapping != NULL) | |
579 | (page_count(page) != 0) | | 579 | (page_count(page) != 0) | |
580 | (page->flags & ( | 580 | (page->flags & ( |
581 | 1 << PG_lru | | 581 | 1 << PG_lru | |
582 | 1 << PG_private | | 582 | 1 << PG_private | |
583 | 1 << PG_locked | | 583 | 1 << PG_locked | |
584 | 1 << PG_active | | 584 | 1 << PG_active | |
585 | 1 << PG_dirty | | 585 | 1 << PG_dirty | |
586 | 1 << PG_reclaim | | 586 | 1 << PG_reclaim | |
587 | 1 << PG_slab | | 587 | 1 << PG_slab | |
588 | 1 << PG_swapcache | | 588 | 1 << PG_swapcache | |
589 | 1 << PG_writeback | | 589 | 1 << PG_writeback | |
590 | 1 << PG_reserved | | 590 | 1 << PG_reserved | |
591 | 1 << PG_buddy )))) | 591 | 1 << PG_buddy )))) |
592 | bad_page(page); | 592 | bad_page(page); |
593 | 593 | ||
594 | /* | 594 | /* |
595 | * For now, we report if PG_reserved was found set, but do not | 595 | * For now, we report if PG_reserved was found set, but do not |
596 | * clear it, and do not allocate the page: as a safety net. | 596 | * clear it, and do not allocate the page: as a safety net. |
597 | */ | 597 | */ |
598 | if (PageReserved(page)) | 598 | if (PageReserved(page)) |
599 | return 1; | 599 | return 1; |
600 | 600 | ||
601 | page->flags &= ~(1 << PG_uptodate | 1 << PG_error | | 601 | page->flags &= ~(1 << PG_uptodate | 1 << PG_error | |
602 | 1 << PG_referenced | 1 << PG_arch_1 | | 602 | 1 << PG_referenced | 1 << PG_arch_1 | |
603 | 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); | 603 | 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); |
604 | set_page_private(page, 0); | 604 | set_page_private(page, 0); |
605 | set_page_refcounted(page); | 605 | set_page_refcounted(page); |
606 | 606 | ||
607 | arch_alloc_page(page, order); | 607 | arch_alloc_page(page, order); |
608 | kernel_map_pages(page, 1 << order, 1); | 608 | kernel_map_pages(page, 1 << order, 1); |
609 | 609 | ||
610 | if (gfp_flags & __GFP_ZERO) | 610 | if (gfp_flags & __GFP_ZERO) |
611 | prep_zero_page(page, order, gfp_flags); | 611 | prep_zero_page(page, order, gfp_flags); |
612 | 612 | ||
613 | if (order && (gfp_flags & __GFP_COMP)) | 613 | if (order && (gfp_flags & __GFP_COMP)) |
614 | prep_compound_page(page, order); | 614 | prep_compound_page(page, order); |
615 | 615 | ||
616 | return 0; | 616 | return 0; |
617 | } | 617 | } |
618 | 618 | ||
619 | /* | 619 | /* |
620 | * Do the hard work of removing an element from the buddy allocator. | 620 | * Do the hard work of removing an element from the buddy allocator. |
621 | * Call me with the zone->lock already held. | 621 | * Call me with the zone->lock already held. |
622 | */ | 622 | */ |
623 | static struct page *__rmqueue(struct zone *zone, unsigned int order) | 623 | static struct page *__rmqueue(struct zone *zone, unsigned int order) |
624 | { | 624 | { |
625 | struct free_area * area; | 625 | struct free_area * area; |
626 | unsigned int current_order; | 626 | unsigned int current_order; |
627 | struct page *page; | 627 | struct page *page; |
628 | 628 | ||
629 | for (current_order = order; current_order < MAX_ORDER; ++current_order) { | 629 | for (current_order = order; current_order < MAX_ORDER; ++current_order) { |
630 | area = zone->free_area + current_order; | 630 | area = zone->free_area + current_order; |
631 | if (list_empty(&area->free_list)) | 631 | if (list_empty(&area->free_list)) |
632 | continue; | 632 | continue; |
633 | 633 | ||
634 | page = list_entry(area->free_list.next, struct page, lru); | 634 | page = list_entry(area->free_list.next, struct page, lru); |
635 | list_del(&page->lru); | 635 | list_del(&page->lru); |
636 | rmv_page_order(page); | 636 | rmv_page_order(page); |
637 | area->nr_free--; | 637 | area->nr_free--; |
638 | __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); | 638 | __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); |
639 | expand(zone, page, order, current_order, area); | 639 | expand(zone, page, order, current_order, area); |
640 | return page; | 640 | return page; |
641 | } | 641 | } |
642 | 642 | ||
643 | return NULL; | 643 | return NULL; |
644 | } | 644 | } |
645 | 645 | ||
646 | /* | 646 | /* |
647 | * Obtain a specified number of elements from the buddy allocator, all under | 647 | * Obtain a specified number of elements from the buddy allocator, all under |
648 | * a single hold of the lock, for efficiency. Add them to the supplied list. | 648 | * a single hold of the lock, for efficiency. Add them to the supplied list. |
649 | * Returns the number of new pages which were placed at *list. | 649 | * Returns the number of new pages which were placed at *list. |
650 | */ | 650 | */ |
651 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 651 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
652 | unsigned long count, struct list_head *list) | 652 | unsigned long count, struct list_head *list) |
653 | { | 653 | { |
654 | int i; | 654 | int i; |
655 | 655 | ||
656 | spin_lock(&zone->lock); | 656 | spin_lock(&zone->lock); |
657 | for (i = 0; i < count; ++i) { | 657 | for (i = 0; i < count; ++i) { |
658 | struct page *page = __rmqueue(zone, order); | 658 | struct page *page = __rmqueue(zone, order); |
659 | if (unlikely(page == NULL)) | 659 | if (unlikely(page == NULL)) |
660 | break; | 660 | break; |
661 | list_add_tail(&page->lru, list); | 661 | list_add_tail(&page->lru, list); |
662 | } | 662 | } |
663 | spin_unlock(&zone->lock); | 663 | spin_unlock(&zone->lock); |
664 | return i; | 664 | return i; |
665 | } | 665 | } |
666 | 666 | ||
667 | #if MAX_NUMNODES > 1 | 667 | #if MAX_NUMNODES > 1 |
668 | int nr_node_ids __read_mostly; | 668 | int nr_node_ids __read_mostly = MAX_NUMNODES; |
669 | EXPORT_SYMBOL(nr_node_ids); | 669 | EXPORT_SYMBOL(nr_node_ids); |
670 | 670 | ||
671 | /* | 671 | /* |
672 | * Figure out the number of possible node ids. | 672 | * Figure out the number of possible node ids. |
673 | */ | 673 | */ |
674 | static void __init setup_nr_node_ids(void) | 674 | static void __init setup_nr_node_ids(void) |
675 | { | 675 | { |
676 | unsigned int node; | 676 | unsigned int node; |
677 | unsigned int highest = 0; | 677 | unsigned int highest = 0; |
678 | 678 | ||
679 | for_each_node_mask(node, node_possible_map) | 679 | for_each_node_mask(node, node_possible_map) |
680 | highest = node; | 680 | highest = node; |
681 | nr_node_ids = highest + 1; | 681 | nr_node_ids = highest + 1; |
682 | } | 682 | } |
683 | #else | 683 | #else |
684 | static void __init setup_nr_node_ids(void) {} | 684 | static void __init setup_nr_node_ids(void) {} |
685 | #endif | 685 | #endif |
686 | 686 | ||
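The only change in this hunk is the initializer: nr_node_ids now starts at MAX_NUMNODES instead of 0, so code that consults it before setup_nr_node_ids() runs sizes per-node data by the largest possible value rather than by nothing. A hypothetical early-boot user (not part of this commit, names are illustrative) showing why the non-zero default is the safer failure mode:

	/* hypothetical: a per-node counter array allocated before setup_nr_node_ids() */
	unsigned long *node_counts;

	void __init early_alloc_node_counts(void)
	{
		/*
		 * With the old default of 0 this allocated nothing; with
		 * MAX_NUMNODES it merely over-allocates until the exact
		 * node count is known.
		 */
		node_counts = alloc_bootmem(nr_node_ids * sizeof(*node_counts));
	}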
687 | #ifdef CONFIG_NUMA | 687 | #ifdef CONFIG_NUMA |
688 | /* | 688 | /* |
689 | * Called from the slab reaper to drain pagesets on a particular node that | 689 | * Called from the slab reaper to drain pagesets on a particular node that |
690 | * belongs to the currently executing processor. | 690 | * belongs to the currently executing processor. |
691 | * Note that this function must be called with the thread pinned to | 691 | * Note that this function must be called with the thread pinned to |
692 | * a single processor. | 692 | * a single processor. |
693 | */ | 693 | */ |
694 | void drain_node_pages(int nodeid) | 694 | void drain_node_pages(int nodeid) |
695 | { | 695 | { |
696 | int i; | 696 | int i; |
697 | enum zone_type z; | 697 | enum zone_type z; |
698 | unsigned long flags; | 698 | unsigned long flags; |
699 | 699 | ||
700 | for (z = 0; z < MAX_NR_ZONES; z++) { | 700 | for (z = 0; z < MAX_NR_ZONES; z++) { |
701 | struct zone *zone = NODE_DATA(nodeid)->node_zones + z; | 701 | struct zone *zone = NODE_DATA(nodeid)->node_zones + z; |
702 | struct per_cpu_pageset *pset; | 702 | struct per_cpu_pageset *pset; |
703 | 703 | ||
704 | if (!populated_zone(zone)) | 704 | if (!populated_zone(zone)) |
705 | continue; | 705 | continue; |
706 | 706 | ||
707 | pset = zone_pcp(zone, smp_processor_id()); | 707 | pset = zone_pcp(zone, smp_processor_id()); |
708 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { | 708 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { |
709 | struct per_cpu_pages *pcp; | 709 | struct per_cpu_pages *pcp; |
710 | 710 | ||
711 | pcp = &pset->pcp[i]; | 711 | pcp = &pset->pcp[i]; |
712 | if (pcp->count) { | 712 | if (pcp->count) { |
713 | int to_drain; | 713 | int to_drain; |
714 | 714 | ||
715 | local_irq_save(flags); | 715 | local_irq_save(flags); |
716 | if (pcp->count >= pcp->batch) | 716 | if (pcp->count >= pcp->batch) |
717 | to_drain = pcp->batch; | 717 | to_drain = pcp->batch; |
718 | else | 718 | else |
719 | to_drain = pcp->count; | 719 | to_drain = pcp->count; |
720 | free_pages_bulk(zone, to_drain, &pcp->list, 0); | 720 | free_pages_bulk(zone, to_drain, &pcp->list, 0); |
721 | pcp->count -= to_drain; | 721 | pcp->count -= to_drain; |
722 | local_irq_restore(flags); | 722 | local_irq_restore(flags); |
723 | } | 723 | } |
724 | } | 724 | } |
725 | } | 725 | } |
726 | } | 726 | } |
727 | #endif | 727 | #endif |
728 | 728 | ||
729 | static void __drain_pages(unsigned int cpu) | 729 | static void __drain_pages(unsigned int cpu) |
730 | { | 730 | { |
731 | unsigned long flags; | 731 | unsigned long flags; |
732 | struct zone *zone; | 732 | struct zone *zone; |
733 | int i; | 733 | int i; |
734 | 734 | ||
735 | for_each_zone(zone) { | 735 | for_each_zone(zone) { |
736 | struct per_cpu_pageset *pset; | 736 | struct per_cpu_pageset *pset; |
737 | 737 | ||
738 | if (!populated_zone(zone)) | 738 | if (!populated_zone(zone)) |
739 | continue; | 739 | continue; |
740 | 740 | ||
741 | pset = zone_pcp(zone, cpu); | 741 | pset = zone_pcp(zone, cpu); |
742 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { | 742 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { |
743 | struct per_cpu_pages *pcp; | 743 | struct per_cpu_pages *pcp; |
744 | 744 | ||
745 | pcp = &pset->pcp[i]; | 745 | pcp = &pset->pcp[i]; |
746 | local_irq_save(flags); | 746 | local_irq_save(flags); |
747 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); | 747 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); |
748 | pcp->count = 0; | 748 | pcp->count = 0; |
749 | local_irq_restore(flags); | 749 | local_irq_restore(flags); |
750 | } | 750 | } |
751 | } | 751 | } |
752 | } | 752 | } |
753 | 753 | ||
754 | #ifdef CONFIG_PM | 754 | #ifdef CONFIG_PM |
755 | 755 | ||
756 | void mark_free_pages(struct zone *zone) | 756 | void mark_free_pages(struct zone *zone) |
757 | { | 757 | { |
758 | unsigned long pfn, max_zone_pfn; | 758 | unsigned long pfn, max_zone_pfn; |
759 | unsigned long flags; | 759 | unsigned long flags; |
760 | int order; | 760 | int order; |
761 | struct list_head *curr; | 761 | struct list_head *curr; |
762 | 762 | ||
763 | if (!zone->spanned_pages) | 763 | if (!zone->spanned_pages) |
764 | return; | 764 | return; |
765 | 765 | ||
766 | spin_lock_irqsave(&zone->lock, flags); | 766 | spin_lock_irqsave(&zone->lock, flags); |
767 | 767 | ||
768 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | 768 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; |
769 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 769 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
770 | if (pfn_valid(pfn)) { | 770 | if (pfn_valid(pfn)) { |
771 | struct page *page = pfn_to_page(pfn); | 771 | struct page *page = pfn_to_page(pfn); |
772 | 772 | ||
773 | if (!PageNosave(page)) | 773 | if (!PageNosave(page)) |
774 | ClearPageNosaveFree(page); | 774 | ClearPageNosaveFree(page); |
775 | } | 775 | } |
776 | 776 | ||
777 | for (order = MAX_ORDER - 1; order >= 0; --order) | 777 | for (order = MAX_ORDER - 1; order >= 0; --order) |
778 | list_for_each(curr, &zone->free_area[order].free_list) { | 778 | list_for_each(curr, &zone->free_area[order].free_list) { |
779 | unsigned long i; | 779 | unsigned long i; |
780 | 780 | ||
781 | pfn = page_to_pfn(list_entry(curr, struct page, lru)); | 781 | pfn = page_to_pfn(list_entry(curr, struct page, lru)); |
782 | for (i = 0; i < (1UL << order); i++) | 782 | for (i = 0; i < (1UL << order); i++) |
783 | SetPageNosaveFree(pfn_to_page(pfn + i)); | 783 | SetPageNosaveFree(pfn_to_page(pfn + i)); |
784 | } | 784 | } |
785 | 785 | ||
786 | spin_unlock_irqrestore(&zone->lock, flags); | 786 | spin_unlock_irqrestore(&zone->lock, flags); |
787 | } | 787 | } |
788 | 788 | ||
789 | /* | 789 | /* |
790 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. | 790 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. |
791 | */ | 791 | */ |
792 | void drain_local_pages(void) | 792 | void drain_local_pages(void) |
793 | { | 793 | { |
794 | unsigned long flags; | 794 | unsigned long flags; |
795 | 795 | ||
796 | local_irq_save(flags); | 796 | local_irq_save(flags); |
797 | __drain_pages(smp_processor_id()); | 797 | __drain_pages(smp_processor_id()); |
798 | local_irq_restore(flags); | 798 | local_irq_restore(flags); |
799 | } | 799 | } |
800 | #endif /* CONFIG_PM */ | 800 | #endif /* CONFIG_PM */ |
801 | 801 | ||
802 | /* | 802 | /* |
803 | * Free a 0-order page | 803 | * Free a 0-order page |
804 | */ | 804 | */ |
805 | static void fastcall free_hot_cold_page(struct page *page, int cold) | 805 | static void fastcall free_hot_cold_page(struct page *page, int cold) |
806 | { | 806 | { |
807 | struct zone *zone = page_zone(page); | 807 | struct zone *zone = page_zone(page); |
808 | struct per_cpu_pages *pcp; | 808 | struct per_cpu_pages *pcp; |
809 | unsigned long flags; | 809 | unsigned long flags; |
810 | 810 | ||
811 | if (PageAnon(page)) | 811 | if (PageAnon(page)) |
812 | page->mapping = NULL; | 812 | page->mapping = NULL; |
813 | if (free_pages_check(page)) | 813 | if (free_pages_check(page)) |
814 | return; | 814 | return; |
815 | 815 | ||
816 | if (!PageHighMem(page)) | 816 | if (!PageHighMem(page)) |
817 | debug_check_no_locks_freed(page_address(page), PAGE_SIZE); | 817 | debug_check_no_locks_freed(page_address(page), PAGE_SIZE); |
818 | arch_free_page(page, 0); | 818 | arch_free_page(page, 0); |
819 | kernel_map_pages(page, 1, 0); | 819 | kernel_map_pages(page, 1, 0); |
820 | 820 | ||
821 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; | 821 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; |
822 | local_irq_save(flags); | 822 | local_irq_save(flags); |
823 | __count_vm_event(PGFREE); | 823 | __count_vm_event(PGFREE); |
824 | list_add(&page->lru, &pcp->list); | 824 | list_add(&page->lru, &pcp->list); |
825 | pcp->count++; | 825 | pcp->count++; |
826 | if (pcp->count >= pcp->high) { | 826 | if (pcp->count >= pcp->high) { |
827 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); | 827 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); |
828 | pcp->count -= pcp->batch; | 828 | pcp->count -= pcp->batch; |
829 | } | 829 | } |
830 | local_irq_restore(flags); | 830 | local_irq_restore(flags); |
831 | put_cpu(); | 831 | put_cpu(); |
832 | } | 832 | } |
833 | 833 | ||
834 | void fastcall free_hot_page(struct page *page) | 834 | void fastcall free_hot_page(struct page *page) |
835 | { | 835 | { |
836 | free_hot_cold_page(page, 0); | 836 | free_hot_cold_page(page, 0); |
837 | } | 837 | } |
838 | 838 | ||
839 | void fastcall free_cold_page(struct page *page) | 839 | void fastcall free_cold_page(struct page *page) |
840 | { | 840 | { |
841 | free_hot_cold_page(page, 1); | 841 | free_hot_cold_page(page, 1); |
842 | } | 842 | } |
843 | 843 | ||
844 | /* | 844 | /* |
845 | * split_page takes a non-compound higher-order page, and splits it into | 845 | * split_page takes a non-compound higher-order page, and splits it into |
846 | * n (1<<order) sub-pages: page[0..n] | 846 | * n (1<<order) sub-pages: page[0..n] |
847 | * Each sub-page must be freed individually. | 847 | * Each sub-page must be freed individually. |
848 | * | 848 | * |
849 | * Note: this is probably too low level an operation for use in drivers. | 849 | * Note: this is probably too low level an operation for use in drivers. |
850 | * Please consult with lkml before using this in your driver. | 850 | * Please consult with lkml before using this in your driver. |
851 | */ | 851 | */ |
852 | void split_page(struct page *page, unsigned int order) | 852 | void split_page(struct page *page, unsigned int order) |
853 | { | 853 | { |
854 | int i; | 854 | int i; |
855 | 855 | ||
856 | VM_BUG_ON(PageCompound(page)); | 856 | VM_BUG_ON(PageCompound(page)); |
857 | VM_BUG_ON(!page_count(page)); | 857 | VM_BUG_ON(!page_count(page)); |
858 | for (i = 1; i < (1 << order); i++) | 858 | for (i = 1; i < (1 << order); i++) |
859 | set_page_refcounted(page + i); | 859 | set_page_refcounted(page + i); |
860 | } | 860 | } |
861 | 861 | ||
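As the comment notes, split_page() is a low-level operation; a hypothetical usage sketch (flags and the freeing pattern are illustrative, and subject to the consult-lkml caveat above) showing how an order-2 allocation becomes four independently freeable pages:

	struct page *page = alloc_pages(GFP_KERNEL, 2);

	if (page) {
		split_page(page, 2);		/* page[0..3] now each hold their own reference */
		__free_page(page + 3);		/* sub-pages may be freed one at a time */
		__free_page(page + 2);
		/* page + 0 and page + 1 remain in use by the caller */
	}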
862 | /* | 862 | /* |
863 | * Really, prep_compound_page() should be called from __rmqueue_bulk(). But | 863 | * Really, prep_compound_page() should be called from __rmqueue_bulk(). But |
864 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | 864 | * we cheat by calling it from here, in the order > 0 path. Saves a branch |
865 | * or two. | 865 | * or two. |
866 | */ | 866 | */ |
867 | static struct page *buffered_rmqueue(struct zonelist *zonelist, | 867 | static struct page *buffered_rmqueue(struct zonelist *zonelist, |
868 | struct zone *zone, int order, gfp_t gfp_flags) | 868 | struct zone *zone, int order, gfp_t gfp_flags) |
869 | { | 869 | { |
870 | unsigned long flags; | 870 | unsigned long flags; |
871 | struct page *page; | 871 | struct page *page; |
872 | int cold = !!(gfp_flags & __GFP_COLD); | 872 | int cold = !!(gfp_flags & __GFP_COLD); |
873 | int cpu; | 873 | int cpu; |
874 | 874 | ||
875 | again: | 875 | again: |
876 | cpu = get_cpu(); | 876 | cpu = get_cpu(); |
877 | if (likely(order == 0)) { | 877 | if (likely(order == 0)) { |
878 | struct per_cpu_pages *pcp; | 878 | struct per_cpu_pages *pcp; |
879 | 879 | ||
880 | pcp = &zone_pcp(zone, cpu)->pcp[cold]; | 880 | pcp = &zone_pcp(zone, cpu)->pcp[cold]; |
881 | local_irq_save(flags); | 881 | local_irq_save(flags); |
882 | if (!pcp->count) { | 882 | if (!pcp->count) { |
883 | pcp->count = rmqueue_bulk(zone, 0, | 883 | pcp->count = rmqueue_bulk(zone, 0, |
884 | pcp->batch, &pcp->list); | 884 | pcp->batch, &pcp->list); |
885 | if (unlikely(!pcp->count)) | 885 | if (unlikely(!pcp->count)) |
886 | goto failed; | 886 | goto failed; |
887 | } | 887 | } |
888 | page = list_entry(pcp->list.next, struct page, lru); | 888 | page = list_entry(pcp->list.next, struct page, lru); |
889 | list_del(&page->lru); | 889 | list_del(&page->lru); |
890 | pcp->count--; | 890 | pcp->count--; |
891 | } else { | 891 | } else { |
892 | spin_lock_irqsave(&zone->lock, flags); | 892 | spin_lock_irqsave(&zone->lock, flags); |
893 | page = __rmqueue(zone, order); | 893 | page = __rmqueue(zone, order); |
894 | spin_unlock(&zone->lock); | 894 | spin_unlock(&zone->lock); |
895 | if (!page) | 895 | if (!page) |
896 | goto failed; | 896 | goto failed; |
897 | } | 897 | } |
898 | 898 | ||
899 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 899 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
900 | zone_statistics(zonelist, zone); | 900 | zone_statistics(zonelist, zone); |
901 | local_irq_restore(flags); | 901 | local_irq_restore(flags); |
902 | put_cpu(); | 902 | put_cpu(); |
903 | 903 | ||
904 | VM_BUG_ON(bad_range(zone, page)); | 904 | VM_BUG_ON(bad_range(zone, page)); |
905 | if (prep_new_page(page, order, gfp_flags)) | 905 | if (prep_new_page(page, order, gfp_flags)) |
906 | goto again; | 906 | goto again; |
907 | return page; | 907 | return page; |
908 | 908 | ||
909 | failed: | 909 | failed: |
910 | local_irq_restore(flags); | 910 | local_irq_restore(flags); |
911 | put_cpu(); | 911 | put_cpu(); |
912 | return NULL; | 912 | return NULL; |
913 | } | 913 | } |
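
The order-0 fast path above can be modelled in user space: pop from a small per-cpu cache, refill it in batches from the zone only when it runs dry, and fail once the zone itself is empty. The names (pcp_count, refill_batch, buddy_pool) are invented for this sketch and are not kernel symbols.

#include <stdio.h>

#define BATCH 4

static int buddy_pool = 16;   /* pages still on the zone's free lists */
static int pcp_count  = 0;    /* pages cached on this CPU's pcp list */

/* Model of rmqueue_bulk(): move up to 'batch' pages from the zone to the pcp list. */
static int refill_batch(int batch)
{
	int moved = (buddy_pool < batch) ? buddy_pool : batch;

	buddy_pool -= moved;
	pcp_count  += moved;
	return moved;
}

/* Model of the order-0 path of buffered_rmqueue(). */
static int alloc_one_page(void)
{
	if (pcp_count == 0 && refill_batch(BATCH) == 0)
		return -1;            /* zone exhausted: allocation fails */
	pcp_count--;                  /* pop one page from the per-cpu list */
	return 0;
}

int main(void)
{
	for (int i = 0; i < 20; i++)
		printf("alloc %2d: %s (pcp=%d, buddy=%d)\n", i,
		       alloc_one_page() ? "fail" : "ok", pcp_count, buddy_pool);
	return 0;
}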
914 | 914 | ||
915 | #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ | 915 | #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ |
916 | #define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ | 916 | #define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ |
917 | #define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ | 917 | #define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ |
918 | #define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ | 918 | #define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ |
919 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ | 919 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ |
920 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | 920 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ |
921 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | 921 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ |
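
These flags are plain bits, so a caller's allocation policy is composed with bitwise ORs and queried with ANDs, as __alloc_pages() does further down. A small stand-alone demonstration of the bit arithmetic (the values mirror the defines above):

#include <stdio.h>

#define ALLOC_NO_WATERMARKS 0x01
#define ALLOC_WMARK_MIN     0x02
#define ALLOC_WMARK_LOW     0x04
#define ALLOC_WMARK_HIGH    0x08
#define ALLOC_HARDER        0x10
#define ALLOC_HIGH          0x20
#define ALLOC_CPUSET        0x40

int main(void)
{
	/* A GFP_ATOMIC-style request: min watermark, try harder, may dip into reserves. */
	int alloc_flags = ALLOC_WMARK_MIN | ALLOC_HARDER | ALLOC_HIGH;

	printf("harder?    %s\n", (alloc_flags & ALLOC_HARDER) ? "yes" : "no");
	printf("cpuset?    %s\n", (alloc_flags & ALLOC_CPUSET) ? "yes" : "no");
	printf("watermark: 0x%02x\n",
	       alloc_flags & (ALLOC_WMARK_MIN | ALLOC_WMARK_LOW | ALLOC_WMARK_HIGH));
	return 0;
}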
922 | 922 | ||
923 | #ifdef CONFIG_FAIL_PAGE_ALLOC | 923 | #ifdef CONFIG_FAIL_PAGE_ALLOC |
924 | 924 | ||
925 | static struct fail_page_alloc_attr { | 925 | static struct fail_page_alloc_attr { |
926 | struct fault_attr attr; | 926 | struct fault_attr attr; |
927 | 927 | ||
928 | u32 ignore_gfp_highmem; | 928 | u32 ignore_gfp_highmem; |
929 | u32 ignore_gfp_wait; | 929 | u32 ignore_gfp_wait; |
930 | 930 | ||
931 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | 931 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS |
932 | 932 | ||
933 | struct dentry *ignore_gfp_highmem_file; | 933 | struct dentry *ignore_gfp_highmem_file; |
934 | struct dentry *ignore_gfp_wait_file; | 934 | struct dentry *ignore_gfp_wait_file; |
935 | 935 | ||
936 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ | 936 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ |
937 | 937 | ||
938 | } fail_page_alloc = { | 938 | } fail_page_alloc = { |
939 | .attr = FAULT_ATTR_INITIALIZER, | 939 | .attr = FAULT_ATTR_INITIALIZER, |
940 | .ignore_gfp_wait = 1, | 940 | .ignore_gfp_wait = 1, |
941 | .ignore_gfp_highmem = 1, | 941 | .ignore_gfp_highmem = 1, |
942 | }; | 942 | }; |
943 | 943 | ||
944 | static int __init setup_fail_page_alloc(char *str) | 944 | static int __init setup_fail_page_alloc(char *str) |
945 | { | 945 | { |
946 | return setup_fault_attr(&fail_page_alloc.attr, str); | 946 | return setup_fault_attr(&fail_page_alloc.attr, str); |
947 | } | 947 | } |
948 | __setup("fail_page_alloc=", setup_fail_page_alloc); | 948 | __setup("fail_page_alloc=", setup_fail_page_alloc); |
949 | 949 | ||
950 | static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | 950 | static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
951 | { | 951 | { |
952 | if (gfp_mask & __GFP_NOFAIL) | 952 | if (gfp_mask & __GFP_NOFAIL) |
953 | return 0; | 953 | return 0; |
954 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) | 954 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) |
955 | return 0; | 955 | return 0; |
956 | if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) | 956 | if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) |
957 | return 0; | 957 | return 0; |
958 | 958 | ||
959 | return should_fail(&fail_page_alloc.attr, 1 << order); | 959 | return should_fail(&fail_page_alloc.attr, 1 << order); |
960 | } | 960 | } |
961 | 961 | ||
962 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | 962 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS |
963 | 963 | ||
964 | static int __init fail_page_alloc_debugfs(void) | 964 | static int __init fail_page_alloc_debugfs(void) |
965 | { | 965 | { |
966 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | 966 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; |
967 | struct dentry *dir; | 967 | struct dentry *dir; |
968 | int err; | 968 | int err; |
969 | 969 | ||
970 | err = init_fault_attr_dentries(&fail_page_alloc.attr, | 970 | err = init_fault_attr_dentries(&fail_page_alloc.attr, |
971 | "fail_page_alloc"); | 971 | "fail_page_alloc"); |
972 | if (err) | 972 | if (err) |
973 | return err; | 973 | return err; |
974 | dir = fail_page_alloc.attr.dentries.dir; | 974 | dir = fail_page_alloc.attr.dentries.dir; |
975 | 975 | ||
976 | fail_page_alloc.ignore_gfp_wait_file = | 976 | fail_page_alloc.ignore_gfp_wait_file = |
977 | debugfs_create_bool("ignore-gfp-wait", mode, dir, | 977 | debugfs_create_bool("ignore-gfp-wait", mode, dir, |
978 | &fail_page_alloc.ignore_gfp_wait); | 978 | &fail_page_alloc.ignore_gfp_wait); |
979 | 979 | ||
980 | fail_page_alloc.ignore_gfp_highmem_file = | 980 | fail_page_alloc.ignore_gfp_highmem_file = |
981 | debugfs_create_bool("ignore-gfp-highmem", mode, dir, | 981 | debugfs_create_bool("ignore-gfp-highmem", mode, dir, |
982 | &fail_page_alloc.ignore_gfp_highmem); | 982 | &fail_page_alloc.ignore_gfp_highmem); |
983 | 983 | ||
984 | if (!fail_page_alloc.ignore_gfp_wait_file || | 984 | if (!fail_page_alloc.ignore_gfp_wait_file || |
985 | !fail_page_alloc.ignore_gfp_highmem_file) { | 985 | !fail_page_alloc.ignore_gfp_highmem_file) { |
986 | err = -ENOMEM; | 986 | err = -ENOMEM; |
987 | debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); | 987 | debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); |
988 | debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); | 988 | debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); |
989 | cleanup_fault_attr_dentries(&fail_page_alloc.attr); | 989 | cleanup_fault_attr_dentries(&fail_page_alloc.attr); |
990 | } | 990 | } |
991 | 991 | ||
992 | return err; | 992 | return err; |
993 | } | 993 | } |
994 | 994 | ||
995 | late_initcall(fail_page_alloc_debugfs); | 995 | late_initcall(fail_page_alloc_debugfs); |
996 | 996 | ||
997 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ | 997 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ |
998 | 998 | ||
999 | #else /* CONFIG_FAIL_PAGE_ALLOC */ | 999 | #else /* CONFIG_FAIL_PAGE_ALLOC */ |
1000 | 1000 | ||
1001 | static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | 1001 | static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
1002 | { | 1002 | { |
1003 | return 0; | 1003 | return 0; |
1004 | } | 1004 | } |
1005 | 1005 | ||
1006 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ | 1006 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ |
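
To see how the ignore_gfp_* knobs interact with __GFP_NOFAIL, here is a stand-alone model of should_fail_alloc_page(): __GFP_NOFAIL requests are never failed, highmem and sleepable allocations are skipped when the matching knob is set, and everything else fails once per 'interval'. The gfp bit values and the fixed interval are simplifications, not the real fault_attr machinery.

#include <stdio.h>

/* Simplified gfp bits for the model only (not the kernel's values). */
#define GFPM_NOFAIL  0x1
#define GFPM_HIGHMEM 0x2
#define GFPM_WAIT    0x4

static int ignore_gfp_highmem = 1;
static int ignore_gfp_wait = 1;
static int interval = 3;              /* fail every 3rd eligible request */
static int calls;

static int should_fail_alloc(unsigned int gfp)
{
	if (gfp & GFPM_NOFAIL)
		return 0;                     /* never inject into nofail allocations */
	if (ignore_gfp_highmem && (gfp & GFPM_HIGHMEM))
		return 0;
	if (ignore_gfp_wait && (gfp & GFPM_WAIT))
		return 0;
	return (++calls % interval) == 0;     /* stand-in for should_fail() */
}

int main(void)
{
	unsigned int masks[] = { 0, GFPM_WAIT, GFPM_NOFAIL, 0, 0, GFPM_HIGHMEM, 0 };

	for (unsigned int i = 0; i < sizeof(masks) / sizeof(masks[0]); i++)
		printf("gfp=0x%x -> %s\n", masks[i],
		       should_fail_alloc(masks[i]) ? "inject failure" : "allow");
	return 0;
}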
1007 | 1007 | ||
1008 | /* | 1008 | /* |
1009 | * Return 1 if free pages are above 'mark'. This takes into account the order | 1009 | * Return 1 if free pages are above 'mark'. This takes into account the order |
1010 | * of the allocation. | 1010 | * of the allocation. |
1011 | */ | 1011 | */ |
1012 | int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1012 | int zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
1013 | int classzone_idx, int alloc_flags) | 1013 | int classzone_idx, int alloc_flags) |
1014 | { | 1014 | { |
1015 | /* free_pages may go negative - that's OK */ | 1015 | /* free_pages may go negative - that's OK */ |
1016 | long min = mark; | 1016 | long min = mark; |
1017 | long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1; | 1017 | long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1; |
1018 | int o; | 1018 | int o; |
1019 | 1019 | ||
1020 | if (alloc_flags & ALLOC_HIGH) | 1020 | if (alloc_flags & ALLOC_HIGH) |
1021 | min -= min / 2; | 1021 | min -= min / 2; |
1022 | if (alloc_flags & ALLOC_HARDER) | 1022 | if (alloc_flags & ALLOC_HARDER) |
1023 | min -= min / 4; | 1023 | min -= min / 4; |
1024 | 1024 | ||
1025 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) | 1025 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) |
1026 | return 0; | 1026 | return 0; |
1027 | for (o = 0; o < order; o++) { | 1027 | for (o = 0; o < order; o++) { |
1028 | /* At the next order, this order's pages become unavailable */ | 1028 | /* At the next order, this order's pages become unavailable */ |
1029 | free_pages -= z->free_area[o].nr_free << o; | 1029 | free_pages -= z->free_area[o].nr_free << o; |
1030 | 1030 | ||
1031 | /* Require fewer higher order pages to be free */ | 1031 | /* Require fewer higher order pages to be free */ |
1032 | min >>= 1; | 1032 | min >>= 1; |
1033 | 1033 | ||
1034 | if (free_pages <= min) | 1034 | if (free_pages <= min) |
1035 | return 0; | 1035 | return 0; |
1036 | } | 1036 | } |
1037 | return 1; | 1037 | return 1; |
1038 | } | 1038 | } |
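
The per-order loop in zone_watermark_ok() is easier to follow with concrete numbers. The stand-alone model below applies the same rule: at each order, subtract the pages that are now too small to satisfy the request, halve the requirement, and give up once the remaining free memory no longer clears it. The zone layout is invented for the example, and lowmem_reserve/alloc_flags are left out.

#include <stdio.h>

#define MAX_ORDER 5

/* Model of zone_watermark_ok() without lowmem_reserve or alloc_flags. */
static int watermark_ok(long free_pages, long mark, int order,
			const long nr_free[MAX_ORDER])
{
	long min = mark;

	free_pages -= (1L << order) - 1;      /* charge the request itself up front */
	if (free_pages <= min)
		return 0;
	for (int o = 0; o < order; o++) {
		free_pages -= nr_free[o] << o;    /* order-o pages are too small to help */
		min >>= 1;                        /* demand less from the higher orders */
		if (free_pages <= min)
			return 0;
	}
	return 1;
}

int main(void)
{
	/* Invented zone: free blocks per order 0..4. */
	long nr_free[MAX_ORDER] = { 60, 20, 6, 2, 1 };
	long total_free = 60 + 20 * 2 + 6 * 4 + 2 * 8 + 1 * 16;   /* 156 pages */

	for (int order = 0; order < MAX_ORDER; order++)
		printf("order %d: %s\n", order,
		       watermark_ok(total_free, 64, order, nr_free) ? "ok" : "fail");
	return 0;
}

With these numbers the low orders pass but order 4 fails, because almost all of the free memory sits in blocks too small to satisfy the request.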
1039 | 1039 | ||
1040 | #ifdef CONFIG_NUMA | 1040 | #ifdef CONFIG_NUMA |
1041 | /* | 1041 | /* |
1042 | * zlc_setup - Setup for "zonelist cache". Uses cached zone data to | 1042 | * zlc_setup - Setup for "zonelist cache". Uses cached zone data to |
1043 | * skip over zones that are not allowed by the cpuset, or that have | 1043 | * skip over zones that are not allowed by the cpuset, or that have |
1044 | * been recently (in last second) found to be nearly full. See further | 1044 | * been recently (in last second) found to be nearly full. See further |
1045 | * comments in mmzone.h. Reduces cache footprint of zonelist scans | 1045 | * comments in mmzone.h. Reduces cache footprint of zonelist scans |
1046 | * that have to skip over a lot of full or unallowed zones. | 1046 | * that have to skip over a lot of full or unallowed zones. |
1047 | * | 1047 | * |
1048 | * If the zonelist cache is present in the passed in zonelist, then | 1048 | * If the zonelist cache is present in the passed in zonelist, then |
1049 | * returns a pointer to the allowed node mask (either the current | 1049 | * returns a pointer to the allowed node mask (either the current |
1050 | * task's mems_allowed, or node_online_map.) | 1050 | * task's mems_allowed, or node_online_map.) |
1051 | * | 1051 | * |
1052 | * If the zonelist cache is not available for this zonelist, does | 1052 | * If the zonelist cache is not available for this zonelist, does |
1053 | * nothing and returns NULL. | 1053 | * nothing and returns NULL. |
1054 | * | 1054 | * |
1055 | * If the fullzones BITMAP in the zonelist cache is stale (more than | 1055 | * If the fullzones BITMAP in the zonelist cache is stale (more than |
1056 | * a second since last zap'd) then we zap it out (clear its bits.) | 1056 | * a second since last zap'd) then we zap it out (clear its bits.) |
1057 | * | 1057 | * |
1058 | * We hold off even calling zlc_setup, until after we've checked the | 1058 | * We hold off even calling zlc_setup, until after we've checked the |
1059 | * first zone in the zonelist, on the theory that most allocations will | 1059 | * first zone in the zonelist, on the theory that most allocations will |
1060 | * be satisfied from that first zone, so best to examine that zone as | 1060 | * be satisfied from that first zone, so best to examine that zone as |
1061 | * quickly as we can. | 1061 | * quickly as we can. |
1062 | */ | 1062 | */ |
1063 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | 1063 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) |
1064 | { | 1064 | { |
1065 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | 1065 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ |
1066 | nodemask_t *allowednodes; /* zonelist_cache approximation */ | 1066 | nodemask_t *allowednodes; /* zonelist_cache approximation */ |
1067 | 1067 | ||
1068 | zlc = zonelist->zlcache_ptr; | 1068 | zlc = zonelist->zlcache_ptr; |
1069 | if (!zlc) | 1069 | if (!zlc) |
1070 | return NULL; | 1070 | return NULL; |
1071 | 1071 | ||
1072 | if (jiffies - zlc->last_full_zap > 1 * HZ) { | 1072 | if (jiffies - zlc->last_full_zap > 1 * HZ) { |
1073 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | 1073 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); |
1074 | zlc->last_full_zap = jiffies; | 1074 | zlc->last_full_zap = jiffies; |
1075 | } | 1075 | } |
1076 | 1076 | ||
1077 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? | 1077 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? |
1078 | &cpuset_current_mems_allowed : | 1078 | &cpuset_current_mems_allowed : |
1079 | &node_online_map; | 1079 | &node_online_map; |
1080 | return allowednodes; | 1080 | return allowednodes; |
1081 | } | 1081 | } |
1082 | 1082 | ||
1083 | /* | 1083 | /* |
1084 | * Given 'z' scanning a zonelist, run a couple of quick checks to see | 1084 | * Given 'z' scanning a zonelist, run a couple of quick checks to see |
1085 | * if it is worth looking at further for free memory: | 1085 | * if it is worth looking at further for free memory: |
1086 | * 1) Check that the zone isn't thought to be full (doesn't have its | 1086 | * 1) Check that the zone isn't thought to be full (doesn't have its |
1087 | * bit set in the zonelist_cache fullzones BITMAP). | 1087 | * bit set in the zonelist_cache fullzones BITMAP). |
1088 | * 2) Check that the zone's node (obtained from the zonelist_cache | 1088 | * 2) Check that the zone's node (obtained from the zonelist_cache |
1089 | * z_to_n[] mapping) is allowed in the passed in allowednodes mask. | 1089 | * z_to_n[] mapping) is allowed in the passed in allowednodes mask. |
1090 | * Return true (non-zero) if zone is worth looking at further, or | 1090 | * Return true (non-zero) if zone is worth looking at further, or |
1091 | * else return false (zero) if it is not. | 1091 | * else return false (zero) if it is not. |
1092 | * | 1092 | * |
1093 | * This check -ignores- the distinction between various watermarks, | 1093 | * This check -ignores- the distinction between various watermarks, |
1094 | * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is | 1094 | * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is |
1095 | * found to be full for any variation of these watermarks, it will | 1095 | * found to be full for any variation of these watermarks, it will |
1096 | * be considered full for up to one second by all requests, unless | 1096 | * be considered full for up to one second by all requests, unless |
1097 | * we are so low on memory on all allowed nodes that we are forced | 1097 | * we are so low on memory on all allowed nodes that we are forced |
1098 | * into the second scan of the zonelist. | 1098 | * into the second scan of the zonelist. |
1099 | * | 1099 | * |
1100 | * In the second scan we ignore this zonelist cache and exactly | 1100 | * In the second scan we ignore this zonelist cache and exactly |
1101 | * apply the watermarks to all zones, even if it is slower to do so. | 1101 | * apply the watermarks to all zones, even if it is slower to do so. |
1102 | * We are low on memory in the second scan, and should leave no stone | 1102 | * We are low on memory in the second scan, and should leave no stone |
1103 | * unturned looking for a free page. | 1103 | * unturned looking for a free page. |
1104 | */ | 1104 | */ |
1105 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, | 1105 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, |
1106 | nodemask_t *allowednodes) | 1106 | nodemask_t *allowednodes) |
1107 | { | 1107 | { |
1108 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | 1108 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ |
1109 | int i; /* index of *z in zonelist zones */ | 1109 | int i; /* index of *z in zonelist zones */ |
1110 | int n; /* node that zone *z is on */ | 1110 | int n; /* node that zone *z is on */ |
1111 | 1111 | ||
1112 | zlc = zonelist->zlcache_ptr; | 1112 | zlc = zonelist->zlcache_ptr; |
1113 | if (!zlc) | 1113 | if (!zlc) |
1114 | return 1; | 1114 | return 1; |
1115 | 1115 | ||
1116 | i = z - zonelist->zones; | 1116 | i = z - zonelist->zones; |
1117 | n = zlc->z_to_n[i]; | 1117 | n = zlc->z_to_n[i]; |
1118 | 1118 | ||
1119 | /* This zone is worth trying if it is allowed but not full */ | 1119 | /* This zone is worth trying if it is allowed but not full */ |
1120 | return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); | 1120 | return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); |
1121 | } | 1121 | } |
1122 | 1122 | ||
1123 | /* | 1123 | /* |
1124 | * Given 'z' scanning a zonelist, set the corresponding bit in | 1124 | * Given 'z' scanning a zonelist, set the corresponding bit in |
1125 | * zlc->fullzones, so that subsequent attempts to allocate a page | 1125 | * zlc->fullzones, so that subsequent attempts to allocate a page |
1126 | * from that zone don't waste time re-examining it. | 1126 | * from that zone don't waste time re-examining it. |
1127 | */ | 1127 | */ |
1128 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) | 1128 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) |
1129 | { | 1129 | { |
1130 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ | 1130 | struct zonelist_cache *zlc; /* cached zonelist speedup info */ |
1131 | int i; /* index of *z in zonelist zones */ | 1131 | int i; /* index of *z in zonelist zones */ |
1132 | 1132 | ||
1133 | zlc = zonelist->zlcache_ptr; | 1133 | zlc = zonelist->zlcache_ptr; |
1134 | if (!zlc) | 1134 | if (!zlc) |
1135 | return; | 1135 | return; |
1136 | 1136 | ||
1137 | i = z - zonelist->zones; | 1137 | i = z - zonelist->zones; |
1138 | 1138 | ||
1139 | set_bit(i, zlc->fullzones); | 1139 | set_bit(i, zlc->fullzones); |
1140 | } | 1140 | } |
1141 | 1141 | ||
1142 | #else /* CONFIG_NUMA */ | 1142 | #else /* CONFIG_NUMA */ |
1143 | 1143 | ||
1144 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | 1144 | static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) |
1145 | { | 1145 | { |
1146 | return NULL; | 1146 | return NULL; |
1147 | } | 1147 | } |
1148 | 1148 | ||
1149 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, | 1149 | static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, |
1150 | nodemask_t *allowednodes) | 1150 | nodemask_t *allowednodes) |
1151 | { | 1151 | { |
1152 | return 1; | 1152 | return 1; |
1153 | } | 1153 | } |
1154 | 1154 | ||
1155 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) | 1155 | static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) |
1156 | { | 1156 | { |
1157 | } | 1157 | } |
1158 | #endif /* CONFIG_NUMA */ | 1158 | #endif /* CONFIG_NUMA */ |
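
Stripped of the kernel types, the zonelist cache is a per-zonelist bitmap plus a timestamp. The sketch below marks zones full, skips them on later scans, and forgets everything once the data goes stale; the one-second jiffies test is replaced by a plain counter.

#include <stdio.h>
#include <string.h>

#define NZONES 4
#define ZAP_INTERVAL 5            /* stand-in for "one second of jiffies" */

static unsigned char fullzones[NZONES];   /* model of zlc->fullzones */
static int last_zap;

/* zlc_setup(): clear the cache when it has gone stale. */
static void zlc_setup(int now)
{
	if (now - last_zap > ZAP_INTERVAL) {
		memset(fullzones, 0, sizeof(fullzones));
		last_zap = now;
	}
}

static int zone_worth_trying(int z) { return !fullzones[z]; }
static void mark_zone_full(int z)   { fullzones[z] = 1; }

int main(void)
{
	zlc_setup(0);
	mark_zone_full(0);
	mark_zone_full(1);

	for (int z = 0; z < NZONES; z++)
		printf("t=1  zone %d: %s\n", z, zone_worth_trying(z) ? "try" : "skip");

	zlc_setup(10);                /* stale: the full bits are zapped */
	for (int z = 0; z < NZONES; z++)
		printf("t=10 zone %d: %s\n", z, zone_worth_trying(z) ? "try" : "skip");
	return 0;
}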
1159 | 1159 | ||
1160 | /* | 1160 | /* |
1161 | * get_page_from_freelist goes through the zonelist trying to allocate | 1161 | * get_page_from_freelist goes through the zonelist trying to allocate |
1162 | * a page. | 1162 | * a page. |
1163 | */ | 1163 | */ |
1164 | static struct page * | 1164 | static struct page * |
1165 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, | 1165 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, |
1166 | struct zonelist *zonelist, int alloc_flags) | 1166 | struct zonelist *zonelist, int alloc_flags) |
1167 | { | 1167 | { |
1168 | struct zone **z; | 1168 | struct zone **z; |
1169 | struct page *page = NULL; | 1169 | struct page *page = NULL; |
1170 | int classzone_idx = zone_idx(zonelist->zones[0]); | 1170 | int classzone_idx = zone_idx(zonelist->zones[0]); |
1171 | struct zone *zone; | 1171 | struct zone *zone; |
1172 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ | 1172 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ |
1173 | int zlc_active = 0; /* set if using zonelist_cache */ | 1173 | int zlc_active = 0; /* set if using zonelist_cache */ |
1174 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ | 1174 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ |
1175 | 1175 | ||
1176 | zonelist_scan: | 1176 | zonelist_scan: |
1177 | /* | 1177 | /* |
1178 | * Scan zonelist, looking for a zone with enough free. | 1178 | * Scan zonelist, looking for a zone with enough free. |
1179 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 1179 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
1180 | */ | 1180 | */ |
1181 | z = zonelist->zones; | 1181 | z = zonelist->zones; |
1182 | 1182 | ||
1183 | do { | 1183 | do { |
1184 | if (NUMA_BUILD && zlc_active && | 1184 | if (NUMA_BUILD && zlc_active && |
1185 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | 1185 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) |
1186 | continue; | 1186 | continue; |
1187 | zone = *z; | 1187 | zone = *z; |
1188 | if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && | 1188 | if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && |
1189 | zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) | 1189 | zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) |
1190 | break; | 1190 | break; |
1191 | if ((alloc_flags & ALLOC_CPUSET) && | 1191 | if ((alloc_flags & ALLOC_CPUSET) && |
1192 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1192 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
1193 | goto try_next_zone; | 1193 | goto try_next_zone; |
1194 | 1194 | ||
1195 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 1195 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { |
1196 | unsigned long mark; | 1196 | unsigned long mark; |
1197 | if (alloc_flags & ALLOC_WMARK_MIN) | 1197 | if (alloc_flags & ALLOC_WMARK_MIN) |
1198 | mark = zone->pages_min; | 1198 | mark = zone->pages_min; |
1199 | else if (alloc_flags & ALLOC_WMARK_LOW) | 1199 | else if (alloc_flags & ALLOC_WMARK_LOW) |
1200 | mark = zone->pages_low; | 1200 | mark = zone->pages_low; |
1201 | else | 1201 | else |
1202 | mark = zone->pages_high; | 1202 | mark = zone->pages_high; |
1203 | if (!zone_watermark_ok(zone, order, mark, | 1203 | if (!zone_watermark_ok(zone, order, mark, |
1204 | classzone_idx, alloc_flags)) { | 1204 | classzone_idx, alloc_flags)) { |
1205 | if (!zone_reclaim_mode || | 1205 | if (!zone_reclaim_mode || |
1206 | !zone_reclaim(zone, gfp_mask, order)) | 1206 | !zone_reclaim(zone, gfp_mask, order)) |
1207 | goto this_zone_full; | 1207 | goto this_zone_full; |
1208 | } | 1208 | } |
1209 | } | 1209 | } |
1210 | 1210 | ||
1211 | page = buffered_rmqueue(zonelist, zone, order, gfp_mask); | 1211 | page = buffered_rmqueue(zonelist, zone, order, gfp_mask); |
1212 | if (page) | 1212 | if (page) |
1213 | break; | 1213 | break; |
1214 | this_zone_full: | 1214 | this_zone_full: |
1215 | if (NUMA_BUILD) | 1215 | if (NUMA_BUILD) |
1216 | zlc_mark_zone_full(zonelist, z); | 1216 | zlc_mark_zone_full(zonelist, z); |
1217 | try_next_zone: | 1217 | try_next_zone: |
1218 | if (NUMA_BUILD && !did_zlc_setup) { | 1218 | if (NUMA_BUILD && !did_zlc_setup) { |
1219 | /* we do zlc_setup after the first zone is tried */ | 1219 | /* we do zlc_setup after the first zone is tried */ |
1220 | allowednodes = zlc_setup(zonelist, alloc_flags); | 1220 | allowednodes = zlc_setup(zonelist, alloc_flags); |
1221 | zlc_active = 1; | 1221 | zlc_active = 1; |
1222 | did_zlc_setup = 1; | 1222 | did_zlc_setup = 1; |
1223 | } | 1223 | } |
1224 | } while (*(++z) != NULL); | 1224 | } while (*(++z) != NULL); |
1225 | 1225 | ||
1226 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { | 1226 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { |
1227 | /* Disable zlc cache for second zonelist scan */ | 1227 | /* Disable zlc cache for second zonelist scan */ |
1228 | zlc_active = 0; | 1228 | zlc_active = 0; |
1229 | goto zonelist_scan; | 1229 | goto zonelist_scan; |
1230 | } | 1230 | } |
1231 | return page; | 1231 | return page; |
1232 | } | 1232 | } |
1233 | 1233 | ||
1234 | /* | 1234 | /* |
1235 | * This is the 'heart' of the zoned buddy allocator. | 1235 | * This is the 'heart' of the zoned buddy allocator. |
1236 | */ | 1236 | */ |
1237 | struct page * fastcall | 1237 | struct page * fastcall |
1238 | __alloc_pages(gfp_t gfp_mask, unsigned int order, | 1238 | __alloc_pages(gfp_t gfp_mask, unsigned int order, |
1239 | struct zonelist *zonelist) | 1239 | struct zonelist *zonelist) |
1240 | { | 1240 | { |
1241 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 1241 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
1242 | struct zone **z; | 1242 | struct zone **z; |
1243 | struct page *page; | 1243 | struct page *page; |
1244 | struct reclaim_state reclaim_state; | 1244 | struct reclaim_state reclaim_state; |
1245 | struct task_struct *p = current; | 1245 | struct task_struct *p = current; |
1246 | int do_retry; | 1246 | int do_retry; |
1247 | int alloc_flags; | 1247 | int alloc_flags; |
1248 | int did_some_progress; | 1248 | int did_some_progress; |
1249 | 1249 | ||
1250 | might_sleep_if(wait); | 1250 | might_sleep_if(wait); |
1251 | 1251 | ||
1252 | if (should_fail_alloc_page(gfp_mask, order)) | 1252 | if (should_fail_alloc_page(gfp_mask, order)) |
1253 | return NULL; | 1253 | return NULL; |
1254 | 1254 | ||
1255 | restart: | 1255 | restart: |
1256 | z = zonelist->zones; /* the list of zones suitable for gfp_mask */ | 1256 | z = zonelist->zones; /* the list of zones suitable for gfp_mask */ |
1257 | 1257 | ||
1258 | if (unlikely(*z == NULL)) { | 1258 | if (unlikely(*z == NULL)) { |
1259 | /* Should this ever happen?? */ | 1259 | /* Should this ever happen?? */ |
1260 | return NULL; | 1260 | return NULL; |
1261 | } | 1261 | } |
1262 | 1262 | ||
1263 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, | 1263 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, |
1264 | zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); | 1264 | zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); |
1265 | if (page) | 1265 | if (page) |
1266 | goto got_pg; | 1266 | goto got_pg; |
1267 | 1267 | ||
1268 | /* | 1268 | /* |
1269 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and | 1269 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and |
1270 | * __GFP_NOWARN set) should not cause reclaim since the subsystem | 1270 | * __GFP_NOWARN set) should not cause reclaim since the subsystem |
1271 | * (e.g. slab) using GFP_THISNODE may choose to trigger reclaim | 1271 | * (e.g. slab) using GFP_THISNODE may choose to trigger reclaim |
1272 | * using a larger set of nodes after it has established that the | 1272 | * using a larger set of nodes after it has established that the |
1273 | * allowed per node queues are empty and that nodes are | 1273 | * allowed per node queues are empty and that nodes are |
1274 | * over allocated. | 1274 | * over allocated. |
1275 | */ | 1275 | */ |
1276 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | 1276 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) |
1277 | goto nopage; | 1277 | goto nopage; |
1278 | 1278 | ||
1279 | for (z = zonelist->zones; *z; z++) | 1279 | for (z = zonelist->zones; *z; z++) |
1280 | wakeup_kswapd(*z, order); | 1280 | wakeup_kswapd(*z, order); |
1281 | 1281 | ||
1282 | /* | 1282 | /* |
1283 | * OK, we're below the kswapd watermark and have kicked background | 1283 | * OK, we're below the kswapd watermark and have kicked background |
1284 | * reclaim. Now things get more complex, so set up alloc_flags according | 1284 | * reclaim. Now things get more complex, so set up alloc_flags according |
1285 | * to how we want to proceed. | 1285 | * to how we want to proceed. |
1286 | * | 1286 | * |
1287 | * The caller may dip into page reserves a bit more if the caller | 1287 | * The caller may dip into page reserves a bit more if the caller |
1288 | * cannot run direct reclaim, or if the caller has realtime scheduling | 1288 | * cannot run direct reclaim, or if the caller has realtime scheduling |
1289 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will | 1289 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will |
1290 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). | 1290 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). |
1291 | */ | 1291 | */ |
1292 | alloc_flags = ALLOC_WMARK_MIN; | 1292 | alloc_flags = ALLOC_WMARK_MIN; |
1293 | if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) | 1293 | if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) |
1294 | alloc_flags |= ALLOC_HARDER; | 1294 | alloc_flags |= ALLOC_HARDER; |
1295 | if (gfp_mask & __GFP_HIGH) | 1295 | if (gfp_mask & __GFP_HIGH) |
1296 | alloc_flags |= ALLOC_HIGH; | 1296 | alloc_flags |= ALLOC_HIGH; |
1297 | if (wait) | 1297 | if (wait) |
1298 | alloc_flags |= ALLOC_CPUSET; | 1298 | alloc_flags |= ALLOC_CPUSET; |
1299 | 1299 | ||
1300 | /* | 1300 | /* |
1301 | * Go through the zonelist again. Let __GFP_HIGH and allocations | 1301 | * Go through the zonelist again. Let __GFP_HIGH and allocations |
1302 | * coming from realtime tasks go deeper into reserves. | 1302 | * coming from realtime tasks go deeper into reserves. |
1303 | * | 1303 | * |
1304 | * This is the last chance, in general, before the goto nopage. | 1304 | * This is the last chance, in general, before the goto nopage. |
1305 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. | 1305 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. |
1306 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 1306 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
1307 | */ | 1307 | */ |
1308 | page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags); | 1308 | page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags); |
1309 | if (page) | 1309 | if (page) |
1310 | goto got_pg; | 1310 | goto got_pg; |
1311 | 1311 | ||
1312 | /* This allocation should allow future memory freeing. */ | 1312 | /* This allocation should allow future memory freeing. */ |
1313 | 1313 | ||
1314 | rebalance: | 1314 | rebalance: |
1315 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) | 1315 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) |
1316 | && !in_interrupt()) { | 1316 | && !in_interrupt()) { |
1317 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { | 1317 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { |
1318 | nofail_alloc: | 1318 | nofail_alloc: |
1319 | /* go through the zonelist yet again, ignoring mins */ | 1319 | /* go through the zonelist yet again, ignoring mins */ |
1320 | page = get_page_from_freelist(gfp_mask, order, | 1320 | page = get_page_from_freelist(gfp_mask, order, |
1321 | zonelist, ALLOC_NO_WATERMARKS); | 1321 | zonelist, ALLOC_NO_WATERMARKS); |
1322 | if (page) | 1322 | if (page) |
1323 | goto got_pg; | 1323 | goto got_pg; |
1324 | if (gfp_mask & __GFP_NOFAIL) { | 1324 | if (gfp_mask & __GFP_NOFAIL) { |
1325 | congestion_wait(WRITE, HZ/50); | 1325 | congestion_wait(WRITE, HZ/50); |
1326 | goto nofail_alloc; | 1326 | goto nofail_alloc; |
1327 | } | 1327 | } |
1328 | } | 1328 | } |
1329 | goto nopage; | 1329 | goto nopage; |
1330 | } | 1330 | } |
1331 | 1331 | ||
1332 | /* Atomic allocations - we can't balance anything */ | 1332 | /* Atomic allocations - we can't balance anything */ |
1333 | if (!wait) | 1333 | if (!wait) |
1334 | goto nopage; | 1334 | goto nopage; |
1335 | 1335 | ||
1336 | cond_resched(); | 1336 | cond_resched(); |
1337 | 1337 | ||
1338 | /* We now go into synchronous reclaim */ | 1338 | /* We now go into synchronous reclaim */ |
1339 | cpuset_memory_pressure_bump(); | 1339 | cpuset_memory_pressure_bump(); |
1340 | p->flags |= PF_MEMALLOC; | 1340 | p->flags |= PF_MEMALLOC; |
1341 | reclaim_state.reclaimed_slab = 0; | 1341 | reclaim_state.reclaimed_slab = 0; |
1342 | p->reclaim_state = &reclaim_state; | 1342 | p->reclaim_state = &reclaim_state; |
1343 | 1343 | ||
1344 | did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask); | 1344 | did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask); |
1345 | 1345 | ||
1346 | p->reclaim_state = NULL; | 1346 | p->reclaim_state = NULL; |
1347 | p->flags &= ~PF_MEMALLOC; | 1347 | p->flags &= ~PF_MEMALLOC; |
1348 | 1348 | ||
1349 | cond_resched(); | 1349 | cond_resched(); |
1350 | 1350 | ||
1351 | if (likely(did_some_progress)) { | 1351 | if (likely(did_some_progress)) { |
1352 | page = get_page_from_freelist(gfp_mask, order, | 1352 | page = get_page_from_freelist(gfp_mask, order, |
1353 | zonelist, alloc_flags); | 1353 | zonelist, alloc_flags); |
1354 | if (page) | 1354 | if (page) |
1355 | goto got_pg; | 1355 | goto got_pg; |
1356 | } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { | 1356 | } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { |
1357 | /* | 1357 | /* |
1358 | * Go through the zonelist yet one more time, keeping a | 1358 | * Go through the zonelist yet one more time, keeping a |
1359 | * very high watermark here; this is only to catch | 1359 | * very high watermark here; this is only to catch |
1360 | * a parallel oom killing, and we must fail if we're still | 1360 | * a parallel oom killing, and we must fail if we're still |
1361 | * under heavy pressure. | 1361 | * under heavy pressure. |
1362 | */ | 1362 | */ |
1363 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, | 1363 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, |
1364 | zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); | 1364 | zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); |
1365 | if (page) | 1365 | if (page) |
1366 | goto got_pg; | 1366 | goto got_pg; |
1367 | 1367 | ||
1368 | out_of_memory(zonelist, gfp_mask, order); | 1368 | out_of_memory(zonelist, gfp_mask, order); |
1369 | goto restart; | 1369 | goto restart; |
1370 | } | 1370 | } |
1371 | 1371 | ||
1372 | /* | 1372 | /* |
1373 | * Don't let big-order allocations loop unless the caller explicitly | 1373 | * Don't let big-order allocations loop unless the caller explicitly |
1374 | * requests that. Wait for some write requests to complete then retry. | 1374 | * requests that. Wait for some write requests to complete then retry. |
1375 | * | 1375 | * |
1376 | * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order | 1376 | * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order |
1377 | * <= 3, but that may not be true in other implementations. | 1377 | * <= 3, but that may not be true in other implementations. |
1378 | */ | 1378 | */ |
1379 | do_retry = 0; | 1379 | do_retry = 0; |
1380 | if (!(gfp_mask & __GFP_NORETRY)) { | 1380 | if (!(gfp_mask & __GFP_NORETRY)) { |
1381 | if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) | 1381 | if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) |
1382 | do_retry = 1; | 1382 | do_retry = 1; |
1383 | if (gfp_mask & __GFP_NOFAIL) | 1383 | if (gfp_mask & __GFP_NOFAIL) |
1384 | do_retry = 1; | 1384 | do_retry = 1; |
1385 | } | 1385 | } |
1386 | if (do_retry) { | 1386 | if (do_retry) { |
1387 | congestion_wait(WRITE, HZ/50); | 1387 | congestion_wait(WRITE, HZ/50); |
1388 | goto rebalance; | 1388 | goto rebalance; |
1389 | } | 1389 | } |
1390 | 1390 | ||
1391 | nopage: | 1391 | nopage: |
1392 | if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { | 1392 | if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { |
1393 | printk(KERN_WARNING "%s: page allocation failure." | 1393 | printk(KERN_WARNING "%s: page allocation failure." |
1394 | " order:%d, mode:0x%x\n", | 1394 | " order:%d, mode:0x%x\n", |
1395 | p->comm, order, gfp_mask); | 1395 | p->comm, order, gfp_mask); |
1396 | dump_stack(); | 1396 | dump_stack(); |
1397 | show_mem(); | 1397 | show_mem(); |
1398 | } | 1398 | } |
1399 | got_pg: | 1399 | got_pg: |
1400 | return page; | 1400 | return page; |
1401 | } | 1401 | } |
1402 | 1402 | ||
1403 | EXPORT_SYMBOL(__alloc_pages); | 1403 | EXPORT_SYMBOL(__alloc_pages); |
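
The retry policy at the bottom of __alloc_pages() boils down to three tests on the gfp mask and the order. This stand-alone model mirrors the do_retry computation: small orders retry by default, __GFP_NORETRY always gives up, and __GFP_REPEAT or __GFP_NOFAIL force retries even for large orders (the gfp bit values here are invented for the model).

#include <stdio.h>

/* Simplified gfp bits for the model only. */
#define GFPM_NORETRY 0x1
#define GFPM_REPEAT  0x2
#define GFPM_NOFAIL  0x4

/* Mirrors the do_retry computation in __alloc_pages(). */
static int should_retry(unsigned int gfp, unsigned int order)
{
	int do_retry = 0;

	if (!(gfp & GFPM_NORETRY)) {
		if (order <= 3 || (gfp & GFPM_REPEAT))
			do_retry = 1;
		if (gfp & GFPM_NOFAIL)
			do_retry = 1;
	}
	return do_retry;
}

int main(void)
{
	struct { const char *name; unsigned int gfp; } cases[] = {
		{ "plain",         0 },
		{ "__GFP_NORETRY", GFPM_NORETRY },
		{ "__GFP_REPEAT",  GFPM_REPEAT },
		{ "__GFP_NOFAIL",  GFPM_NOFAIL },
	};

	for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
		for (unsigned int order = 0; order <= 5; order += 5)
			printf("%-13s order %u: %s\n", cases[i].name, order,
			       should_retry(cases[i].gfp, order) ? "retry" : "give up");
	return 0;
}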
1404 | 1404 | ||
1405 | /* | 1405 | /* |
1406 | * Common helper functions. | 1406 | * Common helper functions. |
1407 | */ | 1407 | */ |
1408 | fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) | 1408 | fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) |
1409 | { | 1409 | { |
1410 | struct page * page; | 1410 | struct page * page; |
1411 | page = alloc_pages(gfp_mask, order); | 1411 | page = alloc_pages(gfp_mask, order); |
1412 | if (!page) | 1412 | if (!page) |
1413 | return 0; | 1413 | return 0; |
1414 | return (unsigned long) page_address(page); | 1414 | return (unsigned long) page_address(page); |
1415 | } | 1415 | } |
1416 | 1416 | ||
1417 | EXPORT_SYMBOL(__get_free_pages); | 1417 | EXPORT_SYMBOL(__get_free_pages); |
1418 | 1418 | ||
1419 | fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) | 1419 | fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) |
1420 | { | 1420 | { |
1421 | struct page * page; | 1421 | struct page * page; |
1422 | 1422 | ||
1423 | /* | 1423 | /* |
1424 | * get_zeroed_page() returns a directly mapped (lowmem) address, which | 1424 | * get_zeroed_page() returns a directly mapped (lowmem) address, which |
1425 | * cannot represent a highmem page | 1425 | * cannot represent a highmem page |
1426 | */ | 1426 | */ |
1427 | VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); | 1427 | VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); |
1428 | 1428 | ||
1429 | page = alloc_pages(gfp_mask | __GFP_ZERO, 0); | 1429 | page = alloc_pages(gfp_mask | __GFP_ZERO, 0); |
1430 | if (page) | 1430 | if (page) |
1431 | return (unsigned long) page_address(page); | 1431 | return (unsigned long) page_address(page); |
1432 | return 0; | 1432 | return 0; |
1433 | } | 1433 | } |
1434 | 1434 | ||
1435 | EXPORT_SYMBOL(get_zeroed_page); | 1435 | EXPORT_SYMBOL(get_zeroed_page); |
1436 | 1436 | ||
1437 | void __pagevec_free(struct pagevec *pvec) | 1437 | void __pagevec_free(struct pagevec *pvec) |
1438 | { | 1438 | { |
1439 | int i = pagevec_count(pvec); | 1439 | int i = pagevec_count(pvec); |
1440 | 1440 | ||
1441 | while (--i >= 0) | 1441 | while (--i >= 0) |
1442 | free_hot_cold_page(pvec->pages[i], pvec->cold); | 1442 | free_hot_cold_page(pvec->pages[i], pvec->cold); |
1443 | } | 1443 | } |
1444 | 1444 | ||
1445 | fastcall void __free_pages(struct page *page, unsigned int order) | 1445 | fastcall void __free_pages(struct page *page, unsigned int order) |
1446 | { | 1446 | { |
1447 | if (put_page_testzero(page)) { | 1447 | if (put_page_testzero(page)) { |
1448 | if (order == 0) | 1448 | if (order == 0) |
1449 | free_hot_page(page); | 1449 | free_hot_page(page); |
1450 | else | 1450 | else |
1451 | __free_pages_ok(page, order); | 1451 | __free_pages_ok(page, order); |
1452 | } | 1452 | } |
1453 | } | 1453 | } |
1454 | 1454 | ||
1455 | EXPORT_SYMBOL(__free_pages); | 1455 | EXPORT_SYMBOL(__free_pages); |
1456 | 1456 | ||
1457 | fastcall void free_pages(unsigned long addr, unsigned int order) | 1457 | fastcall void free_pages(unsigned long addr, unsigned int order) |
1458 | { | 1458 | { |
1459 | if (addr != 0) { | 1459 | if (addr != 0) { |
1460 | VM_BUG_ON(!virt_addr_valid((void *)addr)); | 1460 | VM_BUG_ON(!virt_addr_valid((void *)addr)); |
1461 | __free_pages(virt_to_page((void *)addr), order); | 1461 | __free_pages(virt_to_page((void *)addr), order); |
1462 | } | 1462 | } |
1463 | } | 1463 | } |
1464 | 1464 | ||
1465 | EXPORT_SYMBOL(free_pages); | 1465 | EXPORT_SYMBOL(free_pages); |
1466 | 1466 | ||
1467 | static unsigned int nr_free_zone_pages(int offset) | 1467 | static unsigned int nr_free_zone_pages(int offset) |
1468 | { | 1468 | { |
1469 | /* Just pick one node, since fallback list is circular */ | 1469 | /* Just pick one node, since fallback list is circular */ |
1470 | pg_data_t *pgdat = NODE_DATA(numa_node_id()); | 1470 | pg_data_t *pgdat = NODE_DATA(numa_node_id()); |
1471 | unsigned int sum = 0; | 1471 | unsigned int sum = 0; |
1472 | 1472 | ||
1473 | struct zonelist *zonelist = pgdat->node_zonelists + offset; | 1473 | struct zonelist *zonelist = pgdat->node_zonelists + offset; |
1474 | struct zone **zonep = zonelist->zones; | 1474 | struct zone **zonep = zonelist->zones; |
1475 | struct zone *zone; | 1475 | struct zone *zone; |
1476 | 1476 | ||
1477 | for (zone = *zonep++; zone; zone = *zonep++) { | 1477 | for (zone = *zonep++; zone; zone = *zonep++) { |
1478 | unsigned long size = zone->present_pages; | 1478 | unsigned long size = zone->present_pages; |
1479 | unsigned long high = zone->pages_high; | 1479 | unsigned long high = zone->pages_high; |
1480 | if (size > high) | 1480 | if (size > high) |
1481 | sum += size - high; | 1481 | sum += size - high; |
1482 | } | 1482 | } |
1483 | 1483 | ||
1484 | return sum; | 1484 | return sum; |
1485 | } | 1485 | } |
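
In user space the same calculation is a clamped sum over the zones of one node's fallback list: each zone contributes whatever it holds above its high watermark. The zone sizes below are invented.

#include <stdio.h>

struct zone_model {
	const char   *name;
	unsigned long present_pages;
	unsigned long pages_high;
};

int main(void)
{
	/* Invented zones for one node's fallback list. */
	struct zone_model zones[] = {
		{ "DMA",      4096,  128 },
		{ "Normal", 221184, 1024 },
		{ "HighMem",     0,    0 },     /* unpopulated: contributes nothing */
	};
	unsigned long sum = 0;

	for (unsigned int i = 0; i < sizeof(zones) / sizeof(zones[0]); i++)
		if (zones[i].present_pages > zones[i].pages_high)
			sum += zones[i].present_pages - zones[i].pages_high;

	printf("allocatable pages: %lu\n", sum);
	return 0;
}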
1486 | 1486 | ||
1487 | /* | 1487 | /* |
1488 | * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL | 1488 | * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL |
1489 | */ | 1489 | */ |
1490 | unsigned int nr_free_buffer_pages(void) | 1490 | unsigned int nr_free_buffer_pages(void) |
1491 | { | 1491 | { |
1492 | return nr_free_zone_pages(gfp_zone(GFP_USER)); | 1492 | return nr_free_zone_pages(gfp_zone(GFP_USER)); |
1493 | } | 1493 | } |
1494 | 1494 | ||
1495 | /* | 1495 | /* |
1496 | * Amount of free RAM allocatable within all zones | 1496 | * Amount of free RAM allocatable within all zones |
1497 | */ | 1497 | */ |
1498 | unsigned int nr_free_pagecache_pages(void) | 1498 | unsigned int nr_free_pagecache_pages(void) |
1499 | { | 1499 | { |
1500 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); | 1500 | return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); |
1501 | } | 1501 | } |
1502 | 1502 | ||
1503 | static inline void show_node(struct zone *zone) | 1503 | static inline void show_node(struct zone *zone) |
1504 | { | 1504 | { |
1505 | if (NUMA_BUILD) | 1505 | if (NUMA_BUILD) |
1506 | printk("Node %d ", zone_to_nid(zone)); | 1506 | printk("Node %d ", zone_to_nid(zone)); |
1507 | } | 1507 | } |
1508 | 1508 | ||
1509 | void si_meminfo(struct sysinfo *val) | 1509 | void si_meminfo(struct sysinfo *val) |
1510 | { | 1510 | { |
1511 | val->totalram = totalram_pages; | 1511 | val->totalram = totalram_pages; |
1512 | val->sharedram = 0; | 1512 | val->sharedram = 0; |
1513 | val->freeram = global_page_state(NR_FREE_PAGES); | 1513 | val->freeram = global_page_state(NR_FREE_PAGES); |
1514 | val->bufferram = nr_blockdev_pages(); | 1514 | val->bufferram = nr_blockdev_pages(); |
1515 | val->totalhigh = totalhigh_pages; | 1515 | val->totalhigh = totalhigh_pages; |
1516 | val->freehigh = nr_free_highpages(); | 1516 | val->freehigh = nr_free_highpages(); |
1517 | val->mem_unit = PAGE_SIZE; | 1517 | val->mem_unit = PAGE_SIZE; |
1518 | } | 1518 | } |
1519 | 1519 | ||
1520 | EXPORT_SYMBOL(si_meminfo); | 1520 | EXPORT_SYMBOL(si_meminfo); |
1521 | 1521 | ||
1522 | #ifdef CONFIG_NUMA | 1522 | #ifdef CONFIG_NUMA |
1523 | void si_meminfo_node(struct sysinfo *val, int nid) | 1523 | void si_meminfo_node(struct sysinfo *val, int nid) |
1524 | { | 1524 | { |
1525 | pg_data_t *pgdat = NODE_DATA(nid); | 1525 | pg_data_t *pgdat = NODE_DATA(nid); |
1526 | 1526 | ||
1527 | val->totalram = pgdat->node_present_pages; | 1527 | val->totalram = pgdat->node_present_pages; |
1528 | val->freeram = node_page_state(nid, NR_FREE_PAGES); | 1528 | val->freeram = node_page_state(nid, NR_FREE_PAGES); |
1529 | #ifdef CONFIG_HIGHMEM | 1529 | #ifdef CONFIG_HIGHMEM |
1530 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; | 1530 | val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; |
1531 | val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], | 1531 | val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], |
1532 | NR_FREE_PAGES); | 1532 | NR_FREE_PAGES); |
1533 | #else | 1533 | #else |
1534 | val->totalhigh = 0; | 1534 | val->totalhigh = 0; |
1535 | val->freehigh = 0; | 1535 | val->freehigh = 0; |
1536 | #endif | 1536 | #endif |
1537 | val->mem_unit = PAGE_SIZE; | 1537 | val->mem_unit = PAGE_SIZE; |
1538 | } | 1538 | } |
1539 | #endif | 1539 | #endif |
1540 | 1540 | ||
1541 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 1541 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
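
With 4 KiB pages (PAGE_SHIFT = 12) the K() macro is a left shift by two, i.e. kilobytes = pages * 4. A quick stand-alone check, assuming that page size:

#include <stdio.h>

#define PAGE_SHIFT 12                      /* assumes 4 KiB pages */
#define K(x) ((x) << (PAGE_SHIFT - 10))    /* pages -> kilobytes */

int main(void)
{
	printf("%lu pages = %lu kB\n", 1024UL, K(1024UL));   /* prints 4096 kB */
	return 0;
}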
1542 | 1542 | ||
1543 | /* | 1543 | /* |
1544 | * Show free area list (used inside shift_scroll-lock stuff) | 1544 | * Show free area list (used inside shift_scroll-lock stuff) |
1545 | * We also calculate the percentage fragmentation. We do this by counting the | 1545 | * We also calculate the percentage fragmentation. We do this by counting the |
1546 | * memory on each free list with the exception of the first item on the list. | 1546 | * memory on each free list with the exception of the first item on the list. |
1547 | */ | 1547 | */ |
1548 | void show_free_areas(void) | 1548 | void show_free_areas(void) |
1549 | { | 1549 | { |
1550 | int cpu; | 1550 | int cpu; |
1551 | struct zone *zone; | 1551 | struct zone *zone; |
1552 | 1552 | ||
1553 | for_each_zone(zone) { | 1553 | for_each_zone(zone) { |
1554 | if (!populated_zone(zone)) | 1554 | if (!populated_zone(zone)) |
1555 | continue; | 1555 | continue; |
1556 | 1556 | ||
1557 | show_node(zone); | 1557 | show_node(zone); |
1558 | printk("%s per-cpu:\n", zone->name); | 1558 | printk("%s per-cpu:\n", zone->name); |
1559 | 1559 | ||
1560 | for_each_online_cpu(cpu) { | 1560 | for_each_online_cpu(cpu) { |
1561 | struct per_cpu_pageset *pageset; | 1561 | struct per_cpu_pageset *pageset; |
1562 | 1562 | ||
1563 | pageset = zone_pcp(zone, cpu); | 1563 | pageset = zone_pcp(zone, cpu); |
1564 | 1564 | ||
1565 | printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d " | 1565 | printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d " |
1566 | "Cold: hi:%5d, btch:%4d usd:%4d\n", | 1566 | "Cold: hi:%5d, btch:%4d usd:%4d\n", |
1567 | cpu, pageset->pcp[0].high, | 1567 | cpu, pageset->pcp[0].high, |
1568 | pageset->pcp[0].batch, pageset->pcp[0].count, | 1568 | pageset->pcp[0].batch, pageset->pcp[0].count, |
1569 | pageset->pcp[1].high, pageset->pcp[1].batch, | 1569 | pageset->pcp[1].high, pageset->pcp[1].batch, |
1570 | pageset->pcp[1].count); | 1570 | pageset->pcp[1].count); |
1571 | } | 1571 | } |
1572 | } | 1572 | } |
1573 | 1573 | ||
1574 | printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n" | 1574 | printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n" |
1575 | " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", | 1575 | " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", |
1576 | global_page_state(NR_ACTIVE), | 1576 | global_page_state(NR_ACTIVE), |
1577 | global_page_state(NR_INACTIVE), | 1577 | global_page_state(NR_INACTIVE), |
1578 | global_page_state(NR_FILE_DIRTY), | 1578 | global_page_state(NR_FILE_DIRTY), |
1579 | global_page_state(NR_WRITEBACK), | 1579 | global_page_state(NR_WRITEBACK), |
1580 | global_page_state(NR_UNSTABLE_NFS), | 1580 | global_page_state(NR_UNSTABLE_NFS), |
1581 | global_page_state(NR_FREE_PAGES), | 1581 | global_page_state(NR_FREE_PAGES), |
1582 | global_page_state(NR_SLAB_RECLAIMABLE) + | 1582 | global_page_state(NR_SLAB_RECLAIMABLE) + |
1583 | global_page_state(NR_SLAB_UNRECLAIMABLE), | 1583 | global_page_state(NR_SLAB_UNRECLAIMABLE), |
1584 | global_page_state(NR_FILE_MAPPED), | 1584 | global_page_state(NR_FILE_MAPPED), |
1585 | global_page_state(NR_PAGETABLE), | 1585 | global_page_state(NR_PAGETABLE), |
1586 | global_page_state(NR_BOUNCE)); | 1586 | global_page_state(NR_BOUNCE)); |
1587 | 1587 | ||
1588 | for_each_zone(zone) { | 1588 | for_each_zone(zone) { |
1589 | int i; | 1589 | int i; |
1590 | 1590 | ||
1591 | if (!populated_zone(zone)) | 1591 | if (!populated_zone(zone)) |
1592 | continue; | 1592 | continue; |
1593 | 1593 | ||
1594 | show_node(zone); | 1594 | show_node(zone); |
1595 | printk("%s" | 1595 | printk("%s" |
1596 | " free:%lukB" | 1596 | " free:%lukB" |
1597 | " min:%lukB" | 1597 | " min:%lukB" |
1598 | " low:%lukB" | 1598 | " low:%lukB" |
1599 | " high:%lukB" | 1599 | " high:%lukB" |
1600 | " active:%lukB" | 1600 | " active:%lukB" |
1601 | " inactive:%lukB" | 1601 | " inactive:%lukB" |
1602 | " present:%lukB" | 1602 | " present:%lukB" |
1603 | " pages_scanned:%lu" | 1603 | " pages_scanned:%lu" |
1604 | " all_unreclaimable? %s" | 1604 | " all_unreclaimable? %s" |
1605 | "\n", | 1605 | "\n", |
1606 | zone->name, | 1606 | zone->name, |
1607 | K(zone_page_state(zone, NR_FREE_PAGES)), | 1607 | K(zone_page_state(zone, NR_FREE_PAGES)), |
1608 | K(zone->pages_min), | 1608 | K(zone->pages_min), |
1609 | K(zone->pages_low), | 1609 | K(zone->pages_low), |
1610 | K(zone->pages_high), | 1610 | K(zone->pages_high), |
1611 | K(zone_page_state(zone, NR_ACTIVE)), | 1611 | K(zone_page_state(zone, NR_ACTIVE)), |
1612 | K(zone_page_state(zone, NR_INACTIVE)), | 1612 | K(zone_page_state(zone, NR_INACTIVE)), |
1613 | K(zone->present_pages), | 1613 | K(zone->present_pages), |
1614 | zone->pages_scanned, | 1614 | zone->pages_scanned, |
1615 | (zone->all_unreclaimable ? "yes" : "no") | 1615 | (zone->all_unreclaimable ? "yes" : "no") |
1616 | ); | 1616 | ); |
1617 | printk("lowmem_reserve[]:"); | 1617 | printk("lowmem_reserve[]:"); |
1618 | for (i = 0; i < MAX_NR_ZONES; i++) | 1618 | for (i = 0; i < MAX_NR_ZONES; i++) |
1619 | printk(" %lu", zone->lowmem_reserve[i]); | 1619 | printk(" %lu", zone->lowmem_reserve[i]); |
1620 | printk("\n"); | 1620 | printk("\n"); |
1621 | } | 1621 | } |
1622 | 1622 | ||
1623 | for_each_zone(zone) { | 1623 | for_each_zone(zone) { |
1624 | unsigned long nr[MAX_ORDER], flags, order, total = 0; | 1624 | unsigned long nr[MAX_ORDER], flags, order, total = 0; |
1625 | 1625 | ||
1626 | if (!populated_zone(zone)) | 1626 | if (!populated_zone(zone)) |
1627 | continue; | 1627 | continue; |
1628 | 1628 | ||
1629 | show_node(zone); | 1629 | show_node(zone); |
1630 | printk("%s: ", zone->name); | 1630 | printk("%s: ", zone->name); |
1631 | 1631 | ||
1632 | spin_lock_irqsave(&zone->lock, flags); | 1632 | spin_lock_irqsave(&zone->lock, flags); |
1633 | for (order = 0; order < MAX_ORDER; order++) { | 1633 | for (order = 0; order < MAX_ORDER; order++) { |
1634 | nr[order] = zone->free_area[order].nr_free; | 1634 | nr[order] = zone->free_area[order].nr_free; |
1635 | total += nr[order] << order; | 1635 | total += nr[order] << order; |
1636 | } | 1636 | } |
1637 | spin_unlock_irqrestore(&zone->lock, flags); | 1637 | spin_unlock_irqrestore(&zone->lock, flags); |
1638 | for (order = 0; order < MAX_ORDER; order++) | 1638 | for (order = 0; order < MAX_ORDER; order++) |
1639 | printk("%lu*%lukB ", nr[order], K(1UL) << order); | 1639 | printk("%lu*%lukB ", nr[order], K(1UL) << order); |
1640 | printk("= %lukB\n", K(total)); | 1640 | printk("= %lukB\n", K(total)); |
1641 | } | 1641 | } |
1642 | 1642 | ||
1643 | show_swap_cache_info(); | 1643 | show_swap_cache_info(); |
1644 | } | 1644 | } |
1645 | 1645 | ||
1646 | /* | 1646 | /* |
1647 | * Builds allocation fallback zone lists. | 1647 | * Builds allocation fallback zone lists. |
1648 | * | 1648 | * |
1649 | * Add all populated zones of a node to the zonelist. | 1649 | * Add all populated zones of a node to the zonelist. |
1650 | */ | 1650 | */ |
1651 | static int __meminit build_zonelists_node(pg_data_t *pgdat, | 1651 | static int __meminit build_zonelists_node(pg_data_t *pgdat, |
1652 | struct zonelist *zonelist, int nr_zones, enum zone_type zone_type) | 1652 | struct zonelist *zonelist, int nr_zones, enum zone_type zone_type) |
1653 | { | 1653 | { |
1654 | struct zone *zone; | 1654 | struct zone *zone; |
1655 | 1655 | ||
1656 | BUG_ON(zone_type >= MAX_NR_ZONES); | 1656 | BUG_ON(zone_type >= MAX_NR_ZONES); |
1657 | zone_type++; | 1657 | zone_type++; |
1658 | 1658 | ||
1659 | do { | 1659 | do { |
1660 | zone_type--; | 1660 | zone_type--; |
1661 | zone = pgdat->node_zones + zone_type; | 1661 | zone = pgdat->node_zones + zone_type; |
1662 | if (populated_zone(zone)) { | 1662 | if (populated_zone(zone)) { |
1663 | zonelist->zones[nr_zones++] = zone; | 1663 | zonelist->zones[nr_zones++] = zone; |
1664 | check_highest_zone(zone_type); | 1664 | check_highest_zone(zone_type); |
1665 | } | 1665 | } |
1666 | 1666 | ||
1667 | } while (zone_type); | 1667 | } while (zone_type); |
1668 | return nr_zones; | 1668 | return nr_zones; |
1669 | } | 1669 | } |
1670 | 1670 | ||
1671 | #ifdef CONFIG_NUMA | 1671 | #ifdef CONFIG_NUMA |
1672 | #define MAX_NODE_LOAD (num_online_nodes()) | 1672 | #define MAX_NODE_LOAD (num_online_nodes()) |
1673 | static int __meminitdata node_load[MAX_NUMNODES]; | 1673 | static int __meminitdata node_load[MAX_NUMNODES]; |
1674 | /** | 1674 | /** |
1675 | * find_next_best_node - find the next node that should appear in a given node's fallback list | 1675 | * find_next_best_node - find the next node that should appear in a given node's fallback list |
1676 | * @node: node whose fallback list we're appending | 1676 | * @node: node whose fallback list we're appending |
1677 | * @used_node_mask: nodemask_t of already used nodes | 1677 | * @used_node_mask: nodemask_t of already used nodes |
1678 | * | 1678 | * |
1679 | * We use a number of factors to determine which is the next node that should | 1679 | * We use a number of factors to determine which is the next node that should |
1680 | * appear on a given node's fallback list. The node should not have appeared | 1680 | * appear on a given node's fallback list. The node should not have appeared |
1681 | * already in @node's fallback list, and it should be the next closest node | 1681 | * already in @node's fallback list, and it should be the next closest node |
1682 | * according to the distance array (which contains arbitrary distance values | 1682 | * according to the distance array (which contains arbitrary distance values |
1683 | * from each node to each node in the system), and should also prefer nodes | 1683 | * from each node to each node in the system), and should also prefer nodes |
1684 | * with no CPUs, since presumably they'll have very little allocation pressure | 1684 | * with no CPUs, since presumably they'll have very little allocation pressure |
1685 | * on them otherwise. | 1685 | * on them otherwise. |
1686 | * It returns -1 if no node is found. | 1686 | * It returns -1 if no node is found. |
1687 | */ | 1687 | */ |
1688 | static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) | 1688 | static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) |
1689 | { | 1689 | { |
1690 | int n, val; | 1690 | int n, val; |
1691 | int min_val = INT_MAX; | 1691 | int min_val = INT_MAX; |
1692 | int best_node = -1; | 1692 | int best_node = -1; |
1693 | 1693 | ||
1694 | /* Use the local node if we haven't already */ | 1694 | /* Use the local node if we haven't already */ |
1695 | if (!node_isset(node, *used_node_mask)) { | 1695 | if (!node_isset(node, *used_node_mask)) { |
1696 | node_set(node, *used_node_mask); | 1696 | node_set(node, *used_node_mask); |
1697 | return node; | 1697 | return node; |
1698 | } | 1698 | } |
1699 | 1699 | ||
1700 | for_each_online_node(n) { | 1700 | for_each_online_node(n) { |
1701 | cpumask_t tmp; | 1701 | cpumask_t tmp; |
1702 | 1702 | ||
1703 | /* Don't want a node to appear more than once */ | 1703 | /* Don't want a node to appear more than once */ |
1704 | if (node_isset(n, *used_node_mask)) | 1704 | if (node_isset(n, *used_node_mask)) |
1705 | continue; | 1705 | continue; |
1706 | 1706 | ||
1707 | /* Use the distance array to find the distance */ | 1707 | /* Use the distance array to find the distance */ |
1708 | val = node_distance(node, n); | 1708 | val = node_distance(node, n); |
1709 | 1709 | ||
1710 | /* Penalize nodes under us ("prefer the next node") */ | 1710 | /* Penalize nodes under us ("prefer the next node") */ |
1711 | val += (n < node); | 1711 | val += (n < node); |
1712 | 1712 | ||
1713 | /* Give preference to headless and unused nodes */ | 1713 | /* Give preference to headless and unused nodes */ |
1714 | tmp = node_to_cpumask(n); | 1714 | tmp = node_to_cpumask(n); |
1715 | if (!cpus_empty(tmp)) | 1715 | if (!cpus_empty(tmp)) |
1716 | val += PENALTY_FOR_NODE_WITH_CPUS; | 1716 | val += PENALTY_FOR_NODE_WITH_CPUS; |
1717 | 1717 | ||
1718 | /* Slight preference for less loaded node */ | 1718 | /* Slight preference for less loaded node */ |
1719 | val *= (MAX_NODE_LOAD*MAX_NUMNODES); | 1719 | val *= (MAX_NODE_LOAD*MAX_NUMNODES); |
1720 | val += node_load[n]; | 1720 | val += node_load[n]; |
1721 | 1721 | ||
1722 | if (val < min_val) { | 1722 | if (val < min_val) { |
1723 | min_val = val; | 1723 | min_val = val; |
1724 | best_node = n; | 1724 | best_node = n; |
1725 | } | 1725 | } |
1726 | } | 1726 | } |
1727 | 1727 | ||
1728 | if (best_node >= 0) | 1728 | if (best_node >= 0) |
1729 | node_set(best_node, *used_node_mask); | 1729 | node_set(best_node, *used_node_mask); |
1730 | 1730 | ||
1731 | return best_node; | 1731 | return best_node; |
1732 | } | 1732 | } |
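
A worked example makes the scoring clearer. The stand-alone model below applies the same formula: distance, plus one if the candidate is a lower-numbered node, plus a penalty if it has CPUs, all scaled by MAX_NODE_LOAD * MAX_NUMNODES, plus the node's accumulated load; the lowest score wins. The distance matrix, node count, and penalty value are invented for a small three-node machine.

#include <stdio.h>
#include <limits.h>

#define NR_NODES 3
#define PENALTY_FOR_NODE_WITH_CPUS 1      /* model constant */
#define MAX_NODE_LOAD NR_NODES

/* Invented SLIT-style distance matrix and per-node state. */
static const int distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 40 },
	{ 20, 10, 20 },
	{ 40, 20, 10 },
};
static const int has_cpus[NR_NODES] = { 1, 1, 0 };   /* node 2 is memory-only */
static int node_load[NR_NODES];
static int used[NR_NODES];

/* Mirrors find_next_best_node(): lowest score wins. */
static int next_best_node(int node)
{
	int best = -1, min_val = INT_MAX;

	if (!used[node]) {            /* the local node always comes first */
		used[node] = 1;
		return node;
	}
	for (int n = 0; n < NR_NODES; n++) {
		if (used[n])
			continue;
		int val = distance[node][n] + (n < node);
		if (has_cpus[n])
			val += PENALTY_FOR_NODE_WITH_CPUS;
		val = val * (MAX_NODE_LOAD * NR_NODES) + node_load[n];
		if (val < min_val) {
			min_val = val;
			best = n;
		}
	}
	if (best >= 0)
		used[best] = 1;
	return best;
}

int main(void)
{
	int n;

	printf("fallback order for node 0:");
	while ((n = next_best_node(0)) >= 0)
		printf(" %d", n);
	printf("\n");
	return 0;
}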
1733 | 1733 | ||
1734 | static void __meminit build_zonelists(pg_data_t *pgdat) | 1734 | static void __meminit build_zonelists(pg_data_t *pgdat) |
1735 | { | 1735 | { |
1736 | int j, node, local_node; | 1736 | int j, node, local_node; |
1737 | enum zone_type i; | 1737 | enum zone_type i; |
1738 | int prev_node, load; | 1738 | int prev_node, load; |
1739 | struct zonelist *zonelist; | 1739 | struct zonelist *zonelist; |
1740 | nodemask_t used_mask; | 1740 | nodemask_t used_mask; |
1741 | 1741 | ||
1742 | /* initialize zonelists */ | 1742 | /* initialize zonelists */ |
1743 | for (i = 0; i < MAX_NR_ZONES; i++) { | 1743 | for (i = 0; i < MAX_NR_ZONES; i++) { |
1744 | zonelist = pgdat->node_zonelists + i; | 1744 | zonelist = pgdat->node_zonelists + i; |
1745 | zonelist->zones[0] = NULL; | 1745 | zonelist->zones[0] = NULL; |
1746 | } | 1746 | } |
1747 | 1747 | ||
1748 | /* NUMA-aware ordering of nodes */ | 1748 | /* NUMA-aware ordering of nodes */ |
1749 | local_node = pgdat->node_id; | 1749 | local_node = pgdat->node_id; |
1750 | load = num_online_nodes(); | 1750 | load = num_online_nodes(); |
1751 | prev_node = local_node; | 1751 | prev_node = local_node; |
1752 | nodes_clear(used_mask); | 1752 | nodes_clear(used_mask); |
1753 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { | 1753 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { |
1754 | int distance = node_distance(local_node, node); | 1754 | int distance = node_distance(local_node, node); |
1755 | 1755 | ||
1756 | /* | 1756 | /* |
1757 | * If another node is sufficiently far away then it is better | 1757 | * If another node is sufficiently far away then it is better |
1758 | * to reclaim pages in a zone before going off node. | 1758 | * to reclaim pages in a zone before going off node. |
1759 | */ | 1759 | */ |
1760 | if (distance > RECLAIM_DISTANCE) | 1760 | if (distance > RECLAIM_DISTANCE) |
1761 | zone_reclaim_mode = 1; | 1761 | zone_reclaim_mode = 1; |
1762 | 1762 | ||
1763 | /* | 1763 | /* |
1764 | * We don't want to put pressure on a particular node. | 1764 | * We don't want to put pressure on a particular node. |
1765 | * So we add a penalty to the first node in the same | 1765 | * So we add a penalty to the first node in the same |
1766 | * distance group to make the ordering round-robin. | 1766 | * distance group to make the ordering round-robin. |
1767 | */ | 1767 | */ |
1768 | 1768 | ||
1769 | if (distance != node_distance(local_node, prev_node)) | 1769 | if (distance != node_distance(local_node, prev_node)) |
1770 | node_load[node] += load; | 1770 | node_load[node] += load; |
1771 | prev_node = node; | 1771 | prev_node = node; |
1772 | load--; | 1772 | load--; |
1773 | for (i = 0; i < MAX_NR_ZONES; i++) { | 1773 | for (i = 0; i < MAX_NR_ZONES; i++) { |
1774 | zonelist = pgdat->node_zonelists + i; | 1774 | zonelist = pgdat->node_zonelists + i; |
1775 | for (j = 0; zonelist->zones[j] != NULL; j++); | 1775 | for (j = 0; zonelist->zones[j] != NULL; j++); |
1776 | 1776 | ||
1777 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); | 1777 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); |
1778 | zonelist->zones[j] = NULL; | 1778 | zonelist->zones[j] = NULL; |
1779 | } | 1779 | } |
1780 | } | 1780 | } |
1781 | } | 1781 | } |
1782 | 1782 | ||
1783 | /* Construct the zonelist performance cache - see further mmzone.h */ | 1783 | /* Construct the zonelist performance cache - see further mmzone.h */ |
1784 | static void __meminit build_zonelist_cache(pg_data_t *pgdat) | 1784 | static void __meminit build_zonelist_cache(pg_data_t *pgdat) |
1785 | { | 1785 | { |
1786 | int i; | 1786 | int i; |
1787 | 1787 | ||
1788 | for (i = 0; i < MAX_NR_ZONES; i++) { | 1788 | for (i = 0; i < MAX_NR_ZONES; i++) { |
1789 | struct zonelist *zonelist; | 1789 | struct zonelist *zonelist; |
1790 | struct zonelist_cache *zlc; | 1790 | struct zonelist_cache *zlc; |
1791 | struct zone **z; | 1791 | struct zone **z; |
1792 | 1792 | ||
1793 | zonelist = pgdat->node_zonelists + i; | 1793 | zonelist = pgdat->node_zonelists + i; |
1794 | zonelist->zlcache_ptr = zlc = &zonelist->zlcache; | 1794 | zonelist->zlcache_ptr = zlc = &zonelist->zlcache; |
1795 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | 1795 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); |
1796 | for (z = zonelist->zones; *z; z++) | 1796 | for (z = zonelist->zones; *z; z++) |
1797 | zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); | 1797 | zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); |
1798 | } | 1798 | } |
1799 | } | 1799 | } |
1800 | 1800 | ||
1801 | #else /* CONFIG_NUMA */ | 1801 | #else /* CONFIG_NUMA */ |
1802 | 1802 | ||
1803 | static void __meminit build_zonelists(pg_data_t *pgdat) | 1803 | static void __meminit build_zonelists(pg_data_t *pgdat) |
1804 | { | 1804 | { |
1805 | int node, local_node; | 1805 | int node, local_node; |
1806 | enum zone_type i,j; | 1806 | enum zone_type i,j; |
1807 | 1807 | ||
1808 | local_node = pgdat->node_id; | 1808 | local_node = pgdat->node_id; |
1809 | for (i = 0; i < MAX_NR_ZONES; i++) { | 1809 | for (i = 0; i < MAX_NR_ZONES; i++) { |
1810 | struct zonelist *zonelist; | 1810 | struct zonelist *zonelist; |
1811 | 1811 | ||
1812 | zonelist = pgdat->node_zonelists + i; | 1812 | zonelist = pgdat->node_zonelists + i; |
1813 | 1813 | ||
1814 | j = build_zonelists_node(pgdat, zonelist, 0, i); | 1814 | j = build_zonelists_node(pgdat, zonelist, 0, i); |
1815 | /* | 1815 | /* |
1816 | * Now we build the zonelist so that it contains the zones | 1816 | * Now we build the zonelist so that it contains the zones |
1817 | * of all the other nodes. | 1817 | * of all the other nodes. |
1818 | * We don't want to pressure a particular node, so when | 1818 | * We don't want to pressure a particular node, so when |
1819 | * building the zones for node N, we make sure that the | 1819 | * building the zones for node N, we make sure that the |
1820 | * zones coming right after the local ones are those from | 1820 | * zones coming right after the local ones are those from |
1821 | * node N+1 (modulo N) | 1821 | * node N+1 (modulo N) |
1822 | */ | 1822 | */ |
1823 | for (node = local_node + 1; node < MAX_NUMNODES; node++) { | 1823 | for (node = local_node + 1; node < MAX_NUMNODES; node++) { |
1824 | if (!node_online(node)) | 1824 | if (!node_online(node)) |
1825 | continue; | 1825 | continue; |
1826 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); | 1826 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); |
1827 | } | 1827 | } |
1828 | for (node = 0; node < local_node; node++) { | 1828 | for (node = 0; node < local_node; node++) { |
1829 | if (!node_online(node)) | 1829 | if (!node_online(node)) |
1830 | continue; | 1830 | continue; |
1831 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); | 1831 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); |
1832 | } | 1832 | } |
1833 | 1833 | ||
1834 | zonelist->zones[j] = NULL; | 1834 | zonelist->zones[j] = NULL; |
1835 | } | 1835 | } |
1836 | } | 1836 | } |
1837 | 1837 | ||
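The non-NUMA fallback order produced above is simply "the local node first, then the nodes after it, then the nodes before it". A tiny userspace sketch of that ordering, added for illustration (not part of the patch; MAX_NUMNODES, the local node id and the assumption that all nodes are online are made up):

#include <stdio.h>

#define MAX_NUMNODES 4

int main(void)
{
	int local_node = 2, node;

	/* local node first, then higher-numbered nodes, then lower ones */
	printf("%d ", local_node);
	for (node = local_node + 1; node < MAX_NUMNODES; node++)
		printf("%d ", node);
	for (node = 0; node < local_node; node++)
		printf("%d ", node);
	printf("\n");                   /* prints: 2 3 0 1 */
	return 0;
}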
1838 | /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ | 1838 | /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ |
1839 | static void __meminit build_zonelist_cache(pg_data_t *pgdat) | 1839 | static void __meminit build_zonelist_cache(pg_data_t *pgdat) |
1840 | { | 1840 | { |
1841 | int i; | 1841 | int i; |
1842 | 1842 | ||
1843 | for (i = 0; i < MAX_NR_ZONES; i++) | 1843 | for (i = 0; i < MAX_NR_ZONES; i++) |
1844 | pgdat->node_zonelists[i].zlcache_ptr = NULL; | 1844 | pgdat->node_zonelists[i].zlcache_ptr = NULL; |
1845 | } | 1845 | } |
1846 | 1846 | ||
1847 | #endif /* CONFIG_NUMA */ | 1847 | #endif /* CONFIG_NUMA */ |
1848 | 1848 | ||
1849 | /* The int return value exists only to satisfy stop_machine_run() */ | 1849 | /* The int return value exists only to satisfy stop_machine_run() */ |
1850 | static int __meminit __build_all_zonelists(void *dummy) | 1850 | static int __meminit __build_all_zonelists(void *dummy) |
1851 | { | 1851 | { |
1852 | int nid; | 1852 | int nid; |
1853 | 1853 | ||
1854 | for_each_online_node(nid) { | 1854 | for_each_online_node(nid) { |
1855 | build_zonelists(NODE_DATA(nid)); | 1855 | build_zonelists(NODE_DATA(nid)); |
1856 | build_zonelist_cache(NODE_DATA(nid)); | 1856 | build_zonelist_cache(NODE_DATA(nid)); |
1857 | } | 1857 | } |
1858 | return 0; | 1858 | return 0; |
1859 | } | 1859 | } |
1860 | 1860 | ||
1861 | void __meminit build_all_zonelists(void) | 1861 | void __meminit build_all_zonelists(void) |
1862 | { | 1862 | { |
1863 | if (system_state == SYSTEM_BOOTING) { | 1863 | if (system_state == SYSTEM_BOOTING) { |
1864 | __build_all_zonelists(NULL); | 1864 | __build_all_zonelists(NULL); |
1865 | cpuset_init_current_mems_allowed(); | 1865 | cpuset_init_current_mems_allowed(); |
1866 | } else { | 1866 | } else { |
1867 | /* we have to stop all cpus to guarantee there is no user | 1867 | /* we have to stop all cpus to guarantee there is no user |
1868 | of zonelist */ | 1868 | of zonelist */ |
1869 | stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); | 1869 | stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); |
1870 | /* cpuset refresh routine should be here */ | 1870 | /* cpuset refresh routine should be here */ |
1871 | } | 1871 | } |
1872 | vm_total_pages = nr_free_pagecache_pages(); | 1872 | vm_total_pages = nr_free_pagecache_pages(); |
1873 | printk("Built %i zonelists. Total pages: %ld\n", | 1873 | printk("Built %i zonelists. Total pages: %ld\n", |
1874 | num_online_nodes(), vm_total_pages); | 1874 | num_online_nodes(), vm_total_pages); |
1875 | } | 1875 | } |
1876 | 1876 | ||
1877 | /* | 1877 | /* |
1878 | * Helper functions to size the waitqueue hash table. | 1878 | * Helper functions to size the waitqueue hash table. |
1879 | * Essentially these want to choose hash table sizes sufficiently | 1879 | * Essentially these want to choose hash table sizes sufficiently |
1880 | * large so that collisions trying to wait on pages are rare. | 1880 | * large so that collisions trying to wait on pages are rare. |
1881 | * But in fact, the number of active page waitqueues on typical | 1881 | * But in fact, the number of active page waitqueues on typical |
1882 | * systems is ridiculously low, less than 200. So this is even | 1882 | * systems is ridiculously low, less than 200. So this is even |
1883 | * conservative, even though it seems large. | 1883 | * conservative, even though it seems large. |
1884 | * | 1884 | * |
1885 | * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to | 1885 | * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to |
1886 | * waitqueues, i.e. the size of the waitq table given the number of pages. | 1886 | * waitqueues, i.e. the size of the waitq table given the number of pages. |
1887 | */ | 1887 | */ |
1888 | #define PAGES_PER_WAITQUEUE 256 | 1888 | #define PAGES_PER_WAITQUEUE 256 |
1889 | 1889 | ||
1890 | #ifndef CONFIG_MEMORY_HOTPLUG | 1890 | #ifndef CONFIG_MEMORY_HOTPLUG |
1891 | static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) | 1891 | static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) |
1892 | { | 1892 | { |
1893 | unsigned long size = 1; | 1893 | unsigned long size = 1; |
1894 | 1894 | ||
1895 | pages /= PAGES_PER_WAITQUEUE; | 1895 | pages /= PAGES_PER_WAITQUEUE; |
1896 | 1896 | ||
1897 | while (size < pages) | 1897 | while (size < pages) |
1898 | size <<= 1; | 1898 | size <<= 1; |
1899 | 1899 | ||
1900 | /* | 1900 | /* |
1901 | * Once we have dozens or even hundreds of threads sleeping | 1901 | * Once we have dozens or even hundreds of threads sleeping |
1902 | * on IO we've got bigger problems than wait queue collision. | 1902 | * on IO we've got bigger problems than wait queue collision. |
1903 | * Limit the size of the wait table to a reasonable size. | 1903 | * Limit the size of the wait table to a reasonable size. |
1904 | */ | 1904 | */ |
1905 | size = min(size, 4096UL); | 1905 | size = min(size, 4096UL); |
1906 | 1906 | ||
1907 | return max(size, 4UL); | 1907 | return max(size, 4UL); |
1908 | } | 1908 | } |
1909 | #else | 1909 | #else |
1910 | /* | 1910 | /* |
1911 | * A zone's size might be changed by hot-add, so it is not possible to determine | 1911 | * A zone's size might be changed by hot-add, so it is not possible to determine |
1912 | * a suitable size for its wait_table. So we use the maximum size now. | 1912 | * a suitable size for its wait_table. So we use the maximum size now. |
1913 | * | 1913 | * |
1914 | * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: | 1914 | * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: |
1915 | * | 1915 | * |
1916 | * i386 (preemption config) : 4096 x 16 = 64Kbyte. | 1916 | * i386 (preemption config) : 4096 x 16 = 64Kbyte. |
1917 | * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. | 1917 | * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. |
1918 | * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. | 1918 | * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. |
1919 | * | 1919 | * |
1920 | * The maximum entries are prepared when a zone's memory is (512K + 256) pages | 1920 | * The maximum entries are prepared when a zone's memory is (512K + 256) pages |
1921 | * or more, using the calculation above. It equals: | 1921 | * or more, using the calculation above. It equals: |
1922 | * | 1922 | * |
1923 | * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. | 1923 | * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. |
1924 | * ia64(16K page size) : = ( 8G + 4M)byte. | 1924 | * ia64(16K page size) : = ( 8G + 4M)byte. |
1925 | * powerpc (64K page size) : = (32G +16M)byte. | 1925 | * powerpc (64K page size) : = (32G +16M)byte. |
1926 | */ | 1926 | */ |
1927 | static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) | 1927 | static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) |
1928 | { | 1928 | { |
1929 | return 4096UL; | 1929 | return 4096UL; |
1930 | } | 1930 | } |
1931 | #endif | 1931 | #endif |
1932 | 1932 | ||
1933 | /* | 1933 | /* |
1934 | * This is an integer logarithm so that shifts can be used later | 1934 | * This is an integer logarithm so that shifts can be used later |
1935 | * to extract the more random high bits from the multiplicative | 1935 | * to extract the more random high bits from the multiplicative |
1936 | * hash function before the remainder is taken. | 1936 | * hash function before the remainder is taken. |
1937 | */ | 1937 | */ |
1938 | static inline unsigned long wait_table_bits(unsigned long size) | 1938 | static inline unsigned long wait_table_bits(unsigned long size) |
1939 | { | 1939 | { |
1940 | return ffz(~size); | 1940 | return ffz(~size); |
1941 | } | 1941 | } |
1942 | 1942 | ||
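As a concrete illustration of the two helpers above (an editor-added userspace sketch, not part of the patch): for a hypothetical 1GB zone with 4K pages, wait_table_hash_nr_entries() rounds 262144/256 up to 1024 entries, and wait_table_bits() -- ffz(~size) -- is just log2 of that power of two.

#include <stdio.h>

#define PAGES_PER_WAITQUEUE 256

static unsigned long nr_entries(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;
	while (size < pages)
		size <<= 1;             /* round up to a power of two */
	if (size > 4096UL)
		size = 4096UL;          /* upper clamp, as in the kernel code */
	return size < 4UL ? 4UL : size; /* lower clamp */
}

/* stand-in for wait_table_bits(): log2 of a power-of-two size */
static unsigned long bits(unsigned long size)
{
	unsigned long b = 0;

	while (size > 1) {
		size >>= 1;
		b++;
	}
	return b;
}

int main(void)
{
	unsigned long pages = 262144;   /* assumed: 1GB zone, 4K pages */
	unsigned long size = nr_entries(pages);

	printf("%lu pages -> %lu entries, %lu bits\n", pages, size, bits(size));
	/* prints: 262144 pages -> 1024 entries, 10 bits */
	return 0;
}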
1943 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) | 1943 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) |
1944 | 1944 | ||
1945 | /* | 1945 | /* |
1946 | * Initially all pages are reserved - free ones are freed | 1946 | * Initially all pages are reserved - free ones are freed |
1947 | * up by free_all_bootmem() once the early boot process is | 1947 | * up by free_all_bootmem() once the early boot process is |
1948 | * done. Non-atomic initialization, single-pass. | 1948 | * done. Non-atomic initialization, single-pass. |
1949 | */ | 1949 | */ |
1950 | void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | 1950 | void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, |
1951 | unsigned long start_pfn, enum memmap_context context) | 1951 | unsigned long start_pfn, enum memmap_context context) |
1952 | { | 1952 | { |
1953 | struct page *page; | 1953 | struct page *page; |
1954 | unsigned long end_pfn = start_pfn + size; | 1954 | unsigned long end_pfn = start_pfn + size; |
1955 | unsigned long pfn; | 1955 | unsigned long pfn; |
1956 | 1956 | ||
1957 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | 1957 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
1958 | /* | 1958 | /* |
1959 | * There can be holes in boot-time mem_map[]s | 1959 | * There can be holes in boot-time mem_map[]s |
1960 | * handed to this function. They do not | 1960 | * handed to this function. They do not |
1961 | * exist on hotplugged memory. | 1961 | * exist on hotplugged memory. |
1962 | */ | 1962 | */ |
1963 | if (context == MEMMAP_EARLY) { | 1963 | if (context == MEMMAP_EARLY) { |
1964 | if (!early_pfn_valid(pfn)) | 1964 | if (!early_pfn_valid(pfn)) |
1965 | continue; | 1965 | continue; |
1966 | if (!early_pfn_in_nid(pfn, nid)) | 1966 | if (!early_pfn_in_nid(pfn, nid)) |
1967 | continue; | 1967 | continue; |
1968 | } | 1968 | } |
1969 | page = pfn_to_page(pfn); | 1969 | page = pfn_to_page(pfn); |
1970 | set_page_links(page, zone, nid, pfn); | 1970 | set_page_links(page, zone, nid, pfn); |
1971 | init_page_count(page); | 1971 | init_page_count(page); |
1972 | reset_page_mapcount(page); | 1972 | reset_page_mapcount(page); |
1973 | SetPageReserved(page); | 1973 | SetPageReserved(page); |
1974 | INIT_LIST_HEAD(&page->lru); | 1974 | INIT_LIST_HEAD(&page->lru); |
1975 | #ifdef WANT_PAGE_VIRTUAL | 1975 | #ifdef WANT_PAGE_VIRTUAL |
1976 | /* The shift won't overflow because ZONE_NORMAL is below 4G. */ | 1976 | /* The shift won't overflow because ZONE_NORMAL is below 4G. */ |
1977 | if (!is_highmem_idx(zone)) | 1977 | if (!is_highmem_idx(zone)) |
1978 | set_page_address(page, __va(pfn << PAGE_SHIFT)); | 1978 | set_page_address(page, __va(pfn << PAGE_SHIFT)); |
1979 | #endif | 1979 | #endif |
1980 | } | 1980 | } |
1981 | } | 1981 | } |
1982 | 1982 | ||
1983 | void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, | 1983 | void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, |
1984 | unsigned long size) | 1984 | unsigned long size) |
1985 | { | 1985 | { |
1986 | int order; | 1986 | int order; |
1987 | for (order = 0; order < MAX_ORDER ; order++) { | 1987 | for (order = 0; order < MAX_ORDER ; order++) { |
1988 | INIT_LIST_HEAD(&zone->free_area[order].free_list); | 1988 | INIT_LIST_HEAD(&zone->free_area[order].free_list); |
1989 | zone->free_area[order].nr_free = 0; | 1989 | zone->free_area[order].nr_free = 0; |
1990 | } | 1990 | } |
1991 | } | 1991 | } |
1992 | 1992 | ||
1993 | #ifndef __HAVE_ARCH_MEMMAP_INIT | 1993 | #ifndef __HAVE_ARCH_MEMMAP_INIT |
1994 | #define memmap_init(size, nid, zone, start_pfn) \ | 1994 | #define memmap_init(size, nid, zone, start_pfn) \ |
1995 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) | 1995 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) |
1996 | #endif | 1996 | #endif |
1997 | 1997 | ||
1998 | static int __cpuinit zone_batchsize(struct zone *zone) | 1998 | static int __cpuinit zone_batchsize(struct zone *zone) |
1999 | { | 1999 | { |
2000 | int batch; | 2000 | int batch; |
2001 | 2001 | ||
2002 | /* | 2002 | /* |
2003 | * The per-cpu-pages pools are set to around 1000th of the | 2003 | * The per-cpu-pages pools are set to around 1000th of the |
2004 | * size of the zone. But no more than 1/2 of a meg. | 2004 | * size of the zone. But no more than 1/2 of a meg. |
2005 | * | 2005 | * |
2006 | * OK, so we don't know how big the cache is. So guess. | 2006 | * OK, so we don't know how big the cache is. So guess. |
2007 | */ | 2007 | */ |
2008 | batch = zone->present_pages / 1024; | 2008 | batch = zone->present_pages / 1024; |
2009 | if (batch * PAGE_SIZE > 512 * 1024) | 2009 | if (batch * PAGE_SIZE > 512 * 1024) |
2010 | batch = (512 * 1024) / PAGE_SIZE; | 2010 | batch = (512 * 1024) / PAGE_SIZE; |
2011 | batch /= 4; /* We effectively *= 4 below */ | 2011 | batch /= 4; /* We effectively *= 4 below */ |
2012 | if (batch < 1) | 2012 | if (batch < 1) |
2013 | batch = 1; | 2013 | batch = 1; |
2014 | 2014 | ||
2015 | /* | 2015 | /* |
2016 | * Clamp the batch to a 2^n - 1 value. Having a power | 2016 | * Clamp the batch to a 2^n - 1 value. Having a power |
2017 | * of 2 value was found to be more likely to have | 2017 | * of 2 value was found to be more likely to have |
2018 | * suboptimal cache aliasing properties in some cases. | 2018 | * suboptimal cache aliasing properties in some cases. |
2019 | * | 2019 | * |
2020 | * For example if 2 tasks are alternately allocating | 2020 | * For example if 2 tasks are alternately allocating |
2021 | * batches of pages, one task can end up with a lot | 2021 | * batches of pages, one task can end up with a lot |
2022 | * of pages of one half of the possible page colors | 2022 | * of pages of one half of the possible page colors |
2023 | * and the other with pages of the other colors. | 2023 | * and the other with pages of the other colors. |
2024 | */ | 2024 | */ |
2025 | batch = (1 << (fls(batch + batch/2)-1)) - 1; | 2025 | batch = (1 << (fls(batch + batch/2)-1)) - 1; |
2026 | 2026 | ||
2027 | return batch; | 2027 | return batch; |
2028 | } | 2028 | } |
2029 | 2029 | ||
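To make the batch arithmetic above concrete, here is a userspace sketch added for illustration (not part of the patch; the zone size and PAGE_SIZE are example assumptions) reproducing the 1/1000th sizing, the 512KB cap, the divide by 4 and the 2^n - 1 clamp:

#include <stdio.h>

static int fls_long(unsigned long x)    /* simple stand-in for fls() */
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	unsigned long present_pages = 262144;   /* assumed: 1GB zone, 4K pages */
	unsigned long page_size = 4096;
	int batch = present_pages / 1024;       /* ~1/1000th of the zone       */

	if (batch * page_size > 512 * 1024)     /* cap at half a megabyte      */
		batch = (512 * 1024) / page_size;
	batch /= 4;                             /* effectively *= 4 later      */
	if (batch < 1)
		batch = 1;
	batch = (1 << (fls_long(batch + batch / 2) - 1)) - 1;  /* 2^n - 1 clamp */

	printf("batch = %d\n", batch);          /* prints: batch = 31          */
	return 0;
}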
2030 | inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | 2030 | inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) |
2031 | { | 2031 | { |
2032 | struct per_cpu_pages *pcp; | 2032 | struct per_cpu_pages *pcp; |
2033 | 2033 | ||
2034 | memset(p, 0, sizeof(*p)); | 2034 | memset(p, 0, sizeof(*p)); |
2035 | 2035 | ||
2036 | pcp = &p->pcp[0]; /* hot */ | 2036 | pcp = &p->pcp[0]; /* hot */ |
2037 | pcp->count = 0; | 2037 | pcp->count = 0; |
2038 | pcp->high = 6 * batch; | 2038 | pcp->high = 6 * batch; |
2039 | pcp->batch = max(1UL, 1 * batch); | 2039 | pcp->batch = max(1UL, 1 * batch); |
2040 | INIT_LIST_HEAD(&pcp->list); | 2040 | INIT_LIST_HEAD(&pcp->list); |
2041 | 2041 | ||
2042 | pcp = &p->pcp[1]; /* cold*/ | 2042 | pcp = &p->pcp[1]; /* cold*/ |
2043 | pcp->count = 0; | 2043 | pcp->count = 0; |
2044 | pcp->high = 2 * batch; | 2044 | pcp->high = 2 * batch; |
2045 | pcp->batch = max(1UL, batch/2); | 2045 | pcp->batch = max(1UL, batch/2); |
2046 | INIT_LIST_HEAD(&pcp->list); | 2046 | INIT_LIST_HEAD(&pcp->list); |
2047 | } | 2047 | } |
2048 | 2048 | ||
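Continuing the example batch of 31 from the sketch above (again an editor-added illustration, not part of the patch), setup_pageset() would leave the hot list with high = 186, batch = 31 and the cold list with high = 62, batch = 15:

#include <stdio.h>

int main(void)
{
	unsigned long batch = 31;                               /* assumed */
	unsigned long hot_high = 6 * batch;                     /* 186 */
	unsigned long hot_batch = batch > 1 ? batch : 1;        /* 31  */
	unsigned long cold_high = 2 * batch;                    /* 62  */
	unsigned long cold_batch = batch / 2 > 1 ? batch / 2 : 1; /* 15 */

	printf("hot:  high=%lu batch=%lu\n", hot_high, hot_batch);
	printf("cold: high=%lu batch=%lu\n", cold_high, cold_batch);
	return 0;
}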
2049 | /* | 2049 | /* |
2050 | * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist | 2050 | * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist |
2051 | * to the value high for the pageset p. | 2051 | * to the value high for the pageset p. |
2052 | */ | 2052 | */ |
2053 | 2053 | ||
2054 | static void setup_pagelist_highmark(struct per_cpu_pageset *p, | 2054 | static void setup_pagelist_highmark(struct per_cpu_pageset *p, |
2055 | unsigned long high) | 2055 | unsigned long high) |
2056 | { | 2056 | { |
2057 | struct per_cpu_pages *pcp; | 2057 | struct per_cpu_pages *pcp; |
2058 | 2058 | ||
2059 | pcp = &p->pcp[0]; /* hot list */ | 2059 | pcp = &p->pcp[0]; /* hot list */ |
2060 | pcp->high = high; | 2060 | pcp->high = high; |
2061 | pcp->batch = max(1UL, high/4); | 2061 | pcp->batch = max(1UL, high/4); |
2062 | if ((high/4) > (PAGE_SHIFT * 8)) | 2062 | if ((high/4) > (PAGE_SHIFT * 8)) |
2063 | pcp->batch = PAGE_SHIFT * 8; | 2063 | pcp->batch = PAGE_SHIFT * 8; |
2064 | } | 2064 | } |
2065 | 2065 | ||
2066 | 2066 | ||
2067 | #ifdef CONFIG_NUMA | 2067 | #ifdef CONFIG_NUMA |
2068 | /* | 2068 | /* |
2069 | * Boot pageset table. One per cpu which is going to be used for all | 2069 | * Boot pageset table. One per cpu which is going to be used for all |
2070 | * zones and all nodes. The parameters will be set in such a way | 2070 | * zones and all nodes. The parameters will be set in such a way |
2071 | * that an item put on a list will immediately be handed over to | 2071 | * that an item put on a list will immediately be handed over to |
2072 | * the buddy list. This is safe since pageset manipulation is done | 2072 | * the buddy list. This is safe since pageset manipulation is done |
2073 | * with interrupts disabled. | 2073 | * with interrupts disabled. |
2074 | * | 2074 | * |
2075 | * Some NUMA counter updates may also be caught by the boot pagesets. | 2075 | * Some NUMA counter updates may also be caught by the boot pagesets. |
2076 | * | 2076 | * |
2077 | * The boot_pagesets must be kept even after bootup is complete for | 2077 | * The boot_pagesets must be kept even after bootup is complete for |
2078 | * unused processors and/or zones. They do play a role for bootstrapping | 2078 | * unused processors and/or zones. They do play a role for bootstrapping |
2079 | * hotplugged processors. | 2079 | * hotplugged processors. |
2080 | * | 2080 | * |
2081 | * zoneinfo_show() and maybe other functions do | 2081 | * zoneinfo_show() and maybe other functions do |
2082 | * not check if the processor is online before following the pageset pointer. | 2082 | * not check if the processor is online before following the pageset pointer. |
2083 | * Other parts of the kernel may not check if the zone is available. | 2083 | * Other parts of the kernel may not check if the zone is available. |
2084 | */ | 2084 | */ |
2085 | static struct per_cpu_pageset boot_pageset[NR_CPUS]; | 2085 | static struct per_cpu_pageset boot_pageset[NR_CPUS]; |
2086 | 2086 | ||
2087 | /* | 2087 | /* |
2088 | * Dynamically allocate memory for the | 2088 | * Dynamically allocate memory for the |
2089 | * per cpu pageset array in struct zone. | 2089 | * per cpu pageset array in struct zone. |
2090 | */ | 2090 | */ |
2091 | static int __cpuinit process_zones(int cpu) | 2091 | static int __cpuinit process_zones(int cpu) |
2092 | { | 2092 | { |
2093 | struct zone *zone, *dzone; | 2093 | struct zone *zone, *dzone; |
2094 | 2094 | ||
2095 | for_each_zone(zone) { | 2095 | for_each_zone(zone) { |
2096 | 2096 | ||
2097 | if (!populated_zone(zone)) | 2097 | if (!populated_zone(zone)) |
2098 | continue; | 2098 | continue; |
2099 | 2099 | ||
2100 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), | 2100 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), |
2101 | GFP_KERNEL, cpu_to_node(cpu)); | 2101 | GFP_KERNEL, cpu_to_node(cpu)); |
2102 | if (!zone_pcp(zone, cpu)) | 2102 | if (!zone_pcp(zone, cpu)) |
2103 | goto bad; | 2103 | goto bad; |
2104 | 2104 | ||
2105 | setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); | 2105 | setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); |
2106 | 2106 | ||
2107 | if (percpu_pagelist_fraction) | 2107 | if (percpu_pagelist_fraction) |
2108 | setup_pagelist_highmark(zone_pcp(zone, cpu), | 2108 | setup_pagelist_highmark(zone_pcp(zone, cpu), |
2109 | (zone->present_pages / percpu_pagelist_fraction)); | 2109 | (zone->present_pages / percpu_pagelist_fraction)); |
2110 | } | 2110 | } |
2111 | 2111 | ||
2112 | return 0; | 2112 | return 0; |
2113 | bad: | 2113 | bad: |
2114 | for_each_zone(dzone) { | 2114 | for_each_zone(dzone) { |
2115 | if (dzone == zone) | 2115 | if (dzone == zone) |
2116 | break; | 2116 | break; |
2117 | kfree(zone_pcp(dzone, cpu)); | 2117 | kfree(zone_pcp(dzone, cpu)); |
2118 | zone_pcp(dzone, cpu) = NULL; | 2118 | zone_pcp(dzone, cpu) = NULL; |
2119 | } | 2119 | } |
2120 | return -ENOMEM; | 2120 | return -ENOMEM; |
2121 | } | 2121 | } |
2122 | 2122 | ||
2123 | static inline void free_zone_pagesets(int cpu) | 2123 | static inline void free_zone_pagesets(int cpu) |
2124 | { | 2124 | { |
2125 | struct zone *zone; | 2125 | struct zone *zone; |
2126 | 2126 | ||
2127 | for_each_zone(zone) { | 2127 | for_each_zone(zone) { |
2128 | struct per_cpu_pageset *pset = zone_pcp(zone, cpu); | 2128 | struct per_cpu_pageset *pset = zone_pcp(zone, cpu); |
2129 | 2129 | ||
2130 | /* Free per_cpu_pageset if it is slab allocated */ | 2130 | /* Free per_cpu_pageset if it is slab allocated */ |
2131 | if (pset != &boot_pageset[cpu]) | 2131 | if (pset != &boot_pageset[cpu]) |
2132 | kfree(pset); | 2132 | kfree(pset); |
2133 | zone_pcp(zone, cpu) = NULL; | 2133 | zone_pcp(zone, cpu) = NULL; |
2134 | } | 2134 | } |
2135 | } | 2135 | } |
2136 | 2136 | ||
2137 | static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, | 2137 | static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, |
2138 | unsigned long action, | 2138 | unsigned long action, |
2139 | void *hcpu) | 2139 | void *hcpu) |
2140 | { | 2140 | { |
2141 | int cpu = (long)hcpu; | 2141 | int cpu = (long)hcpu; |
2142 | int ret = NOTIFY_OK; | 2142 | int ret = NOTIFY_OK; |
2143 | 2143 | ||
2144 | switch (action) { | 2144 | switch (action) { |
2145 | case CPU_UP_PREPARE: | 2145 | case CPU_UP_PREPARE: |
2146 | if (process_zones(cpu)) | 2146 | if (process_zones(cpu)) |
2147 | ret = NOTIFY_BAD; | 2147 | ret = NOTIFY_BAD; |
2148 | break; | 2148 | break; |
2149 | case CPU_UP_CANCELED: | 2149 | case CPU_UP_CANCELED: |
2150 | case CPU_DEAD: | 2150 | case CPU_DEAD: |
2151 | free_zone_pagesets(cpu); | 2151 | free_zone_pagesets(cpu); |
2152 | break; | 2152 | break; |
2153 | default: | 2153 | default: |
2154 | break; | 2154 | break; |
2155 | } | 2155 | } |
2156 | return ret; | 2156 | return ret; |
2157 | } | 2157 | } |
2158 | 2158 | ||
2159 | static struct notifier_block __cpuinitdata pageset_notifier = | 2159 | static struct notifier_block __cpuinitdata pageset_notifier = |
2160 | { &pageset_cpuup_callback, NULL, 0 }; | 2160 | { &pageset_cpuup_callback, NULL, 0 }; |
2161 | 2161 | ||
2162 | void __init setup_per_cpu_pageset(void) | 2162 | void __init setup_per_cpu_pageset(void) |
2163 | { | 2163 | { |
2164 | int err; | 2164 | int err; |
2165 | 2165 | ||
2166 | /* Initialize per_cpu_pageset for cpu 0. | 2166 | /* Initialize per_cpu_pageset for cpu 0. |
2167 | * A cpuup callback will do this for every cpu | 2167 | * A cpuup callback will do this for every cpu |
2168 | * as it comes online | 2168 | * as it comes online |
2169 | */ | 2169 | */ |
2170 | err = process_zones(smp_processor_id()); | 2170 | err = process_zones(smp_processor_id()); |
2171 | BUG_ON(err); | 2171 | BUG_ON(err); |
2172 | register_cpu_notifier(&pageset_notifier); | 2172 | register_cpu_notifier(&pageset_notifier); |
2173 | } | 2173 | } |
2174 | 2174 | ||
2175 | #endif | 2175 | #endif |
2176 | 2176 | ||
2177 | static __meminit | 2177 | static __meminit |
2178 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | 2178 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) |
2179 | { | 2179 | { |
2180 | int i; | 2180 | int i; |
2181 | struct pglist_data *pgdat = zone->zone_pgdat; | 2181 | struct pglist_data *pgdat = zone->zone_pgdat; |
2182 | size_t alloc_size; | 2182 | size_t alloc_size; |
2183 | 2183 | ||
2184 | /* | 2184 | /* |
2185 | * The per-page waitqueue mechanism uses hashed waitqueues | 2185 | * The per-page waitqueue mechanism uses hashed waitqueues |
2186 | * per zone. | 2186 | * per zone. |
2187 | */ | 2187 | */ |
2188 | zone->wait_table_hash_nr_entries = | 2188 | zone->wait_table_hash_nr_entries = |
2189 | wait_table_hash_nr_entries(zone_size_pages); | 2189 | wait_table_hash_nr_entries(zone_size_pages); |
2190 | zone->wait_table_bits = | 2190 | zone->wait_table_bits = |
2191 | wait_table_bits(zone->wait_table_hash_nr_entries); | 2191 | wait_table_bits(zone->wait_table_hash_nr_entries); |
2192 | alloc_size = zone->wait_table_hash_nr_entries | 2192 | alloc_size = zone->wait_table_hash_nr_entries |
2193 | * sizeof(wait_queue_head_t); | 2193 | * sizeof(wait_queue_head_t); |
2194 | 2194 | ||
2195 | if (system_state == SYSTEM_BOOTING) { | 2195 | if (system_state == SYSTEM_BOOTING) { |
2196 | zone->wait_table = (wait_queue_head_t *) | 2196 | zone->wait_table = (wait_queue_head_t *) |
2197 | alloc_bootmem_node(pgdat, alloc_size); | 2197 | alloc_bootmem_node(pgdat, alloc_size); |
2198 | } else { | 2198 | } else { |
2199 | /* | 2199 | /* |
2200 | * This case means that a zone whose size was 0 gets new memory | 2200 | * This case means that a zone whose size was 0 gets new memory |
2201 | * via memory hot-add. | 2201 | * via memory hot-add. |
2202 | * But it may be the case that a new node was hot-added. In | 2202 | * But it may be the case that a new node was hot-added. In |
2203 | * this case vmalloc() will not be able to use this new node's | 2203 | * this case vmalloc() will not be able to use this new node's |
2204 | * memory - this wait_table must be initialized to use this new | 2204 | * memory - this wait_table must be initialized to use this new |
2205 | * node itself as well. | 2205 | * node itself as well. |
2206 | * To use this new node's memory, further consideration will be | 2206 | * To use this new node's memory, further consideration will be |
2207 | * necessary. | 2207 | * necessary. |
2208 | */ | 2208 | */ |
2209 | zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size); | 2209 | zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size); |
2210 | } | 2210 | } |
2211 | if (!zone->wait_table) | 2211 | if (!zone->wait_table) |
2212 | return -ENOMEM; | 2212 | return -ENOMEM; |
2213 | 2213 | ||
2214 | for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) | 2214 | for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) |
2215 | init_waitqueue_head(zone->wait_table + i); | 2215 | init_waitqueue_head(zone->wait_table + i); |
2216 | 2216 | ||
2217 | return 0; | 2217 | return 0; |
2218 | } | 2218 | } |
2219 | 2219 | ||
2220 | static __meminit void zone_pcp_init(struct zone *zone) | 2220 | static __meminit void zone_pcp_init(struct zone *zone) |
2221 | { | 2221 | { |
2222 | int cpu; | 2222 | int cpu; |
2223 | unsigned long batch = zone_batchsize(zone); | 2223 | unsigned long batch = zone_batchsize(zone); |
2224 | 2224 | ||
2225 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | 2225 | for (cpu = 0; cpu < NR_CPUS; cpu++) { |
2226 | #ifdef CONFIG_NUMA | 2226 | #ifdef CONFIG_NUMA |
2227 | /* Early boot. Slab allocator not functional yet */ | 2227 | /* Early boot. Slab allocator not functional yet */ |
2228 | zone_pcp(zone, cpu) = &boot_pageset[cpu]; | 2228 | zone_pcp(zone, cpu) = &boot_pageset[cpu]; |
2229 | setup_pageset(&boot_pageset[cpu],0); | 2229 | setup_pageset(&boot_pageset[cpu],0); |
2230 | #else | 2230 | #else |
2231 | setup_pageset(zone_pcp(zone,cpu), batch); | 2231 | setup_pageset(zone_pcp(zone,cpu), batch); |
2232 | #endif | 2232 | #endif |
2233 | } | 2233 | } |
2234 | if (zone->present_pages) | 2234 | if (zone->present_pages) |
2235 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", | 2235 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", |
2236 | zone->name, zone->present_pages, batch); | 2236 | zone->name, zone->present_pages, batch); |
2237 | } | 2237 | } |
2238 | 2238 | ||
2239 | __meminit int init_currently_empty_zone(struct zone *zone, | 2239 | __meminit int init_currently_empty_zone(struct zone *zone, |
2240 | unsigned long zone_start_pfn, | 2240 | unsigned long zone_start_pfn, |
2241 | unsigned long size, | 2241 | unsigned long size, |
2242 | enum memmap_context context) | 2242 | enum memmap_context context) |
2243 | { | 2243 | { |
2244 | struct pglist_data *pgdat = zone->zone_pgdat; | 2244 | struct pglist_data *pgdat = zone->zone_pgdat; |
2245 | int ret; | 2245 | int ret; |
2246 | ret = zone_wait_table_init(zone, size); | 2246 | ret = zone_wait_table_init(zone, size); |
2247 | if (ret) | 2247 | if (ret) |
2248 | return ret; | 2248 | return ret; |
2249 | pgdat->nr_zones = zone_idx(zone) + 1; | 2249 | pgdat->nr_zones = zone_idx(zone) + 1; |
2250 | 2250 | ||
2251 | zone->zone_start_pfn = zone_start_pfn; | 2251 | zone->zone_start_pfn = zone_start_pfn; |
2252 | 2252 | ||
2253 | memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); | 2253 | memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); |
2254 | 2254 | ||
2255 | zone_init_free_lists(pgdat, zone, zone->spanned_pages); | 2255 | zone_init_free_lists(pgdat, zone, zone->spanned_pages); |
2256 | 2256 | ||
2257 | return 0; | 2257 | return 0; |
2258 | } | 2258 | } |
2259 | 2259 | ||
2260 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | 2260 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP |
2261 | /* | 2261 | /* |
2262 | * Basic iterator support. Return the first range of PFNs for a node | 2262 | * Basic iterator support. Return the first range of PFNs for a node |
2263 | * Note: nid == MAX_NUMNODES returns first region regardless of node | 2263 | * Note: nid == MAX_NUMNODES returns first region regardless of node |
2264 | */ | 2264 | */ |
2265 | static int __init first_active_region_index_in_nid(int nid) | 2265 | static int __init first_active_region_index_in_nid(int nid) |
2266 | { | 2266 | { |
2267 | int i; | 2267 | int i; |
2268 | 2268 | ||
2269 | for (i = 0; i < nr_nodemap_entries; i++) | 2269 | for (i = 0; i < nr_nodemap_entries; i++) |
2270 | if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) | 2270 | if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) |
2271 | return i; | 2271 | return i; |
2272 | 2272 | ||
2273 | return -1; | 2273 | return -1; |
2274 | } | 2274 | } |
2275 | 2275 | ||
2276 | /* | 2276 | /* |
2277 | * Basic iterator support. Return the next active range of PFNs for a node | 2277 | * Basic iterator support. Return the next active range of PFNs for a node |
2278 | * Note: nid == MAX_NUMNODES returns next region regardless of node | 2278 | * Note: nid == MAX_NUMNODES returns next region regardless of node |
2279 | */ | 2279 | */ |
2280 | static int __init next_active_region_index_in_nid(int index, int nid) | 2280 | static int __init next_active_region_index_in_nid(int index, int nid) |
2281 | { | 2281 | { |
2282 | for (index = index + 1; index < nr_nodemap_entries; index++) | 2282 | for (index = index + 1; index < nr_nodemap_entries; index++) |
2283 | if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) | 2283 | if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) |
2284 | return index; | 2284 | return index; |
2285 | 2285 | ||
2286 | return -1; | 2286 | return -1; |
2287 | } | 2287 | } |
2288 | 2288 | ||
2289 | #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID | 2289 | #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID |
2290 | /* | 2290 | /* |
2291 | * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. | 2291 | * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. |
2292 | * Architectures may implement their own version but if add_active_range() | 2292 | * Architectures may implement their own version but if add_active_range() |
2293 | * was used and there are no special requirements, this is a convenient | 2293 | * was used and there are no special requirements, this is a convenient |
2294 | * alternative | 2294 | * alternative |
2295 | */ | 2295 | */ |
2296 | int __init early_pfn_to_nid(unsigned long pfn) | 2296 | int __init early_pfn_to_nid(unsigned long pfn) |
2297 | { | 2297 | { |
2298 | int i; | 2298 | int i; |
2299 | 2299 | ||
2300 | for (i = 0; i < nr_nodemap_entries; i++) { | 2300 | for (i = 0; i < nr_nodemap_entries; i++) { |
2301 | unsigned long start_pfn = early_node_map[i].start_pfn; | 2301 | unsigned long start_pfn = early_node_map[i].start_pfn; |
2302 | unsigned long end_pfn = early_node_map[i].end_pfn; | 2302 | unsigned long end_pfn = early_node_map[i].end_pfn; |
2303 | 2303 | ||
2304 | if (start_pfn <= pfn && pfn < end_pfn) | 2304 | if (start_pfn <= pfn && pfn < end_pfn) |
2305 | return early_node_map[i].nid; | 2305 | return early_node_map[i].nid; |
2306 | } | 2306 | } |
2307 | 2307 | ||
2308 | return 0; | 2308 | return 0; |
2309 | } | 2309 | } |
2310 | #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ | 2310 | #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ |
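The fallback early_pfn_to_nid() above is a plain linear search of early_node_map[]. A userspace sketch of that lookup, added for illustration (not part of the patch; the map contents are made-up example ranges):

#include <stdio.h>

struct range { unsigned long start_pfn, end_pfn; int nid; };

static struct range early_node_map[] = {
	{      0, 262144, 0 },          /* node 0: pfns [0, 262144)      */
	{ 262144, 524288, 1 },          /* node 1: pfns [262144, 524288) */
};

static int lookup_nid(unsigned long pfn)
{
	unsigned int i;

	for (i = 0; i < sizeof(early_node_map) / sizeof(early_node_map[0]); i++)
		if (early_node_map[i].start_pfn <= pfn &&
		    pfn < early_node_map[i].end_pfn)
			return early_node_map[i].nid;
	return 0;                       /* fall back to node 0, like the kernel */
}

int main(void)
{
	printf("pfn 300000 -> node %d\n", lookup_nid(300000));  /* node 1 */
	return 0;
}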
2311 | 2311 | ||
2312 | /* Basic iterator support to walk early_node_map[] */ | 2312 | /* Basic iterator support to walk early_node_map[] */ |
2313 | #define for_each_active_range_index_in_nid(i, nid) \ | 2313 | #define for_each_active_range_index_in_nid(i, nid) \ |
2314 | for (i = first_active_region_index_in_nid(nid); i != -1; \ | 2314 | for (i = first_active_region_index_in_nid(nid); i != -1; \ |
2315 | i = next_active_region_index_in_nid(i, nid)) | 2315 | i = next_active_region_index_in_nid(i, nid)) |
2316 | 2316 | ||
2317 | /** | 2317 | /** |
2318 | * free_bootmem_with_active_regions - Call free_bootmem_node for each active range | 2318 | * free_bootmem_with_active_regions - Call free_bootmem_node for each active range |
2319 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. | 2319 | * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. |
2320 | * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node | 2320 | * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node |
2321 | * | 2321 | * |
2322 | * If an architecture guarantees that all ranges registered with | 2322 | * If an architecture guarantees that all ranges registered with |
2323 | * add_active_ranges() contain no holes and may be freed, this | 2323 | * add_active_ranges() contain no holes and may be freed, this |
2324 | * function may be used instead of calling free_bootmem() manually. | 2324 | * function may be used instead of calling free_bootmem() manually. |
2325 | */ | 2325 | */ |
2326 | void __init free_bootmem_with_active_regions(int nid, | 2326 | void __init free_bootmem_with_active_regions(int nid, |
2327 | unsigned long max_low_pfn) | 2327 | unsigned long max_low_pfn) |
2328 | { | 2328 | { |
2329 | int i; | 2329 | int i; |
2330 | 2330 | ||
2331 | for_each_active_range_index_in_nid(i, nid) { | 2331 | for_each_active_range_index_in_nid(i, nid) { |
2332 | unsigned long size_pages = 0; | 2332 | unsigned long size_pages = 0; |
2333 | unsigned long end_pfn = early_node_map[i].end_pfn; | 2333 | unsigned long end_pfn = early_node_map[i].end_pfn; |
2334 | 2334 | ||
2335 | if (early_node_map[i].start_pfn >= max_low_pfn) | 2335 | if (early_node_map[i].start_pfn >= max_low_pfn) |
2336 | continue; | 2336 | continue; |
2337 | 2337 | ||
2338 | if (end_pfn > max_low_pfn) | 2338 | if (end_pfn > max_low_pfn) |
2339 | end_pfn = max_low_pfn; | 2339 | end_pfn = max_low_pfn; |
2340 | 2340 | ||
2341 | size_pages = end_pfn - early_node_map[i].start_pfn; | 2341 | size_pages = end_pfn - early_node_map[i].start_pfn; |
2342 | free_bootmem_node(NODE_DATA(early_node_map[i].nid), | 2342 | free_bootmem_node(NODE_DATA(early_node_map[i].nid), |
2343 | PFN_PHYS(early_node_map[i].start_pfn), | 2343 | PFN_PHYS(early_node_map[i].start_pfn), |
2344 | size_pages << PAGE_SHIFT); | 2344 | size_pages << PAGE_SHIFT); |
2345 | } | 2345 | } |
2346 | } | 2346 | } |
2347 | 2347 | ||
2348 | /** | 2348 | /** |
2349 | * sparse_memory_present_with_active_regions - Call memory_present for each active range | 2349 | * sparse_memory_present_with_active_regions - Call memory_present for each active range |
2350 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. | 2350 | * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. |
2351 | * | 2351 | * |
2352 | * If an architecture guarantees that all ranges registered with | 2352 | * If an architecture guarantees that all ranges registered with |
2353 | * add_active_ranges() contain no holes and may be freed, this | 2353 | * add_active_ranges() contain no holes and may be freed, this |
2354 | * function may be used instead of calling memory_present() manually. | 2354 | * function may be used instead of calling memory_present() manually. |
2355 | */ | 2355 | */ |
2356 | void __init sparse_memory_present_with_active_regions(int nid) | 2356 | void __init sparse_memory_present_with_active_regions(int nid) |
2357 | { | 2357 | { |
2358 | int i; | 2358 | int i; |
2359 | 2359 | ||
2360 | for_each_active_range_index_in_nid(i, nid) | 2360 | for_each_active_range_index_in_nid(i, nid) |
2361 | memory_present(early_node_map[i].nid, | 2361 | memory_present(early_node_map[i].nid, |
2362 | early_node_map[i].start_pfn, | 2362 | early_node_map[i].start_pfn, |
2363 | early_node_map[i].end_pfn); | 2363 | early_node_map[i].end_pfn); |
2364 | } | 2364 | } |
2365 | 2365 | ||
2366 | /** | 2366 | /** |
2367 | * push_node_boundaries - Push node boundaries to at least the requested boundary | 2367 | * push_node_boundaries - Push node boundaries to at least the requested boundary |
2368 | * @nid: The nid of the node to push the boundary for | 2368 | * @nid: The nid of the node to push the boundary for |
2369 | * @start_pfn: The start pfn of the node | 2369 | * @start_pfn: The start pfn of the node |
2370 | * @end_pfn: The end pfn of the node | 2370 | * @end_pfn: The end pfn of the node |
2371 | * | 2371 | * |
2372 | * In reserve-based hot-add, mem_map is allocated that is unused until hotadd | 2372 | * In reserve-based hot-add, mem_map is allocated that is unused until hotadd |
2373 | * time. Specifically, on x86_64, SRAT will report ranges that can potentially | 2373 | * time. Specifically, on x86_64, SRAT will report ranges that can potentially |
2374 | * be hotplugged even though no physical memory exists. This function allows | 2374 | * be hotplugged even though no physical memory exists. This function allows |
2375 | * an arch to push out the node boundaries so mem_map is allocated that can | 2375 | * an arch to push out the node boundaries so mem_map is allocated that can |
2376 | * be used later. | 2376 | * be used later. |
2377 | */ | 2377 | */ |
2378 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | 2378 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE |
2379 | void __init push_node_boundaries(unsigned int nid, | 2379 | void __init push_node_boundaries(unsigned int nid, |
2380 | unsigned long start_pfn, unsigned long end_pfn) | 2380 | unsigned long start_pfn, unsigned long end_pfn) |
2381 | { | 2381 | { |
2382 | printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", | 2382 | printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", |
2383 | nid, start_pfn, end_pfn); | 2383 | nid, start_pfn, end_pfn); |
2384 | 2384 | ||
2385 | /* Initialise the boundary for this node if necessary */ | 2385 | /* Initialise the boundary for this node if necessary */ |
2386 | if (node_boundary_end_pfn[nid] == 0) | 2386 | if (node_boundary_end_pfn[nid] == 0) |
2387 | node_boundary_start_pfn[nid] = -1UL; | 2387 | node_boundary_start_pfn[nid] = -1UL; |
2388 | 2388 | ||
2389 | /* Update the boundaries */ | 2389 | /* Update the boundaries */ |
2390 | if (node_boundary_start_pfn[nid] > start_pfn) | 2390 | if (node_boundary_start_pfn[nid] > start_pfn) |
2391 | node_boundary_start_pfn[nid] = start_pfn; | 2391 | node_boundary_start_pfn[nid] = start_pfn; |
2392 | if (node_boundary_end_pfn[nid] < end_pfn) | 2392 | if (node_boundary_end_pfn[nid] < end_pfn) |
2393 | node_boundary_end_pfn[nid] = end_pfn; | 2393 | node_boundary_end_pfn[nid] = end_pfn; |
2394 | } | 2394 | } |
2395 | 2395 | ||
2396 | /* If necessary, push the node boundary out for reserve hotadd */ | 2396 | /* If necessary, push the node boundary out for reserve hotadd */ |
2397 | static void __init account_node_boundary(unsigned int nid, | 2397 | static void __init account_node_boundary(unsigned int nid, |
2398 | unsigned long *start_pfn, unsigned long *end_pfn) | 2398 | unsigned long *start_pfn, unsigned long *end_pfn) |
2399 | { | 2399 | { |
2400 | printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", | 2400 | printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", |
2401 | nid, *start_pfn, *end_pfn); | 2401 | nid, *start_pfn, *end_pfn); |
2402 | 2402 | ||
2403 | /* Return if boundary information has not been provided */ | 2403 | /* Return if boundary information has not been provided */ |
2404 | if (node_boundary_end_pfn[nid] == 0) | 2404 | if (node_boundary_end_pfn[nid] == 0) |
2405 | return; | 2405 | return; |
2406 | 2406 | ||
2407 | /* Check the boundaries and update if necessary */ | 2407 | /* Check the boundaries and update if necessary */ |
2408 | if (node_boundary_start_pfn[nid] < *start_pfn) | 2408 | if (node_boundary_start_pfn[nid] < *start_pfn) |
2409 | *start_pfn = node_boundary_start_pfn[nid]; | 2409 | *start_pfn = node_boundary_start_pfn[nid]; |
2410 | if (node_boundary_end_pfn[nid] > *end_pfn) | 2410 | if (node_boundary_end_pfn[nid] > *end_pfn) |
2411 | *end_pfn = node_boundary_end_pfn[nid]; | 2411 | *end_pfn = node_boundary_end_pfn[nid]; |
2412 | } | 2412 | } |
2413 | #else | 2413 | #else |
2414 | void __init push_node_boundaries(unsigned int nid, | 2414 | void __init push_node_boundaries(unsigned int nid, |
2415 | unsigned long start_pfn, unsigned long end_pfn) {} | 2415 | unsigned long start_pfn, unsigned long end_pfn) {} |
2416 | 2416 | ||
2417 | static void __init account_node_boundary(unsigned int nid, | 2417 | static void __init account_node_boundary(unsigned int nid, |
2418 | unsigned long *start_pfn, unsigned long *end_pfn) {} | 2418 | unsigned long *start_pfn, unsigned long *end_pfn) {} |
2419 | #endif | 2419 | #endif |
2420 | 2420 | ||
2421 | 2421 | ||
2422 | /** | 2422 | /** |
2423 | * get_pfn_range_for_nid - Return the start and end page frames for a node | 2423 | * get_pfn_range_for_nid - Return the start and end page frames for a node |
2424 | * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. | 2424 | * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. |
2425 | * @start_pfn: Passed by reference. On return, it will have the node start_pfn. | 2425 | * @start_pfn: Passed by reference. On return, it will have the node start_pfn. |
2426 | * @end_pfn: Passed by reference. On return, it will have the node end_pfn. | 2426 | * @end_pfn: Passed by reference. On return, it will have the node end_pfn. |
2427 | * | 2427 | * |
2428 | * It returns the start and end page frame of a node based on information | 2428 | * It returns the start and end page frame of a node based on information |
2429 | * provided by an arch calling add_active_range(). If called for a node | 2429 | * provided by an arch calling add_active_range(). If called for a node |
2430 | * with no available memory, a warning is printed and the start and end | 2430 | * with no available memory, a warning is printed and the start and end |
2431 | * PFNs will be 0. | 2431 | * PFNs will be 0. |
2432 | */ | 2432 | */ |
2433 | void __init get_pfn_range_for_nid(unsigned int nid, | 2433 | void __init get_pfn_range_for_nid(unsigned int nid, |
2434 | unsigned long *start_pfn, unsigned long *end_pfn) | 2434 | unsigned long *start_pfn, unsigned long *end_pfn) |
2435 | { | 2435 | { |
2436 | int i; | 2436 | int i; |
2437 | *start_pfn = -1UL; | 2437 | *start_pfn = -1UL; |
2438 | *end_pfn = 0; | 2438 | *end_pfn = 0; |
2439 | 2439 | ||
2440 | for_each_active_range_index_in_nid(i, nid) { | 2440 | for_each_active_range_index_in_nid(i, nid) { |
2441 | *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); | 2441 | *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); |
2442 | *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); | 2442 | *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); |
2443 | } | 2443 | } |
2444 | 2444 | ||
2445 | if (*start_pfn == -1UL) { | 2445 | if (*start_pfn == -1UL) { |
2446 | printk(KERN_WARNING "Node %u active with no memory\n", nid); | 2446 | printk(KERN_WARNING "Node %u active with no memory\n", nid); |
2447 | *start_pfn = 0; | 2447 | *start_pfn = 0; |
2448 | } | 2448 | } |
2449 | 2449 | ||
2450 | /* Push the node boundaries out if requested */ | 2450 | /* Push the node boundaries out if requested */ |
2451 | account_node_boundary(nid, start_pfn, end_pfn); | 2451 | account_node_boundary(nid, start_pfn, end_pfn); |
2452 | } | 2452 | } |
2453 | 2453 | ||
2454 | /* | 2454 | /* |
2455 | * Return the number of pages a zone spans in a node, including holes | 2455 | * Return the number of pages a zone spans in a node, including holes |
2456 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() | 2456 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() |
2457 | */ | 2457 | */ |
2458 | unsigned long __init zone_spanned_pages_in_node(int nid, | 2458 | unsigned long __init zone_spanned_pages_in_node(int nid, |
2459 | unsigned long zone_type, | 2459 | unsigned long zone_type, |
2460 | unsigned long *ignored) | 2460 | unsigned long *ignored) |
2461 | { | 2461 | { |
2462 | unsigned long node_start_pfn, node_end_pfn; | 2462 | unsigned long node_start_pfn, node_end_pfn; |
2463 | unsigned long zone_start_pfn, zone_end_pfn; | 2463 | unsigned long zone_start_pfn, zone_end_pfn; |
2464 | 2464 | ||
2465 | /* Get the start and end of the node and zone */ | 2465 | /* Get the start and end of the node and zone */ |
2466 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); | 2466 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); |
2467 | zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; | 2467 | zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; |
2468 | zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; | 2468 | zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; |
2469 | 2469 | ||
2470 | /* Check that this node has pages within the zone's required range */ | 2470 | /* Check that this node has pages within the zone's required range */ |
2471 | if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) | 2471 | if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) |
2472 | return 0; | 2472 | return 0; |
2473 | 2473 | ||
2474 | /* Move the zone boundaries inside the node if necessary */ | 2474 | /* Move the zone boundaries inside the node if necessary */ |
2475 | zone_end_pfn = min(zone_end_pfn, node_end_pfn); | 2475 | zone_end_pfn = min(zone_end_pfn, node_end_pfn); |
2476 | zone_start_pfn = max(zone_start_pfn, node_start_pfn); | 2476 | zone_start_pfn = max(zone_start_pfn, node_start_pfn); |
2477 | 2477 | ||
2478 | /* Return the spanned pages */ | 2478 | /* Return the spanned pages */ |
2479 | return zone_end_pfn - zone_start_pfn; | 2479 | return zone_end_pfn - zone_start_pfn; |
2480 | } | 2480 | } |
2481 | 2481 | ||
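The clamping above amounts to intersecting the zone's PFN window with the node's PFN window. A userspace sketch added for illustration (not part of the patch; the node and zone ranges are example assumptions):

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }

int main(void)
{
	/* assumed: node spans [100000, 300000), zone spans [0, 262144) */
	unsigned long node_start = 100000, node_end = 300000;
	unsigned long zone_start = 0, zone_end = 262144;

	if (zone_end < node_start || zone_start > node_end) {
		printf("node has no pages in this zone\n");
		return 0;
	}
	zone_end = min_ul(zone_end, node_end);      /* clip zone to the node */
	zone_start = max_ul(zone_start, node_start);
	printf("spanned pages: %lu\n", zone_end - zone_start);  /* 162144 */
	return 0;
}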
2482 | /* | 2482 | /* |
2483 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, | 2483 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, |
2484 | * then all holes in the requested range will be accounted for. | 2484 | * then all holes in the requested range will be accounted for. |
2485 | */ | 2485 | */ |
2486 | unsigned long __init __absent_pages_in_range(int nid, | 2486 | unsigned long __init __absent_pages_in_range(int nid, |
2487 | unsigned long range_start_pfn, | 2487 | unsigned long range_start_pfn, |
2488 | unsigned long range_end_pfn) | 2488 | unsigned long range_end_pfn) |
2489 | { | 2489 | { |
2490 | int i = 0; | 2490 | int i = 0; |
2491 | unsigned long prev_end_pfn = 0, hole_pages = 0; | 2491 | unsigned long prev_end_pfn = 0, hole_pages = 0; |
2492 | unsigned long start_pfn; | 2492 | unsigned long start_pfn; |
2493 | 2493 | ||
2494 | /* Find the end_pfn of the first active range of pfns in the node */ | 2494 | /* Find the end_pfn of the first active range of pfns in the node */ |
2495 | i = first_active_region_index_in_nid(nid); | 2495 | i = first_active_region_index_in_nid(nid); |
2496 | if (i == -1) | 2496 | if (i == -1) |
2497 | return 0; | 2497 | return 0; |
2498 | 2498 | ||
2499 | /* Account for ranges before physical memory on this node */ | 2499 | /* Account for ranges before physical memory on this node */ |
2500 | if (early_node_map[i].start_pfn > range_start_pfn) | 2500 | if (early_node_map[i].start_pfn > range_start_pfn) |
2501 | hole_pages = early_node_map[i].start_pfn - range_start_pfn; | 2501 | hole_pages = early_node_map[i].start_pfn - range_start_pfn; |
2502 | 2502 | ||
2503 | prev_end_pfn = early_node_map[i].start_pfn; | 2503 | prev_end_pfn = early_node_map[i].start_pfn; |
2504 | 2504 | ||
2505 | /* Find all holes for the zone within the node */ | 2505 | /* Find all holes for the zone within the node */ |
2506 | for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { | 2506 | for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { |
2507 | 2507 | ||
2508 | /* No need to continue if prev_end_pfn is outside the zone */ | 2508 | /* No need to continue if prev_end_pfn is outside the zone */ |
2509 | if (prev_end_pfn >= range_end_pfn) | 2509 | if (prev_end_pfn >= range_end_pfn) |
2510 | break; | 2510 | break; |
2511 | 2511 | ||
2512 | /* Make sure the end of the zone is not within the hole */ | 2512 | /* Make sure the end of the zone is not within the hole */ |
2513 | start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); | 2513 | start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); |
2514 | prev_end_pfn = max(prev_end_pfn, range_start_pfn); | 2514 | prev_end_pfn = max(prev_end_pfn, range_start_pfn); |
2515 | 2515 | ||
2516 | /* Update the hole size count and move on */ | 2516 | /* Update the hole size count and move on */ |
2517 | if (start_pfn > range_start_pfn) { | 2517 | if (start_pfn > range_start_pfn) { |
2518 | BUG_ON(prev_end_pfn > start_pfn); | 2518 | BUG_ON(prev_end_pfn > start_pfn); |
2519 | hole_pages += start_pfn - prev_end_pfn; | 2519 | hole_pages += start_pfn - prev_end_pfn; |
2520 | } | 2520 | } |
2521 | prev_end_pfn = early_node_map[i].end_pfn; | 2521 | prev_end_pfn = early_node_map[i].end_pfn; |
2522 | } | 2522 | } |
2523 | 2523 | ||
2524 | /* Account for ranges past physical memory on this node */ | 2524 | /* Account for ranges past physical memory on this node */ |
2525 | if (range_end_pfn > prev_end_pfn) | 2525 | if (range_end_pfn > prev_end_pfn) |
2526 | hole_pages += range_end_pfn - | 2526 | hole_pages += range_end_pfn - |
2527 | max(range_start_pfn, prev_end_pfn); | 2527 | max(range_start_pfn, prev_end_pfn); |
2528 | 2528 | ||
2529 | return hole_pages; | 2529 | return hole_pages; |
2530 | } | 2530 | } |
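
The hole-counting walk in __absent_pages_in_range() above clips each active region to the requested PFN window and adds up whatever the regions do not cover. A minimal userspace sketch of the same idea (the region array and absent_pages() helper are hypothetical stand-ins for early_node_map[]; the nid filtering is dropped):

#include <stdio.h>

struct region { unsigned long start_pfn, end_pfn; };

/* Count PFNs in [range_start, range_end) not covered by any region.
 * Regions are assumed sorted by start_pfn and non-overlapping,
 * mirroring the sorted early_node_map[] walked above. */
static unsigned long absent_pages(const struct region *map, int n,
                                  unsigned long range_start,
                                  unsigned long range_end)
{
    unsigned long prev_end = range_start, holes = 0;
    int i;

    for (i = 0; i < n && prev_end < range_end; i++) {
        unsigned long start = map[i].start_pfn;
        unsigned long end = map[i].end_pfn;

        if (start > range_end)
            start = range_end;
        if (start > prev_end)          /* gap before this region */
            holes += start - prev_end;
        if (end > prev_end)
            prev_end = end;
    }
    if (range_end > prev_end)          /* gap after the last region */
        holes += range_end - prev_end;
    return holes;
}

int main(void)
{
    struct region map[] = { { 0, 100 }, { 150, 200 } };

    /* Hole is PFNs 100..149 -> prints 50 pages absent in [0, 200). */
    printf("%lu\n", absent_pages(map, 2, 0, 200));
    return 0;
}
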
2531 | 2531 | ||
2532 | /** | 2532 | /** |
2533 | * absent_pages_in_range - Return number of page frames in holes within a range | 2533 | * absent_pages_in_range - Return number of page frames in holes within a range |
2534 | * @start_pfn: The start PFN to start searching for holes | 2534 | * @start_pfn: The start PFN to start searching for holes |
2535 | * @end_pfn: The end PFN to stop searching for holes | 2535 | * @end_pfn: The end PFN to stop searching for holes |
2536 | * | 2536 | * |
2537 | * It returns the number of page frames in memory holes within a range. | 2537 | * It returns the number of page frames in memory holes within a range. |
2538 | */ | 2538 | */ |
2539 | unsigned long __init absent_pages_in_range(unsigned long start_pfn, | 2539 | unsigned long __init absent_pages_in_range(unsigned long start_pfn, |
2540 | unsigned long end_pfn) | 2540 | unsigned long end_pfn) |
2541 | { | 2541 | { |
2542 | return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); | 2542 | return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); |
2543 | } | 2543 | } |
2544 | 2544 | ||
2545 | /* Return the number of page frames in holes in a zone on a node */ | 2545 | /* Return the number of page frames in holes in a zone on a node */ |
2546 | unsigned long __init zone_absent_pages_in_node(int nid, | 2546 | unsigned long __init zone_absent_pages_in_node(int nid, |
2547 | unsigned long zone_type, | 2547 | unsigned long zone_type, |
2548 | unsigned long *ignored) | 2548 | unsigned long *ignored) |
2549 | { | 2549 | { |
2550 | unsigned long node_start_pfn, node_end_pfn; | 2550 | unsigned long node_start_pfn, node_end_pfn; |
2551 | unsigned long zone_start_pfn, zone_end_pfn; | 2551 | unsigned long zone_start_pfn, zone_end_pfn; |
2552 | 2552 | ||
2553 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); | 2553 | get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); |
2554 | zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], | 2554 | zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], |
2555 | node_start_pfn); | 2555 | node_start_pfn); |
2556 | zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], | 2556 | zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], |
2557 | node_end_pfn); | 2557 | node_end_pfn); |
2558 | 2558 | ||
2559 | return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); | 2559 | return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); |
2560 | } | 2560 | } |
2561 | 2561 | ||
2562 | #else | 2562 | #else |
2563 | static inline unsigned long zone_spanned_pages_in_node(int nid, | 2563 | static inline unsigned long zone_spanned_pages_in_node(int nid, |
2564 | unsigned long zone_type, | 2564 | unsigned long zone_type, |
2565 | unsigned long *zones_size) | 2565 | unsigned long *zones_size) |
2566 | { | 2566 | { |
2567 | return zones_size[zone_type]; | 2567 | return zones_size[zone_type]; |
2568 | } | 2568 | } |
2569 | 2569 | ||
2570 | static inline unsigned long zone_absent_pages_in_node(int nid, | 2570 | static inline unsigned long zone_absent_pages_in_node(int nid, |
2571 | unsigned long zone_type, | 2571 | unsigned long zone_type, |
2572 | unsigned long *zholes_size) | 2572 | unsigned long *zholes_size) |
2573 | { | 2573 | { |
2574 | if (!zholes_size) | 2574 | if (!zholes_size) |
2575 | return 0; | 2575 | return 0; |
2576 | 2576 | ||
2577 | return zholes_size[zone_type]; | 2577 | return zholes_size[zone_type]; |
2578 | } | 2578 | } |
2579 | 2579 | ||
2580 | #endif | 2580 | #endif |
2581 | 2581 | ||
2582 | static void __init calculate_node_totalpages(struct pglist_data *pgdat, | 2582 | static void __init calculate_node_totalpages(struct pglist_data *pgdat, |
2583 | unsigned long *zones_size, unsigned long *zholes_size) | 2583 | unsigned long *zones_size, unsigned long *zholes_size) |
2584 | { | 2584 | { |
2585 | unsigned long realtotalpages, totalpages = 0; | 2585 | unsigned long realtotalpages, totalpages = 0; |
2586 | enum zone_type i; | 2586 | enum zone_type i; |
2587 | 2587 | ||
2588 | for (i = 0; i < MAX_NR_ZONES; i++) | 2588 | for (i = 0; i < MAX_NR_ZONES; i++) |
2589 | totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, | 2589 | totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, |
2590 | zones_size); | 2590 | zones_size); |
2591 | pgdat->node_spanned_pages = totalpages; | 2591 | pgdat->node_spanned_pages = totalpages; |
2592 | 2592 | ||
2593 | realtotalpages = totalpages; | 2593 | realtotalpages = totalpages; |
2594 | for (i = 0; i < MAX_NR_ZONES; i++) | 2594 | for (i = 0; i < MAX_NR_ZONES; i++) |
2595 | realtotalpages -= | 2595 | realtotalpages -= |
2596 | zone_absent_pages_in_node(pgdat->node_id, i, | 2596 | zone_absent_pages_in_node(pgdat->node_id, i, |
2597 | zholes_size); | 2597 | zholes_size); |
2598 | pgdat->node_present_pages = realtotalpages; | 2598 | pgdat->node_present_pages = realtotalpages; |
2599 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, | 2599 | printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, |
2600 | realtotalpages); | 2600 | realtotalpages); |
2601 | } | 2601 | } |
2602 | 2602 | ||
2603 | /* | 2603 | /* |
2604 | * Set up the zone data structures: | 2604 | * Set up the zone data structures: |
2605 | * - mark all pages reserved | 2605 | * - mark all pages reserved |
2606 | * - mark all memory queues empty | 2606 | * - mark all memory queues empty |
2607 | * - clear the memory bitmaps | 2607 | * - clear the memory bitmaps |
2608 | */ | 2608 | */ |
2609 | static void __meminit free_area_init_core(struct pglist_data *pgdat, | 2609 | static void __meminit free_area_init_core(struct pglist_data *pgdat, |
2610 | unsigned long *zones_size, unsigned long *zholes_size) | 2610 | unsigned long *zones_size, unsigned long *zholes_size) |
2611 | { | 2611 | { |
2612 | enum zone_type j; | 2612 | enum zone_type j; |
2613 | int nid = pgdat->node_id; | 2613 | int nid = pgdat->node_id; |
2614 | unsigned long zone_start_pfn = pgdat->node_start_pfn; | 2614 | unsigned long zone_start_pfn = pgdat->node_start_pfn; |
2615 | int ret; | 2615 | int ret; |
2616 | 2616 | ||
2617 | pgdat_resize_init(pgdat); | 2617 | pgdat_resize_init(pgdat); |
2618 | pgdat->nr_zones = 0; | 2618 | pgdat->nr_zones = 0; |
2619 | init_waitqueue_head(&pgdat->kswapd_wait); | 2619 | init_waitqueue_head(&pgdat->kswapd_wait); |
2620 | pgdat->kswapd_max_order = 0; | 2620 | pgdat->kswapd_max_order = 0; |
2621 | 2621 | ||
2622 | for (j = 0; j < MAX_NR_ZONES; j++) { | 2622 | for (j = 0; j < MAX_NR_ZONES; j++) { |
2623 | struct zone *zone = pgdat->node_zones + j; | 2623 | struct zone *zone = pgdat->node_zones + j; |
2624 | unsigned long size, realsize, memmap_pages; | 2624 | unsigned long size, realsize, memmap_pages; |
2625 | 2625 | ||
2626 | size = zone_spanned_pages_in_node(nid, j, zones_size); | 2626 | size = zone_spanned_pages_in_node(nid, j, zones_size); |
2627 | realsize = size - zone_absent_pages_in_node(nid, j, | 2627 | realsize = size - zone_absent_pages_in_node(nid, j, |
2628 | zholes_size); | 2628 | zholes_size); |
2629 | 2629 | ||
2630 | /* | 2630 | /* |
2631 | * Adjust realsize so that it accounts for how much memory | 2631 | * Adjust realsize so that it accounts for how much memory |
2632 | * is used by this zone for memmap. This affects the watermark | 2632 | * is used by this zone for memmap. This affects the watermark |
2633 | * and per-cpu initialisations | 2633 | * and per-cpu initialisations |
2634 | */ | 2634 | */ |
2635 | memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT; | 2635 | memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT; |
2636 | if (realsize >= memmap_pages) { | 2636 | if (realsize >= memmap_pages) { |
2637 | realsize -= memmap_pages; | 2637 | realsize -= memmap_pages; |
2638 | printk(KERN_DEBUG | 2638 | printk(KERN_DEBUG |
2639 | " %s zone: %lu pages used for memmap\n", | 2639 | " %s zone: %lu pages used for memmap\n", |
2640 | zone_names[j], memmap_pages); | 2640 | zone_names[j], memmap_pages); |
2641 | } else | 2641 | } else |
2642 | printk(KERN_WARNING | 2642 | printk(KERN_WARNING |
2643 | " %s zone: %lu pages exceeds realsize %lu\n", | 2643 | " %s zone: %lu pages exceeds realsize %lu\n", |
2644 | zone_names[j], memmap_pages, realsize); | 2644 | zone_names[j], memmap_pages, realsize); |
2645 | 2645 | ||
2646 | /* Account for reserved pages */ | 2646 | /* Account for reserved pages */ |
2647 | if (j == 0 && realsize > dma_reserve) { | 2647 | if (j == 0 && realsize > dma_reserve) { |
2648 | realsize -= dma_reserve; | 2648 | realsize -= dma_reserve; |
2649 | printk(KERN_DEBUG " %s zone: %lu pages reserved\n", | 2649 | printk(KERN_DEBUG " %s zone: %lu pages reserved\n", |
2650 | zone_names[0], dma_reserve); | 2650 | zone_names[0], dma_reserve); |
2651 | } | 2651 | } |
2652 | 2652 | ||
2653 | if (!is_highmem_idx(j)) | 2653 | if (!is_highmem_idx(j)) |
2654 | nr_kernel_pages += realsize; | 2654 | nr_kernel_pages += realsize; |
2655 | nr_all_pages += realsize; | 2655 | nr_all_pages += realsize; |
2656 | 2656 | ||
2657 | zone->spanned_pages = size; | 2657 | zone->spanned_pages = size; |
2658 | zone->present_pages = realsize; | 2658 | zone->present_pages = realsize; |
2659 | #ifdef CONFIG_NUMA | 2659 | #ifdef CONFIG_NUMA |
2660 | zone->node = nid; | 2660 | zone->node = nid; |
2661 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) | 2661 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) |
2662 | / 100; | 2662 | / 100; |
2663 | zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; | 2663 | zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; |
2664 | #endif | 2664 | #endif |
2665 | zone->name = zone_names[j]; | 2665 | zone->name = zone_names[j]; |
2666 | spin_lock_init(&zone->lock); | 2666 | spin_lock_init(&zone->lock); |
2667 | spin_lock_init(&zone->lru_lock); | 2667 | spin_lock_init(&zone->lru_lock); |
2668 | zone_seqlock_init(zone); | 2668 | zone_seqlock_init(zone); |
2669 | zone->zone_pgdat = pgdat; | 2669 | zone->zone_pgdat = pgdat; |
2670 | 2670 | ||
2671 | zone->prev_priority = DEF_PRIORITY; | 2671 | zone->prev_priority = DEF_PRIORITY; |
2672 | 2672 | ||
2673 | zone_pcp_init(zone); | 2673 | zone_pcp_init(zone); |
2674 | INIT_LIST_HEAD(&zone->active_list); | 2674 | INIT_LIST_HEAD(&zone->active_list); |
2675 | INIT_LIST_HEAD(&zone->inactive_list); | 2675 | INIT_LIST_HEAD(&zone->inactive_list); |
2676 | zone->nr_scan_active = 0; | 2676 | zone->nr_scan_active = 0; |
2677 | zone->nr_scan_inactive = 0; | 2677 | zone->nr_scan_inactive = 0; |
2678 | zap_zone_vm_stats(zone); | 2678 | zap_zone_vm_stats(zone); |
2679 | atomic_set(&zone->reclaim_in_progress, 0); | 2679 | atomic_set(&zone->reclaim_in_progress, 0); |
2680 | if (!size) | 2680 | if (!size) |
2681 | continue; | 2681 | continue; |
2682 | 2682 | ||
2683 | ret = init_currently_empty_zone(zone, zone_start_pfn, | 2683 | ret = init_currently_empty_zone(zone, zone_start_pfn, |
2684 | size, MEMMAP_EARLY); | 2684 | size, MEMMAP_EARLY); |
2685 | BUG_ON(ret); | 2685 | BUG_ON(ret); |
2686 | zone_start_pfn += size; | 2686 | zone_start_pfn += size; |
2687 | } | 2687 | } |
2688 | } | 2688 | } |
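
The memmap adjustment in free_area_init_core() is plain arithmetic: the zone's own struct page array costs (size * sizeof(struct page)) >> PAGE_SHIFT pages, which are then removed from realsize before watermarks and per-cpu sizes are derived. A hedged sketch, assuming 4 KiB pages and a 56-byte struct page (both vary by architecture and config):

#include <stdio.h>

#define PAGE_SHIFT 12              /* assumed: 4 KiB pages */
#define STRUCT_PAGE_SIZE 56UL      /* assumed: sizeof(struct page) */

int main(void)
{
    unsigned long size = 262144;   /* zone spans 1 GiB of 4 KiB pages */
    unsigned long memmap_pages = (size * STRUCT_PAGE_SIZE) >> PAGE_SHIFT;

    /* ~3584 pages (14 MiB) of the zone are eaten by its memmap and are
     * therefore subtracted from realsize before watermarks are set. */
    printf("memmap_pages = %lu\n", memmap_pages);
    return 0;
}
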
2689 | 2689 | ||
2690 | static void __init alloc_node_mem_map(struct pglist_data *pgdat) | 2690 | static void __init alloc_node_mem_map(struct pglist_data *pgdat) |
2691 | { | 2691 | { |
2692 | /* Skip empty nodes */ | 2692 | /* Skip empty nodes */ |
2693 | if (!pgdat->node_spanned_pages) | 2693 | if (!pgdat->node_spanned_pages) |
2694 | return; | 2694 | return; |
2695 | 2695 | ||
2696 | #ifdef CONFIG_FLAT_NODE_MEM_MAP | 2696 | #ifdef CONFIG_FLAT_NODE_MEM_MAP |
2697 | /* ia64 gets its own node_mem_map, before this, without bootmem */ | 2697 | /* ia64 gets its own node_mem_map, before this, without bootmem */ |
2698 | if (!pgdat->node_mem_map) { | 2698 | if (!pgdat->node_mem_map) { |
2699 | unsigned long size, start, end; | 2699 | unsigned long size, start, end; |
2700 | struct page *map; | 2700 | struct page *map; |
2701 | 2701 | ||
2702 | /* | 2702 | /* |
2703 | * The zone's endpoints aren't required to be MAX_ORDER | 2703 | * The zone's endpoints aren't required to be MAX_ORDER |
2704 | * aligned but the node_mem_map endpoints must be in order | 2704 | * aligned but the node_mem_map endpoints must be in order |
2705 | * for the buddy allocator to function correctly. | 2705 | * for the buddy allocator to function correctly. |
2706 | */ | 2706 | */ |
2707 | start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); | 2707 | start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); |
2708 | end = pgdat->node_start_pfn + pgdat->node_spanned_pages; | 2708 | end = pgdat->node_start_pfn + pgdat->node_spanned_pages; |
2709 | end = ALIGN(end, MAX_ORDER_NR_PAGES); | 2709 | end = ALIGN(end, MAX_ORDER_NR_PAGES); |
2710 | size = (end - start) * sizeof(struct page); | 2710 | size = (end - start) * sizeof(struct page); |
2711 | map = alloc_remap(pgdat->node_id, size); | 2711 | map = alloc_remap(pgdat->node_id, size); |
2712 | if (!map) | 2712 | if (!map) |
2713 | map = alloc_bootmem_node(pgdat, size); | 2713 | map = alloc_bootmem_node(pgdat, size); |
2714 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); | 2714 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); |
2715 | } | 2715 | } |
2716 | #ifdef CONFIG_FLATMEM | 2716 | #ifdef CONFIG_FLATMEM |
2717 | /* | 2717 | /* |
2718 | * With no DISCONTIG, the global mem_map is just set as node 0's | 2718 | * With no DISCONTIG, the global mem_map is just set as node 0's |
2719 | */ | 2719 | */ |
2720 | if (pgdat == NODE_DATA(0)) { | 2720 | if (pgdat == NODE_DATA(0)) { |
2721 | mem_map = NODE_DATA(0)->node_mem_map; | 2721 | mem_map = NODE_DATA(0)->node_mem_map; |
2722 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | 2722 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP |
2723 | if (page_to_pfn(mem_map) != pgdat->node_start_pfn) | 2723 | if (page_to_pfn(mem_map) != pgdat->node_start_pfn) |
2724 | mem_map -= pgdat->node_start_pfn; | 2724 | mem_map -= pgdat->node_start_pfn; |
2725 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | 2725 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ |
2726 | } | 2726 | } |
2727 | #endif | 2727 | #endif |
2728 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ | 2728 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ |
2729 | } | 2729 | } |
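
The rounding in alloc_node_mem_map() is a standard align-down of the start PFN and align-up of the end PFN, so the memmap covers a MAX_ORDER-aligned window even when the node itself is not aligned. A small sketch, assuming MAX_ORDER_NR_PAGES is 1024 (MAX_ORDER 11 with 4 KiB pages):

#include <stdio.h>

#define MAX_ORDER_NR_PAGES 1024UL  /* assumed: MAX_ORDER 11 */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
    unsigned long node_start_pfn = 1500, node_spanned_pages = 3000;
    unsigned long start = node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
    unsigned long end = ALIGN(node_start_pfn + node_spanned_pages,
                              MAX_ORDER_NR_PAGES);

    /* start rounds down to 1024, end rounds up to 5120, so the memmap
     * spans a MAX_ORDER-aligned window around the node's pages. */
    printf("start=%lu end=%lu\n", start, end);
    return 0;
}
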
2730 | 2730 | ||
2731 | void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, | 2731 | void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, |
2732 | unsigned long *zones_size, unsigned long node_start_pfn, | 2732 | unsigned long *zones_size, unsigned long node_start_pfn, |
2733 | unsigned long *zholes_size) | 2733 | unsigned long *zholes_size) |
2734 | { | 2734 | { |
2735 | pgdat->node_id = nid; | 2735 | pgdat->node_id = nid; |
2736 | pgdat->node_start_pfn = node_start_pfn; | 2736 | pgdat->node_start_pfn = node_start_pfn; |
2737 | calculate_node_totalpages(pgdat, zones_size, zholes_size); | 2737 | calculate_node_totalpages(pgdat, zones_size, zholes_size); |
2738 | 2738 | ||
2739 | alloc_node_mem_map(pgdat); | 2739 | alloc_node_mem_map(pgdat); |
2740 | 2740 | ||
2741 | free_area_init_core(pgdat, zones_size, zholes_size); | 2741 | free_area_init_core(pgdat, zones_size, zholes_size); |
2742 | } | 2742 | } |
2743 | 2743 | ||
2744 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | 2744 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP |
2745 | /** | 2745 | /** |
2746 | * add_active_range - Register a range of PFNs backed by physical memory | 2746 | * add_active_range - Register a range of PFNs backed by physical memory |
2747 | * @nid: The node ID the range resides on | 2747 | * @nid: The node ID the range resides on |
2748 | * @start_pfn: The start PFN of the available physical memory | 2748 | * @start_pfn: The start PFN of the available physical memory |
2749 | * @end_pfn: The end PFN of the available physical memory | 2749 | * @end_pfn: The end PFN of the available physical memory |
2750 | * | 2750 | * |
2751 | * These ranges are stored in an early_node_map[] and later used by | 2751 | * These ranges are stored in an early_node_map[] and later used by |
2752 | * free_area_init_nodes() to calculate zone sizes and holes. If the | 2752 | * free_area_init_nodes() to calculate zone sizes and holes. If the |
2753 | * range spans a memory hole, it is up to the architecture to ensure | 2753 | * range spans a memory hole, it is up to the architecture to ensure |
2754 | * the memory is not freed by the bootmem allocator. If possible | 2754 | * the memory is not freed by the bootmem allocator. If possible |
2755 | * the range being registered will be merged with existing ranges. | 2755 | * the range being registered will be merged with existing ranges. |
2756 | */ | 2756 | */ |
2757 | void __init add_active_range(unsigned int nid, unsigned long start_pfn, | 2757 | void __init add_active_range(unsigned int nid, unsigned long start_pfn, |
2758 | unsigned long end_pfn) | 2758 | unsigned long end_pfn) |
2759 | { | 2759 | { |
2760 | int i; | 2760 | int i; |
2761 | 2761 | ||
2762 | printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) " | 2762 | printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) " |
2763 | "%d entries of %d used\n", | 2763 | "%d entries of %d used\n", |
2764 | nid, start_pfn, end_pfn, | 2764 | nid, start_pfn, end_pfn, |
2765 | nr_nodemap_entries, MAX_ACTIVE_REGIONS); | 2765 | nr_nodemap_entries, MAX_ACTIVE_REGIONS); |
2766 | 2766 | ||
2767 | /* Merge with existing active regions if possible */ | 2767 | /* Merge with existing active regions if possible */ |
2768 | for (i = 0; i < nr_nodemap_entries; i++) { | 2768 | for (i = 0; i < nr_nodemap_entries; i++) { |
2769 | if (early_node_map[i].nid != nid) | 2769 | if (early_node_map[i].nid != nid) |
2770 | continue; | 2770 | continue; |
2771 | 2771 | ||
2772 | /* Skip if an existing region covers this new one */ | 2772 | /* Skip if an existing region covers this new one */ |
2773 | if (start_pfn >= early_node_map[i].start_pfn && | 2773 | if (start_pfn >= early_node_map[i].start_pfn && |
2774 | end_pfn <= early_node_map[i].end_pfn) | 2774 | end_pfn <= early_node_map[i].end_pfn) |
2775 | return; | 2775 | return; |
2776 | 2776 | ||
2777 | /* Merge forward if suitable */ | 2777 | /* Merge forward if suitable */ |
2778 | if (start_pfn <= early_node_map[i].end_pfn && | 2778 | if (start_pfn <= early_node_map[i].end_pfn && |
2779 | end_pfn > early_node_map[i].end_pfn) { | 2779 | end_pfn > early_node_map[i].end_pfn) { |
2780 | early_node_map[i].end_pfn = end_pfn; | 2780 | early_node_map[i].end_pfn = end_pfn; |
2781 | return; | 2781 | return; |
2782 | } | 2782 | } |
2783 | 2783 | ||
2784 | /* Merge backward if suitable */ | 2784 | /* Merge backward if suitable */ |
2785 | if (start_pfn < early_node_map[i].end_pfn && | 2785 | if (start_pfn < early_node_map[i].end_pfn && |
2786 | end_pfn >= early_node_map[i].start_pfn) { | 2786 | end_pfn >= early_node_map[i].start_pfn) { |
2787 | early_node_map[i].start_pfn = start_pfn; | 2787 | early_node_map[i].start_pfn = start_pfn; |
2788 | return; | 2788 | return; |
2789 | } | 2789 | } |
2790 | } | 2790 | } |
2791 | 2791 | ||
2792 | /* Check that early_node_map is large enough */ | 2792 | /* Check that early_node_map is large enough */ |
2793 | if (i >= MAX_ACTIVE_REGIONS) { | 2793 | if (i >= MAX_ACTIVE_REGIONS) { |
2794 | printk(KERN_CRIT "More than %d memory regions, truncating\n", | 2794 | printk(KERN_CRIT "More than %d memory regions, truncating\n", |
2795 | MAX_ACTIVE_REGIONS); | 2795 | MAX_ACTIVE_REGIONS); |
2796 | return; | 2796 | return; |
2797 | } | 2797 | } |
2798 | 2798 | ||
2799 | early_node_map[i].nid = nid; | 2799 | early_node_map[i].nid = nid; |
2800 | early_node_map[i].start_pfn = start_pfn; | 2800 | early_node_map[i].start_pfn = start_pfn; |
2801 | early_node_map[i].end_pfn = end_pfn; | 2801 | early_node_map[i].end_pfn = end_pfn; |
2802 | nr_nodemap_entries = i + 1; | 2802 | nr_nodemap_entries = i + 1; |
2803 | } | 2803 | } |
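
add_active_range() prefers merging over appending: a new range that is already covered is dropped, one that overlaps an existing entry extends it forward or backward, and only otherwise is a new slot used. A compact userspace sketch of the same three cases (fixed-size table, hypothetical names, node id and the truncation message omitted):

#include <stdio.h>

struct range { unsigned long start, end; };
static struct range map[16];
static int nr;

static void add_range(unsigned long start, unsigned long end)
{
    int i;

    for (i = 0; i < nr; i++) {
        if (start >= map[i].start && end <= map[i].end)
            return;                        /* already covered */
        if (start <= map[i].end && end > map[i].end) {
            map[i].end = end;              /* merge forward */
            return;
        }
        if (start < map[i].end && end >= map[i].start) {
            map[i].start = start;          /* merge backward */
            return;
        }
    }
    if (nr >= 16)
        return;                            /* table full; sketch only */
    map[nr].start = start;
    map[nr].end = end;
    nr++;
}

int main(void)
{
    add_range(0, 100);
    add_range(50, 150);    /* merges forward: entry becomes 0..150 */
    add_range(200, 300);   /* no overlap: new entry */
    printf("%d entries, first ends at %lu\n", nr, map[0].end);
    return 0;
}
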
2804 | 2804 | ||
2805 | /** | 2805 | /** |
2806 | * shrink_active_range - Shrink an existing registered range of PFNs | 2806 | * shrink_active_range - Shrink an existing registered range of PFNs |
2807 | * @nid: The node id the range is on that should be shrunk | 2807 | * @nid: The node id the range is on that should be shrunk |
2808 | * @old_end_pfn: The old end PFN of the range | 2808 | * @old_end_pfn: The old end PFN of the range |
2809 | * @new_end_pfn: The new end PFN of the range | 2809 | * @new_end_pfn: The new end PFN of the range |
2810 | * | 2810 | * |
2811 | * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node. | 2811 | * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node. |
2812 | * The map is kept at the end of the physical page range that has already been | 2812 | * The map is kept at the end of the physical page range that has already been |
2813 | * registered with add_active_range(). This function allows an arch to shrink | 2813 | * registered with add_active_range(). This function allows an arch to shrink |
2814 | * an existing registered range. | 2814 | * an existing registered range. |
2815 | */ | 2815 | */ |
2816 | void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, | 2816 | void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, |
2817 | unsigned long new_end_pfn) | 2817 | unsigned long new_end_pfn) |
2818 | { | 2818 | { |
2819 | int i; | 2819 | int i; |
2820 | 2820 | ||
2821 | /* Find the old active region end and shrink */ | 2821 | /* Find the old active region end and shrink */ |
2822 | for_each_active_range_index_in_nid(i, nid) | 2822 | for_each_active_range_index_in_nid(i, nid) |
2823 | if (early_node_map[i].end_pfn == old_end_pfn) { | 2823 | if (early_node_map[i].end_pfn == old_end_pfn) { |
2824 | early_node_map[i].end_pfn = new_end_pfn; | 2824 | early_node_map[i].end_pfn = new_end_pfn; |
2825 | break; | 2825 | break; |
2826 | } | 2826 | } |
2827 | } | 2827 | } |
2828 | 2828 | ||
2829 | /** | 2829 | /** |
2830 | * remove_all_active_ranges - Remove all currently registered regions | 2830 | * remove_all_active_ranges - Remove all currently registered regions |
2831 | * | 2831 | * |
2832 | * During discovery, it may be found that a table like SRAT is invalid | 2832 | * During discovery, it may be found that a table like SRAT is invalid |
2833 | * and an alternative discovery method must be used. This function removes | 2833 | * and an alternative discovery method must be used. This function removes |
2834 | * all currently registered regions. | 2834 | * all currently registered regions. |
2835 | */ | 2835 | */ |
2836 | void __init remove_all_active_ranges(void) | 2836 | void __init remove_all_active_ranges(void) |
2837 | { | 2837 | { |
2838 | memset(early_node_map, 0, sizeof(early_node_map)); | 2838 | memset(early_node_map, 0, sizeof(early_node_map)); |
2839 | nr_nodemap_entries = 0; | 2839 | nr_nodemap_entries = 0; |
2840 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | 2840 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE |
2841 | memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); | 2841 | memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); |
2842 | memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); | 2842 | memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); |
2843 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | 2843 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ |
2844 | } | 2844 | } |
2845 | 2845 | ||
2846 | /* Compare two active node_active_regions */ | 2846 | /* Compare two active node_active_regions */ |
2847 | static int __init cmp_node_active_region(const void *a, const void *b) | 2847 | static int __init cmp_node_active_region(const void *a, const void *b) |
2848 | { | 2848 | { |
2849 | struct node_active_region *arange = (struct node_active_region *)a; | 2849 | struct node_active_region *arange = (struct node_active_region *)a; |
2850 | struct node_active_region *brange = (struct node_active_region *)b; | 2850 | struct node_active_region *brange = (struct node_active_region *)b; |
2851 | 2851 | ||
2852 | /* Done this way to avoid overflows */ | 2852 | /* Done this way to avoid overflows */ |
2853 | if (arange->start_pfn > brange->start_pfn) | 2853 | if (arange->start_pfn > brange->start_pfn) |
2854 | return 1; | 2854 | return 1; |
2855 | if (arange->start_pfn < brange->start_pfn) | 2855 | if (arange->start_pfn < brange->start_pfn) |
2856 | return -1; | 2856 | return -1; |
2857 | 2857 | ||
2858 | return 0; | 2858 | return 0; |
2859 | } | 2859 | } |
2860 | 2860 | ||
2861 | /* sort the node_map by start_pfn */ | 2861 | /* sort the node_map by start_pfn */ |
2862 | static void __init sort_node_map(void) | 2862 | static void __init sort_node_map(void) |
2863 | { | 2863 | { |
2864 | sort(early_node_map, (size_t)nr_nodemap_entries, | 2864 | sort(early_node_map, (size_t)nr_nodemap_entries, |
2865 | sizeof(struct node_active_region), | 2865 | sizeof(struct node_active_region), |
2866 | cmp_node_active_region, NULL); | 2866 | cmp_node_active_region, NULL); |
2867 | } | 2867 | } |
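
cmp_node_active_region() deliberately avoids returning a subtraction: with unsigned long PFNs the difference can wrap, and narrowing it to int can flip the sign. The same overflow-safe pattern, shown with plain qsort() in a userspace sketch:

#include <stdio.h>
#include <stdlib.h>

struct reg { unsigned long start_pfn, end_pfn; };

/* Returning (int)(a - b) could truncate or wrap for large PFNs;
 * explicit comparisons are always safe. */
static int cmp_reg(const void *a, const void *b)
{
    const struct reg *ra = a, *rb = b;

    if (ra->start_pfn > rb->start_pfn)
        return 1;
    if (ra->start_pfn < rb->start_pfn)
        return -1;
    return 0;
}

int main(void)
{
    struct reg map[] = { { 4096, 8192 }, { 0, 1024 }, { 2048, 3072 } };

    qsort(map, 3, sizeof(map[0]), cmp_reg);
    printf("first start_pfn after sort: %lu\n", map[0].start_pfn);
    return 0;
}
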
2868 | 2868 | ||
2869 | /* Find the lowest pfn for a node */ | 2869 | /* Find the lowest pfn for a node */ |
2870 | unsigned long __init find_min_pfn_for_node(unsigned long nid) | 2870 | unsigned long __init find_min_pfn_for_node(unsigned long nid) |
2871 | { | 2871 | { |
2872 | int i; | 2872 | int i; |
2873 | unsigned long min_pfn = ULONG_MAX; | 2873 | unsigned long min_pfn = ULONG_MAX; |
2874 | 2874 | ||
2875 | /* Assuming a sorted map, the first range found has the starting pfn */ | 2875 | /* Assuming a sorted map, the first range found has the starting pfn */ |
2876 | for_each_active_range_index_in_nid(i, nid) | 2876 | for_each_active_range_index_in_nid(i, nid) |
2877 | min_pfn = min(min_pfn, early_node_map[i].start_pfn); | 2877 | min_pfn = min(min_pfn, early_node_map[i].start_pfn); |
2878 | 2878 | ||
2879 | if (min_pfn == ULONG_MAX) { | 2879 | if (min_pfn == ULONG_MAX) { |
2880 | printk(KERN_WARNING | 2880 | printk(KERN_WARNING |
2881 | "Could not find start_pfn for node %lu\n", nid); | 2881 | "Could not find start_pfn for node %lu\n", nid); |
2882 | return 0; | 2882 | return 0; |
2883 | } | 2883 | } |
2884 | 2884 | ||
2885 | return min_pfn; | 2885 | return min_pfn; |
2886 | } | 2886 | } |
2887 | 2887 | ||
2888 | /** | 2888 | /** |
2889 | * find_min_pfn_with_active_regions - Find the minimum PFN registered | 2889 | * find_min_pfn_with_active_regions - Find the minimum PFN registered |
2890 | * | 2890 | * |
2891 | * It returns the minimum PFN based on information provided via | 2891 | * It returns the minimum PFN based on information provided via |
2892 | * add_active_range(). | 2892 | * add_active_range(). |
2893 | */ | 2893 | */ |
2894 | unsigned long __init find_min_pfn_with_active_regions(void) | 2894 | unsigned long __init find_min_pfn_with_active_regions(void) |
2895 | { | 2895 | { |
2896 | return find_min_pfn_for_node(MAX_NUMNODES); | 2896 | return find_min_pfn_for_node(MAX_NUMNODES); |
2897 | } | 2897 | } |
2898 | 2898 | ||
2899 | /** | 2899 | /** |
2900 | * find_max_pfn_with_active_regions - Find the maximum PFN registered | 2900 | * find_max_pfn_with_active_regions - Find the maximum PFN registered |
2901 | * | 2901 | * |
2902 | * It returns the maximum PFN based on information provided via | 2902 | * It returns the maximum PFN based on information provided via |
2903 | * add_active_range(). | 2903 | * add_active_range(). |
2904 | */ | 2904 | */ |
2905 | unsigned long __init find_max_pfn_with_active_regions(void) | 2905 | unsigned long __init find_max_pfn_with_active_regions(void) |
2906 | { | 2906 | { |
2907 | int i; | 2907 | int i; |
2908 | unsigned long max_pfn = 0; | 2908 | unsigned long max_pfn = 0; |
2909 | 2909 | ||
2910 | for (i = 0; i < nr_nodemap_entries; i++) | 2910 | for (i = 0; i < nr_nodemap_entries; i++) |
2911 | max_pfn = max(max_pfn, early_node_map[i].end_pfn); | 2911 | max_pfn = max(max_pfn, early_node_map[i].end_pfn); |
2912 | 2912 | ||
2913 | return max_pfn; | 2913 | return max_pfn; |
2914 | } | 2914 | } |
2915 | 2915 | ||
2916 | /** | 2916 | /** |
2917 | * free_area_init_nodes - Initialise all pg_data_t and zone data | 2917 | * free_area_init_nodes - Initialise all pg_data_t and zone data |
2918 | * @max_zone_pfn: an array of max PFNs for each zone | 2918 | * @max_zone_pfn: an array of max PFNs for each zone |
2919 | * | 2919 | * |
2920 | * This will call free_area_init_node() for each active node in the system. | 2920 | * This will call free_area_init_node() for each active node in the system. |
2921 | * Using the page ranges provided by add_active_range(), the size of each | 2921 | * Using the page ranges provided by add_active_range(), the size of each |
2922 | * zone in each node and their holes are calculated. If the maximum PFN | 2922 | * zone in each node and their holes are calculated. If the maximum PFN |
2923 | * between two adjacent zones match, it is assumed that the zone is empty. | 2923 | * between two adjacent zones match, it is assumed that the zone is empty. |
2924 | * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed | 2924 | * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed |
2925 | * that arch_max_dma32_pfn has no pages. It is also assumed that a zone | 2925 | * that arch_max_dma32_pfn has no pages. It is also assumed that a zone |
2926 | * starts where the previous one ended. For example, ZONE_DMA32 starts | 2926 | * starts where the previous one ended. For example, ZONE_DMA32 starts |
2927 | * at arch_max_dma_pfn. | 2927 | * at arch_max_dma_pfn. |
2928 | */ | 2928 | */ |
2929 | void __init free_area_init_nodes(unsigned long *max_zone_pfn) | 2929 | void __init free_area_init_nodes(unsigned long *max_zone_pfn) |
2930 | { | 2930 | { |
2931 | unsigned long nid; | 2931 | unsigned long nid; |
2932 | enum zone_type i; | 2932 | enum zone_type i; |
2933 | 2933 | ||
2934 | /* Sort early_node_map as initialisation assumes it is sorted */ | 2934 | /* Sort early_node_map as initialisation assumes it is sorted */ |
2935 | sort_node_map(); | 2935 | sort_node_map(); |
2936 | 2936 | ||
2937 | /* Record where the zone boundaries are */ | 2937 | /* Record where the zone boundaries are */ |
2938 | memset(arch_zone_lowest_possible_pfn, 0, | 2938 | memset(arch_zone_lowest_possible_pfn, 0, |
2939 | sizeof(arch_zone_lowest_possible_pfn)); | 2939 | sizeof(arch_zone_lowest_possible_pfn)); |
2940 | memset(arch_zone_highest_possible_pfn, 0, | 2940 | memset(arch_zone_highest_possible_pfn, 0, |
2941 | sizeof(arch_zone_highest_possible_pfn)); | 2941 | sizeof(arch_zone_highest_possible_pfn)); |
2942 | arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); | 2942 | arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); |
2943 | arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; | 2943 | arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; |
2944 | for (i = 1; i < MAX_NR_ZONES; i++) { | 2944 | for (i = 1; i < MAX_NR_ZONES; i++) { |
2945 | arch_zone_lowest_possible_pfn[i] = | 2945 | arch_zone_lowest_possible_pfn[i] = |
2946 | arch_zone_highest_possible_pfn[i-1]; | 2946 | arch_zone_highest_possible_pfn[i-1]; |
2947 | arch_zone_highest_possible_pfn[i] = | 2947 | arch_zone_highest_possible_pfn[i] = |
2948 | max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); | 2948 | max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); |
2949 | } | 2949 | } |
2950 | 2950 | ||
2951 | /* Print out the zone ranges */ | 2951 | /* Print out the zone ranges */ |
2952 | printk("Zone PFN ranges:\n"); | 2952 | printk("Zone PFN ranges:\n"); |
2953 | for (i = 0; i < MAX_NR_ZONES; i++) | 2953 | for (i = 0; i < MAX_NR_ZONES; i++) |
2954 | printk(" %-8s %8lu -> %8lu\n", | 2954 | printk(" %-8s %8lu -> %8lu\n", |
2955 | zone_names[i], | 2955 | zone_names[i], |
2956 | arch_zone_lowest_possible_pfn[i], | 2956 | arch_zone_lowest_possible_pfn[i], |
2957 | arch_zone_highest_possible_pfn[i]); | 2957 | arch_zone_highest_possible_pfn[i]); |
2958 | 2958 | ||
2959 | /* Print out the early_node_map[] */ | 2959 | /* Print out the early_node_map[] */ |
2960 | printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); | 2960 | printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); |
2961 | for (i = 0; i < nr_nodemap_entries; i++) | 2961 | for (i = 0; i < nr_nodemap_entries; i++) |
2962 | printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid, | 2962 | printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid, |
2963 | early_node_map[i].start_pfn, | 2963 | early_node_map[i].start_pfn, |
2964 | early_node_map[i].end_pfn); | 2964 | early_node_map[i].end_pfn); |
2965 | 2965 | ||
2966 | /* Initialise every node */ | 2966 | /* Initialise every node */ |
2967 | setup_nr_node_ids(); | 2967 | setup_nr_node_ids(); |
2968 | for_each_online_node(nid) { | 2968 | for_each_online_node(nid) { |
2969 | pg_data_t *pgdat = NODE_DATA(nid); | 2969 | pg_data_t *pgdat = NODE_DATA(nid); |
2970 | free_area_init_node(nid, pgdat, NULL, | 2970 | free_area_init_node(nid, pgdat, NULL, |
2971 | find_min_pfn_for_node(nid), NULL); | 2971 | find_min_pfn_for_node(nid), NULL); |
2972 | } | 2972 | } |
2973 | } | 2973 | } |
2974 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | 2974 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ |
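
The zone boundary derivation at the top of free_area_init_nodes() can be worked through on its own: each zone starts where the previous one ended, and an "empty" zone (same max PFN as its predecessor) collapses to a zero-width range. A sketch with a hypothetical three-zone layout and made-up max_zone_pfn values:

#include <stdio.h>

#define NR_ZONES 3

int main(void)
{
    /* Pretend zones DMA, DMA32, NORMAL, with DMA32 empty (same max as DMA). */
    unsigned long max_zone_pfn[NR_ZONES] = { 4096, 4096, 262144 };
    unsigned long lowest[NR_ZONES], highest[NR_ZONES];
    int i;

    lowest[0] = 16;                  /* find_min_pfn_with_active_regions() */
    highest[0] = max_zone_pfn[0];
    for (i = 1; i < NR_ZONES; i++) {
        lowest[i] = highest[i - 1];
        highest[i] = max_zone_pfn[i] > lowest[i] ? max_zone_pfn[i] : lowest[i];
    }

    for (i = 0; i < NR_ZONES; i++)
        printf("zone %d: %8lu -> %8lu\n", i, lowest[i], highest[i]);
    /* Zone 1 prints 4096 -> 4096, i.e. it spans no pages. */
    return 0;
}
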
2975 | 2975 | ||
2976 | /** | 2976 | /** |
2977 | * set_dma_reserve - set the specified number of pages reserved in the first zone | 2977 | * set_dma_reserve - set the specified number of pages reserved in the first zone |
2978 | * @new_dma_reserve: The number of pages to mark reserved | 2978 | * @new_dma_reserve: The number of pages to mark reserved |
2979 | * | 2979 | * |
2980 | * The per-cpu batchsize and zone watermarks are determined by present_pages. | 2980 | * The per-cpu batchsize and zone watermarks are determined by present_pages. |
2981 | * In the DMA zone, a significant percentage may be consumed by kernel image | 2981 | * In the DMA zone, a significant percentage may be consumed by kernel image |
2982 | * and other unfreeable allocations which can skew the watermarks badly. This | 2982 | * and other unfreeable allocations which can skew the watermarks badly. This |
2983 | * function may optionally be used to account for unfreeable pages in the | 2983 | * function may optionally be used to account for unfreeable pages in the |
2984 | * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and | 2984 | * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and |
2985 | * smaller per-cpu batchsize. | 2985 | * smaller per-cpu batchsize. |
2986 | */ | 2986 | */ |
2987 | void __init set_dma_reserve(unsigned long new_dma_reserve) | 2987 | void __init set_dma_reserve(unsigned long new_dma_reserve) |
2988 | { | 2988 | { |
2989 | dma_reserve = new_dma_reserve; | 2989 | dma_reserve = new_dma_reserve; |
2990 | } | 2990 | } |
2991 | 2991 | ||
2992 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 2992 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
2993 | static bootmem_data_t contig_bootmem_data; | 2993 | static bootmem_data_t contig_bootmem_data; |
2994 | struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; | 2994 | struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; |
2995 | 2995 | ||
2996 | EXPORT_SYMBOL(contig_page_data); | 2996 | EXPORT_SYMBOL(contig_page_data); |
2997 | #endif | 2997 | #endif |
2998 | 2998 | ||
2999 | void __init free_area_init(unsigned long *zones_size) | 2999 | void __init free_area_init(unsigned long *zones_size) |
3000 | { | 3000 | { |
3001 | free_area_init_node(0, NODE_DATA(0), zones_size, | 3001 | free_area_init_node(0, NODE_DATA(0), zones_size, |
3002 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); | 3002 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); |
3003 | } | 3003 | } |
3004 | 3004 | ||
3005 | static int page_alloc_cpu_notify(struct notifier_block *self, | 3005 | static int page_alloc_cpu_notify(struct notifier_block *self, |
3006 | unsigned long action, void *hcpu) | 3006 | unsigned long action, void *hcpu) |
3007 | { | 3007 | { |
3008 | int cpu = (unsigned long)hcpu; | 3008 | int cpu = (unsigned long)hcpu; |
3009 | 3009 | ||
3010 | if (action == CPU_DEAD) { | 3010 | if (action == CPU_DEAD) { |
3011 | local_irq_disable(); | 3011 | local_irq_disable(); |
3012 | __drain_pages(cpu); | 3012 | __drain_pages(cpu); |
3013 | vm_events_fold_cpu(cpu); | 3013 | vm_events_fold_cpu(cpu); |
3014 | local_irq_enable(); | 3014 | local_irq_enable(); |
3015 | refresh_cpu_vm_stats(cpu); | 3015 | refresh_cpu_vm_stats(cpu); |
3016 | } | 3016 | } |
3017 | return NOTIFY_OK; | 3017 | return NOTIFY_OK; |
3018 | } | 3018 | } |
3019 | 3019 | ||
3020 | void __init page_alloc_init(void) | 3020 | void __init page_alloc_init(void) |
3021 | { | 3021 | { |
3022 | hotcpu_notifier(page_alloc_cpu_notify, 0); | 3022 | hotcpu_notifier(page_alloc_cpu_notify, 0); |
3023 | } | 3023 | } |
3024 | 3024 | ||
3025 | /* | 3025 | /* |
3026 | * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio | 3026 | * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio |
3027 | * or min_free_kbytes changes. | 3027 | * or min_free_kbytes changes. |
3028 | */ | 3028 | */ |
3029 | static void calculate_totalreserve_pages(void) | 3029 | static void calculate_totalreserve_pages(void) |
3030 | { | 3030 | { |
3031 | struct pglist_data *pgdat; | 3031 | struct pglist_data *pgdat; |
3032 | unsigned long reserve_pages = 0; | 3032 | unsigned long reserve_pages = 0; |
3033 | enum zone_type i, j; | 3033 | enum zone_type i, j; |
3034 | 3034 | ||
3035 | for_each_online_pgdat(pgdat) { | 3035 | for_each_online_pgdat(pgdat) { |
3036 | for (i = 0; i < MAX_NR_ZONES; i++) { | 3036 | for (i = 0; i < MAX_NR_ZONES; i++) { |
3037 | struct zone *zone = pgdat->node_zones + i; | 3037 | struct zone *zone = pgdat->node_zones + i; |
3038 | unsigned long max = 0; | 3038 | unsigned long max = 0; |
3039 | 3039 | ||
3040 | /* Find valid and maximum lowmem_reserve in the zone */ | 3040 | /* Find valid and maximum lowmem_reserve in the zone */ |
3041 | for (j = i; j < MAX_NR_ZONES; j++) { | 3041 | for (j = i; j < MAX_NR_ZONES; j++) { |
3042 | if (zone->lowmem_reserve[j] > max) | 3042 | if (zone->lowmem_reserve[j] > max) |
3043 | max = zone->lowmem_reserve[j]; | 3043 | max = zone->lowmem_reserve[j]; |
3044 | } | 3044 | } |
3045 | 3045 | ||
3046 | /* we treat pages_high as reserved pages. */ | 3046 | /* we treat pages_high as reserved pages. */ |
3047 | max += zone->pages_high; | 3047 | max += zone->pages_high; |
3048 | 3048 | ||
3049 | if (max > zone->present_pages) | 3049 | if (max > zone->present_pages) |
3050 | max = zone->present_pages; | 3050 | max = zone->present_pages; |
3051 | reserve_pages += max; | 3051 | reserve_pages += max; |
3052 | } | 3052 | } |
3053 | } | 3053 | } |
3054 | totalreserve_pages = reserve_pages; | 3054 | totalreserve_pages = reserve_pages; |
3055 | } | 3055 | } |
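
Per zone, calculate_totalreserve_pages() contributes the largest lowmem_reserve entry plus pages_high, capped at present_pages. A tiny worked sketch with made-up numbers:

#include <stdio.h>

int main(void)
{
    unsigned long lowmem_reserve[] = { 0, 784, 12544 };  /* assumed values */
    unsigned long pages_high = 348, present_pages = 4016;
    unsigned long max = 0, j;

    for (j = 0; j < 3; j++)
        if (lowmem_reserve[j] > max)
            max = lowmem_reserve[j];

    max += pages_high;                   /* pages_high counts as reserved */
    if (max > present_pages)
        max = present_pages;             /* cannot reserve more than exists */

    /* Prints 4016: the zone contributes all of its pages as reserved. */
    printf("reserve contribution: %lu pages\n", max);
    return 0;
}
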
3056 | 3056 | ||
3057 | /* | 3057 | /* |
3058 | * setup_per_zone_lowmem_reserve - called whenever | 3058 | * setup_per_zone_lowmem_reserve - called whenever |
3059 | * sysctl_lowmem_reserve_ratio changes. Ensures that each zone | 3059 | * sysctl_lowmem_reserve_ratio changes. Ensures that each zone |
3060 | * has a correct pages reserved value, so an adequate number of | 3060 | * has a correct pages reserved value, so an adequate number of |
3061 | * pages are left in the zone after a successful __alloc_pages(). | 3061 | * pages are left in the zone after a successful __alloc_pages(). |
3062 | */ | 3062 | */ |
3063 | static void setup_per_zone_lowmem_reserve(void) | 3063 | static void setup_per_zone_lowmem_reserve(void) |
3064 | { | 3064 | { |
3065 | struct pglist_data *pgdat; | 3065 | struct pglist_data *pgdat; |
3066 | enum zone_type j, idx; | 3066 | enum zone_type j, idx; |
3067 | 3067 | ||
3068 | for_each_online_pgdat(pgdat) { | 3068 | for_each_online_pgdat(pgdat) { |
3069 | for (j = 0; j < MAX_NR_ZONES; j++) { | 3069 | for (j = 0; j < MAX_NR_ZONES; j++) { |
3070 | struct zone *zone = pgdat->node_zones + j; | 3070 | struct zone *zone = pgdat->node_zones + j; |
3071 | unsigned long present_pages = zone->present_pages; | 3071 | unsigned long present_pages = zone->present_pages; |
3072 | 3072 | ||
3073 | zone->lowmem_reserve[j] = 0; | 3073 | zone->lowmem_reserve[j] = 0; |
3074 | 3074 | ||
3075 | idx = j; | 3075 | idx = j; |
3076 | while (idx) { | 3076 | while (idx) { |
3077 | struct zone *lower_zone; | 3077 | struct zone *lower_zone; |
3078 | 3078 | ||
3079 | idx--; | 3079 | idx--; |
3080 | 3080 | ||
3081 | if (sysctl_lowmem_reserve_ratio[idx] < 1) | 3081 | if (sysctl_lowmem_reserve_ratio[idx] < 1) |
3082 | sysctl_lowmem_reserve_ratio[idx] = 1; | 3082 | sysctl_lowmem_reserve_ratio[idx] = 1; |
3083 | 3083 | ||
3084 | lower_zone = pgdat->node_zones + idx; | 3084 | lower_zone = pgdat->node_zones + idx; |
3085 | lower_zone->lowmem_reserve[j] = present_pages / | 3085 | lower_zone->lowmem_reserve[j] = present_pages / |
3086 | sysctl_lowmem_reserve_ratio[idx]; | 3086 | sysctl_lowmem_reserve_ratio[idx]; |
3087 | present_pages += lower_zone->present_pages; | 3087 | present_pages += lower_zone->present_pages; |
3088 | } | 3088 | } |
3089 | } | 3089 | } |
3090 | } | 3090 | } |
3091 | 3091 | ||
3092 | /* update totalreserve_pages */ | 3092 | /* update totalreserve_pages */ |
3093 | calculate_totalreserve_pages(); | 3093 | calculate_totalreserve_pages(); |
3094 | } | 3094 | } |
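
setup_per_zone_lowmem_reserve() walks downward from each zone, dividing the accumulated size of the higher zones by the lower zone's ratio. A userspace sketch for an assumed DMA/NORMAL/HIGHMEM layout with assumed ratios (the real defaults depend on the kernel configuration):

#include <stdio.h>

#define NR_ZONES 3

int main(void)
{
    /* present_pages for DMA, NORMAL, HIGHMEM (made-up sizes) */
    unsigned long present[NR_ZONES] = { 4096, 225280, 32768 };
    unsigned long ratio[NR_ZONES] = { 256, 32, 32 };   /* assumed ratios */
    unsigned long reserve[NR_ZONES][NR_ZONES] = { { 0 } };
    int j, idx;

    for (j = 0; j < NR_ZONES; j++) {
        unsigned long pages = present[j];

        for (idx = j - 1; idx >= 0; idx--) {
            reserve[idx][j] = pages / ratio[idx];   /* lower zone's reserve */
            pages += present[idx];                  /* accumulate downward */
        }
    }

    /* Prints 1008: DMA keeps that many pages free before it will
     * satisfy an allocation that could have used HIGHMEM. */
    printf("DMA reserve against HIGHMEM: %lu\n", reserve[0][2]);
    return 0;
}
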
3095 | 3095 | ||
3096 | /** | 3096 | /** |
3097 | * setup_per_zone_pages_min - called when min_free_kbytes changes. | 3097 | * setup_per_zone_pages_min - called when min_free_kbytes changes. |
3098 | * | 3098 | * |
3099 | * Ensures that the pages_{min,low,high} values for each zone are set correctly | 3099 | * Ensures that the pages_{min,low,high} values for each zone are set correctly |
3100 | * with respect to min_free_kbytes. | 3100 | * with respect to min_free_kbytes. |
3101 | */ | 3101 | */ |
3102 | void setup_per_zone_pages_min(void) | 3102 | void setup_per_zone_pages_min(void) |
3103 | { | 3103 | { |
3104 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); | 3104 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); |
3105 | unsigned long lowmem_pages = 0; | 3105 | unsigned long lowmem_pages = 0; |
3106 | struct zone *zone; | 3106 | struct zone *zone; |
3107 | unsigned long flags; | 3107 | unsigned long flags; |
3108 | 3108 | ||
3109 | /* Calculate total number of !ZONE_HIGHMEM pages */ | 3109 | /* Calculate total number of !ZONE_HIGHMEM pages */ |
3110 | for_each_zone(zone) { | 3110 | for_each_zone(zone) { |
3111 | if (!is_highmem(zone)) | 3111 | if (!is_highmem(zone)) |
3112 | lowmem_pages += zone->present_pages; | 3112 | lowmem_pages += zone->present_pages; |
3113 | } | 3113 | } |
3114 | 3114 | ||
3115 | for_each_zone(zone) { | 3115 | for_each_zone(zone) { |
3116 | u64 tmp; | 3116 | u64 tmp; |
3117 | 3117 | ||
3118 | spin_lock_irqsave(&zone->lru_lock, flags); | 3118 | spin_lock_irqsave(&zone->lru_lock, flags); |
3119 | tmp = (u64)pages_min * zone->present_pages; | 3119 | tmp = (u64)pages_min * zone->present_pages; |
3120 | do_div(tmp, lowmem_pages); | 3120 | do_div(tmp, lowmem_pages); |
3121 | if (is_highmem(zone)) { | 3121 | if (is_highmem(zone)) { |
3122 | /* | 3122 | /* |
3123 | * __GFP_HIGH and PF_MEMALLOC allocations usually don't | 3123 | * __GFP_HIGH and PF_MEMALLOC allocations usually don't |
3124 | * need highmem pages, so cap pages_min to a small | 3124 | * need highmem pages, so cap pages_min to a small |
3125 | * value here. | 3125 | * value here. |
3126 | * | 3126 | * |
3127 | * The (pages_high-pages_low) and (pages_low-pages_min) | 3127 | * The (pages_high-pages_low) and (pages_low-pages_min) |
3128 | * deltas control asynch page reclaim, and so should | 3128 | * deltas control asynch page reclaim, and so should |
3129 | * not be capped for highmem. | 3129 | * not be capped for highmem. |
3130 | */ | 3130 | */ |
3131 | int min_pages; | 3131 | int min_pages; |
3132 | 3132 | ||
3133 | min_pages = zone->present_pages / 1024; | 3133 | min_pages = zone->present_pages / 1024; |
3134 | if (min_pages < SWAP_CLUSTER_MAX) | 3134 | if (min_pages < SWAP_CLUSTER_MAX) |
3135 | min_pages = SWAP_CLUSTER_MAX; | 3135 | min_pages = SWAP_CLUSTER_MAX; |
3136 | if (min_pages > 128) | 3136 | if (min_pages > 128) |
3137 | min_pages = 128; | 3137 | min_pages = 128; |
3138 | zone->pages_min = min_pages; | 3138 | zone->pages_min = min_pages; |
3139 | } else { | 3139 | } else { |
3140 | /* | 3140 | /* |
3141 | * If it's a lowmem zone, reserve a number of pages | 3141 | * If it's a lowmem zone, reserve a number of pages |
3142 | * proportionate to the zone's size. | 3142 | * proportionate to the zone's size. |
3143 | */ | 3143 | */ |
3144 | zone->pages_min = tmp; | 3144 | zone->pages_min = tmp; |
3145 | } | 3145 | } |
3146 | 3146 | ||
3147 | zone->pages_low = zone->pages_min + (tmp >> 2); | 3147 | zone->pages_low = zone->pages_min + (tmp >> 2); |
3148 | zone->pages_high = zone->pages_min + (tmp >> 1); | 3148 | zone->pages_high = zone->pages_min + (tmp >> 1); |
3149 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 3149 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
3150 | } | 3150 | } |
3151 | 3151 | ||
3152 | /* update totalreserve_pages */ | 3152 | /* update totalreserve_pages */ |
3153 | calculate_totalreserve_pages(); | 3153 | calculate_totalreserve_pages(); |
3154 | } | 3154 | } |
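
The watermark arithmetic in setup_per_zone_pages_min() is easy to check by hand: a lowmem zone gets its proportional share of pages_min, and pages_low/pages_high sit a quarter and a half of that share above it. A sketch assuming a single 1 GiB lowmem zone with 4 KiB pages and the min_free_kbytes of 4096 such a machine ends up with:

#include <stdio.h>

#define PAGE_SHIFT 12                      /* assumed: 4 KiB pages */

int main(void)
{
    unsigned long min_free_kbytes = 4096;  /* assumed, ~1 GiB of lowmem */
    unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
    unsigned long lowmem_pages = 262144;   /* total !HIGHMEM pages */
    unsigned long present = 262144;        /* the single lowmem zone */

    /* tmp is this zone's proportional share of pages_min */
    unsigned long long tmp = (unsigned long long)pages_min * present;
    tmp /= lowmem_pages;

    unsigned long zmin = tmp;
    unsigned long zlow = zmin + (tmp >> 2);
    unsigned long zhigh = zmin + (tmp >> 1);

    /* Prints 1024 / 1280 / 1536 pages. */
    printf("pages_min=%lu pages_low=%lu pages_high=%lu\n",
           zmin, zlow, zhigh);
    return 0;
}
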
3155 | 3155 | ||
3156 | /* | 3156 | /* |
3157 | * Initialise min_free_kbytes. | 3157 | * Initialise min_free_kbytes. |
3158 | * | 3158 | * |
3159 | * For small machines we want it small (128k min). For large machines | 3159 | * For small machines we want it small (128k min). For large machines |
3160 | * we want it large (64MB max). But it is not linear, because network | 3160 | * we want it large (64MB max). But it is not linear, because network |
3161 | * bandwidth does not increase linearly with machine size. We use | 3161 | * bandwidth does not increase linearly with machine size. We use |
3162 | * | 3162 | * |
3163 | * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: | 3163 | * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: |
3164 | * min_free_kbytes = sqrt(lowmem_kbytes * 16) | 3164 | * min_free_kbytes = sqrt(lowmem_kbytes * 16) |
3165 | * | 3165 | * |
3166 | * which yields | 3166 | * which yields |
3167 | * | 3167 | * |
3168 | * 16MB: 512k | 3168 | * 16MB: 512k |
3169 | * 32MB: 724k | 3169 | * 32MB: 724k |
3170 | * 64MB: 1024k | 3170 | * 64MB: 1024k |
3171 | * 128MB: 1448k | 3171 | * 128MB: 1448k |
3172 | * 256MB: 2048k | 3172 | * 256MB: 2048k |
3173 | * 512MB: 2896k | 3173 | * 512MB: 2896k |
3174 | * 1024MB: 4096k | 3174 | * 1024MB: 4096k |
3175 | * 2048MB: 5792k | 3175 | * 2048MB: 5792k |
3176 | * 4096MB: 8192k | 3176 | * 4096MB: 8192k |
3177 | * 8192MB: 11584k | 3177 | * 8192MB: 11584k |
3178 | * 16384MB: 16384k | 3178 | * 16384MB: 16384k |
3179 | */ | 3179 | */ |
3180 | static int __init init_per_zone_pages_min(void) | 3180 | static int __init init_per_zone_pages_min(void) |
3181 | { | 3181 | { |
3182 | unsigned long lowmem_kbytes; | 3182 | unsigned long lowmem_kbytes; |
3183 | 3183 | ||
3184 | lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); | 3184 | lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); |
3185 | 3185 | ||
3186 | min_free_kbytes = int_sqrt(lowmem_kbytes * 16); | 3186 | min_free_kbytes = int_sqrt(lowmem_kbytes * 16); |
3187 | if (min_free_kbytes < 128) | 3187 | if (min_free_kbytes < 128) |
3188 | min_free_kbytes = 128; | 3188 | min_free_kbytes = 128; |
3189 | if (min_free_kbytes > 65536) | 3189 | if (min_free_kbytes > 65536) |
3190 | min_free_kbytes = 65536; | 3190 | min_free_kbytes = 65536; |
3191 | setup_per_zone_pages_min(); | 3191 | setup_per_zone_pages_min(); |
3192 | setup_per_zone_lowmem_reserve(); | 3192 | setup_per_zone_lowmem_reserve(); |
3193 | return 0; | 3193 | return 0; |
3194 | } | 3194 | } |
3195 | module_init(init_per_zone_pages_min) | 3195 | module_init(init_per_zone_pages_min) |
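
The table in the comment above falls straight out of min_free_kbytes = sqrt(16 * lowmem_kbytes) with the 128k floor and 64 MB ceiling. A quick sketch that reproduces a few rows (int_sqrt() is kernel-only, so a naive integer square root is used here):

#include <stdio.h>

static unsigned long isqrt(unsigned long long x)
{
    unsigned long r = 0;

    while ((unsigned long long)(r + 1) * (r + 1) <= x)
        r++;
    return r;
}

int main(void)
{
    unsigned long lowmem_mb[] = { 16, 128, 1024, 16384 };
    int i;

    for (i = 0; i < 4; i++) {
        unsigned long lowmem_kbytes = lowmem_mb[i] * 1024;
        unsigned long min_free_kbytes = isqrt(lowmem_kbytes * 16ULL);

        if (min_free_kbytes < 128)
            min_free_kbytes = 128;
        if (min_free_kbytes > 65536)
            min_free_kbytes = 65536;
        /* Prints 512k, 1448k, 4096k, 16384k -- matching the table. */
        printf("%6luMB -> %luk\n", lowmem_mb[i], min_free_kbytes);
    }
    return 0;
}
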
3196 | 3196 | ||
3197 | /* | 3197 | /* |
3198 | * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so | 3198 | * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so |
3199 | * that we can call two helper functions whenever min_free_kbytes | 3199 | * that we can call two helper functions whenever min_free_kbytes |
3200 | * changes. | 3200 | * changes. |
3201 | */ | 3201 | */ |
3202 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, | 3202 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, |
3203 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 3203 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) |
3204 | { | 3204 | { |
3205 | proc_dointvec(table, write, file, buffer, length, ppos); | 3205 | proc_dointvec(table, write, file, buffer, length, ppos); |
3206 | setup_per_zone_pages_min(); | 3206 | setup_per_zone_pages_min(); |
3207 | return 0; | 3207 | return 0; |
3208 | } | 3208 | } |
3209 | 3209 | ||
3210 | #ifdef CONFIG_NUMA | 3210 | #ifdef CONFIG_NUMA |
3211 | int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, | 3211 | int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, |
3212 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 3212 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) |
3213 | { | 3213 | { |
3214 | struct zone *zone; | 3214 | struct zone *zone; |
3215 | int rc; | 3215 | int rc; |
3216 | 3216 | ||
3217 | rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); | 3217 | rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); |
3218 | if (rc) | 3218 | if (rc) |
3219 | return rc; | 3219 | return rc; |
3220 | 3220 | ||
3221 | for_each_zone(zone) | 3221 | for_each_zone(zone) |
3222 | zone->min_unmapped_pages = (zone->present_pages * | 3222 | zone->min_unmapped_pages = (zone->present_pages * |
3223 | sysctl_min_unmapped_ratio) / 100; | 3223 | sysctl_min_unmapped_ratio) / 100; |
3224 | return 0; | 3224 | return 0; |
3225 | } | 3225 | } |
3226 | 3226 | ||
3227 | int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, | 3227 | int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, |
3228 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 3228 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) |
3229 | { | 3229 | { |
3230 | struct zone *zone; | 3230 | struct zone *zone; |
3231 | int rc; | 3231 | int rc; |
3232 | 3232 | ||
3233 | rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); | 3233 | rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); |
3234 | if (rc) | 3234 | if (rc) |
3235 | return rc; | 3235 | return rc; |
3236 | 3236 | ||
3237 | for_each_zone(zone) | 3237 | for_each_zone(zone) |
3238 | zone->min_slab_pages = (zone->present_pages * | 3238 | zone->min_slab_pages = (zone->present_pages * |
3239 | sysctl_min_slab_ratio) / 100; | 3239 | sysctl_min_slab_ratio) / 100; |
3240 | return 0; | 3240 | return 0; |
3241 | } | 3241 | } |
3242 | #endif | 3242 | #endif |
3243 | 3243 | ||
3244 | /* | 3244 | /* |
3245 | * lowmem_reserve_ratio_sysctl_handler - just a wrapper around | 3245 | * lowmem_reserve_ratio_sysctl_handler - just a wrapper around |
3246 | * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() | 3246 | * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() |
3247 | * whenever sysctl_lowmem_reserve_ratio changes. | 3247 | * whenever sysctl_lowmem_reserve_ratio changes. |
3248 | * | 3248 | * |
3249 | * The reserve ratio has no relation to the pages_min watermarks. | 3249 | * The reserve ratio has no relation to the pages_min watermarks. |
3250 | * The lowmem reserve ratio only makes sense in relation to the | 3250 | * The lowmem reserve ratio only makes sense in relation to the |
3251 | * boot-time zone sizes. | 3251 | * boot-time zone sizes. |
3252 | */ | 3252 | */ |
3253 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, | 3253 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, |
3254 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 3254 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) |
3255 | { | 3255 | { |
3256 | proc_dointvec_minmax(table, write, file, buffer, length, ppos); | 3256 | proc_dointvec_minmax(table, write, file, buffer, length, ppos); |
3257 | setup_per_zone_lowmem_reserve(); | 3257 | setup_per_zone_lowmem_reserve(); |
3258 | return 0; | 3258 | return 0; |
3259 | } | 3259 | } |
3260 | 3260 | ||
3261 | /* | 3261 | /* |
3262 | * percpu_pagelist_fraction - changes the pcp->high for each zone on each | 3262 | * percpu_pagelist_fraction - changes the pcp->high for each zone on each |
3263 | * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist | 3263 | * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist |
3264 | * can have before it gets flushed back to the buddy allocator. | 3264 | * can have before it gets flushed back to the buddy allocator. |
3265 | */ | 3265 | */ |
3266 | 3266 | ||
3267 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | 3267 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, |
3268 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 3268 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) |
3269 | { | 3269 | { |
3270 | struct zone *zone; | 3270 | struct zone *zone; |
3271 | unsigned int cpu; | 3271 | unsigned int cpu; |
3272 | int ret; | 3272 | int ret; |
3273 | 3273 | ||
3274 | ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); | 3274 | ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); |
3275 | if (!write || (ret == -EINVAL)) | 3275 | if (!write || (ret == -EINVAL)) |
3276 | return ret; | 3276 | return ret; |
3277 | for_each_zone(zone) { | 3277 | for_each_zone(zone) { |
3278 | for_each_online_cpu(cpu) { | 3278 | for_each_online_cpu(cpu) { |
3279 | unsigned long high; | 3279 | unsigned long high; |
3280 | high = zone->present_pages / percpu_pagelist_fraction; | 3280 | high = zone->present_pages / percpu_pagelist_fraction; |
3281 | setup_pagelist_highmark(zone_pcp(zone, cpu), high); | 3281 | setup_pagelist_highmark(zone_pcp(zone, cpu), high); |
3282 | } | 3282 | } |
3283 | } | 3283 | } |
3284 | return 0; | 3284 | return 0; |
3285 | } | 3285 | } |
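
The handler above only recomputes pcp->high as present_pages / percpu_pagelist_fraction for every zone and online CPU; the batch handling lives in setup_pagelist_highmark() and is not shown here. A one-figure sketch with assumed values:

#include <stdio.h>

int main(void)
{
    unsigned long present_pages = 225280;          /* assumed zone size */
    unsigned long percpu_pagelist_fraction = 8;    /* assumed sysctl value */
    unsigned long high = present_pages / percpu_pagelist_fraction;

    /* Prints 28160: each CPU's hot pagelist may hold that many pages of
     * this zone before pages are returned to the buddy allocator. */
    printf("pcp high watermark: %lu pages\n", high);
    return 0;
}
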
3286 | 3286 | ||
3287 | int hashdist = HASHDIST_DEFAULT; | 3287 | int hashdist = HASHDIST_DEFAULT; |
3288 | 3288 | ||
3289 | #ifdef CONFIG_NUMA | 3289 | #ifdef CONFIG_NUMA |
3290 | static int __init set_hashdist(char *str) | 3290 | static int __init set_hashdist(char *str) |
3291 | { | 3291 | { |
3292 | if (!str) | 3292 | if (!str) |
3293 | return 0; | 3293 | return 0; |
3294 | hashdist = simple_strtoul(str, &str, 0); | 3294 | hashdist = simple_strtoul(str, &str, 0); |
3295 | return 1; | 3295 | return 1; |
3296 | } | 3296 | } |
3297 | __setup("hashdist=", set_hashdist); | 3297 | __setup("hashdist=", set_hashdist); |
3298 | #endif | 3298 | #endif |
3299 | 3299 | ||
3300 | /* | 3300 | /* |
3301 | * allocate a large system hash table from bootmem | 3301 | * allocate a large system hash table from bootmem |
3302 | * - it is assumed that the hash table must contain an exact power-of-2 | 3302 | * - it is assumed that the hash table must contain an exact power-of-2 |
3303 | * quantity of entries | 3303 | * quantity of entries |
3304 | * - limit is the number of hash buckets, not the total allocation size | 3304 | * - limit is the number of hash buckets, not the total allocation size |
3305 | */ | 3305 | */ |
3306 | void *__init alloc_large_system_hash(const char *tablename, | 3306 | void *__init alloc_large_system_hash(const char *tablename, |
3307 | unsigned long bucketsize, | 3307 | unsigned long bucketsize, |
3308 | unsigned long numentries, | 3308 | unsigned long numentries, |
3309 | int scale, | 3309 | int scale, |
3310 | int flags, | 3310 | int flags, |
3311 | unsigned int *_hash_shift, | 3311 | unsigned int *_hash_shift, |
3312 | unsigned int *_hash_mask, | 3312 | unsigned int *_hash_mask, |
3313 | unsigned long limit) | 3313 | unsigned long limit) |
3314 | { | 3314 | { |
3315 | unsigned long long max = limit; | 3315 | unsigned long long max = limit; |
3316 | unsigned long log2qty, size; | 3316 | unsigned long log2qty, size; |
3317 | void *table = NULL; | 3317 | void *table = NULL; |
3318 | 3318 | ||
3319 | /* allow the kernel cmdline to have a say */ | 3319 | /* allow the kernel cmdline to have a say */ |
3320 | if (!numentries) { | 3320 | if (!numentries) { |
3321 | /* round applicable memory size up to nearest megabyte */ | 3321 | /* round applicable memory size up to nearest megabyte */ |
3322 | numentries = nr_kernel_pages; | 3322 | numentries = nr_kernel_pages; |
3323 | numentries += (1UL << (20 - PAGE_SHIFT)) - 1; | 3323 | numentries += (1UL << (20 - PAGE_SHIFT)) - 1; |
3324 | numentries >>= 20 - PAGE_SHIFT; | 3324 | numentries >>= 20 - PAGE_SHIFT; |
3325 | numentries <<= 20 - PAGE_SHIFT; | 3325 | numentries <<= 20 - PAGE_SHIFT; |
3326 | 3326 | ||
3327 | /* limit to 1 bucket per 2^scale bytes of low memory */ | 3327 | /* limit to 1 bucket per 2^scale bytes of low memory */ |
3328 | if (scale > PAGE_SHIFT) | 3328 | if (scale > PAGE_SHIFT) |
3329 | numentries >>= (scale - PAGE_SHIFT); | 3329 | numentries >>= (scale - PAGE_SHIFT); |
3330 | else | 3330 | else |
3331 | numentries <<= (PAGE_SHIFT - scale); | 3331 | numentries <<= (PAGE_SHIFT - scale); |
3332 | 3332 | ||
3333 | /* Make sure we've got at least a 0-order allocation.. */ | 3333 | /* Make sure we've got at least a 0-order allocation.. */ |
3334 | if (unlikely((numentries * bucketsize) < PAGE_SIZE)) | 3334 | if (unlikely((numentries * bucketsize) < PAGE_SIZE)) |
3335 | numentries = PAGE_SIZE / bucketsize; | 3335 | numentries = PAGE_SIZE / bucketsize; |
3336 | } | 3336 | } |
3337 | numentries = roundup_pow_of_two(numentries); | 3337 | numentries = roundup_pow_of_two(numentries); |
3338 | 3338 | ||
3339 | /* limit allocation size to 1/16 total memory by default */ | 3339 | /* limit allocation size to 1/16 total memory by default */ |
3340 | if (max == 0) { | 3340 | if (max == 0) { |
3341 | max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; | 3341 | max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; |
3342 | do_div(max, bucketsize); | 3342 | do_div(max, bucketsize); |
3343 | } | 3343 | } |
3344 | 3344 | ||
3345 | if (numentries > max) | 3345 | if (numentries > max) |
3346 | numentries = max; | 3346 | numentries = max; |
3347 | 3347 | ||
3348 | log2qty = ilog2(numentries); | 3348 | log2qty = ilog2(numentries); |
3349 | 3349 | ||
3350 | do { | 3350 | do { |
3351 | size = bucketsize << log2qty; | 3351 | size = bucketsize << log2qty; |
3352 | if (flags & HASH_EARLY) | 3352 | if (flags & HASH_EARLY) |
3353 | table = alloc_bootmem(size); | 3353 | table = alloc_bootmem(size); |
3354 | else if (hashdist) | 3354 | else if (hashdist) |
3355 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); | 3355 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); |
3356 | else { | 3356 | else { |
3357 | unsigned long order; | 3357 | unsigned long order; |
3358 | for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) | 3358 | for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) |
3359 | ; | 3359 | ; |
3360 | table = (void*) __get_free_pages(GFP_ATOMIC, order); | 3360 | table = (void*) __get_free_pages(GFP_ATOMIC, order); |
3361 | } | 3361 | } |
3362 | } while (!table && size > PAGE_SIZE && --log2qty); | 3362 | } while (!table && size > PAGE_SIZE && --log2qty); |
3363 | 3363 | ||
3364 | if (!table) | 3364 | if (!table) |
3365 | panic("Failed to allocate %s hash table\n", tablename); | 3365 | panic("Failed to allocate %s hash table\n", tablename); |
3366 | 3366 | ||
3367 | printk("%s hash table entries: %d (order: %d, %lu bytes)\n", | 3367 | printk("%s hash table entries: %d (order: %d, %lu bytes)\n", |
3368 | tablename, | 3368 | tablename, |
3369 | (1U << log2qty), | 3369 | (1U << log2qty), |
3370 | ilog2(size) - PAGE_SHIFT, | 3370 | ilog2(size) - PAGE_SHIFT, |
3371 | size); | 3371 | size); |
3372 | 3372 | ||
3373 | if (_hash_shift) | 3373 | if (_hash_shift) |
3374 | *_hash_shift = log2qty; | 3374 | *_hash_shift = log2qty; |
3375 | if (_hash_mask) | 3375 | if (_hash_mask) |
3376 | *_hash_mask = (1 << log2qty) - 1; | 3376 | *_hash_mask = (1 << log2qty) - 1; |
3377 | 3377 | ||
3378 | return table; | 3378 | return table; |
3379 | } | 3379 | } |
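To make the parameter list above concrete, here is a hedged sketch of how a subsystem might call alloc_large_system_hash() at boot. Only the signature and the HASH_EARLY flag come from the code above; the table name, the init helper and the example_* variables are hypothetical. Passing numentries = 0 lets the table be sized from nr_kernel_pages, scale = 13 asks for roughly one bucket per 8 KiB of low memory, and limit = 0 keeps the default 1/16-of-memory cap; the do/while loop above then halves the table on allocation failure until it fits or shrinks to a single page.

	#include <linux/bootmem.h>
	#include <linux/list.h>

	static struct hlist_head *example_hash;		/* hypothetical table */
	static unsigned int example_shift;
	static unsigned int example_mask;

	static void __init example_hash_init(void)
	{
		example_hash = alloc_large_system_hash("Example-cache",
						       sizeof(struct hlist_head),
						       0,		/* size from memory */
						       13,		/* 1 bucket per 8 KiB */
						       HASH_EARLY,	/* allocate from bootmem */
						       &example_shift,
						       &example_mask,
						       0);		/* default cap */
	}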
3380 | 3380 | ||
3381 | #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE | 3381 | #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE |
3382 | struct page *pfn_to_page(unsigned long pfn) | 3382 | struct page *pfn_to_page(unsigned long pfn) |
3383 | { | 3383 | { |
3384 | return __pfn_to_page(pfn); | 3384 | return __pfn_to_page(pfn); |
3385 | } | 3385 | } |
3386 | unsigned long page_to_pfn(struct page *page) | 3386 | unsigned long page_to_pfn(struct page *page) |
3387 | { | 3387 | { |
3388 | return __page_to_pfn(page); | 3388 | return __page_to_pfn(page); |
3389 | } | 3389 | } |
3390 | EXPORT_SYMBOL(pfn_to_page); | 3390 | EXPORT_SYMBOL(pfn_to_page); |
3391 | EXPORT_SYMBOL(page_to_pfn); | 3391 | EXPORT_SYMBOL(page_to_pfn); |
3392 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ | 3392 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ |
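The two exported functions above are simply out-of-line forms of the __pfn_to_page()/__page_to_pfn() macros, so modules work regardless of the memory model selected at build time. A small hedged usage sketch; check_pfn_round_trip() is a hypothetical helper, not part of this commit:

	#include <linux/mm.h>

	/*
	 * Converting a pfn to its struct page and back is an identity
	 * for any pfn that has a backing struct page.
	 */
	static void check_pfn_round_trip(unsigned long pfn)
	{
		struct page *page = pfn_to_page(pfn);

		BUG_ON(page_to_pfn(page) != pfn);
	}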
3393 | 3393 | ||
3394 | 3394 | ||
3395 | 3395 |