Commit 476f35348eb8d2a827765992899fea78b7dcc46f

Authored by Christoph Lameter
Committed by Linus Torvalds
1 parent aee16b3cee

Safer nr_cpu_ids and nr_node_ids determination and initial values

The nr_cpu_ids value is currently only calculated in smp_init().  However,
it may be needed before that (SLUB needs it in kmem_cache_init()!), and
other kernel components may also want to allocate dynamically sized per-cpu
arrays before smp_init().  So move the determination of the number of
possible cpus into sched_init(), where we already loop over all possible
cpus early in boot.
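
Roughly, the determination ends up piggybacking on the loop that
sched_init() already performs over all possible cpus, along the lines of
the sketch below (the kernel/sched.c hunk itself is not visible in the
truncated diff further down, so this is an illustration, not the verbatim
change):

    void __init sched_init(void)
    {
            int i;
            int highest_cpu = 0;    /* highest possible cpu number seen */

            for_each_possible_cpu(i) {
                    /* ... existing per-cpu runqueue setup ... */
                    highest_cpu = i;
            }
    #ifdef CONFIG_SMP
            /* possible cpus are numbered 0..highest_cpu */
            nr_cpu_ids = highest_cpu + 1;
    #endif
            /* ... */
    }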

Also initialize both nr_cpu_ids and nr_node_ids with the highest value they
could take.  If there are accidental users before these values are
determined, the current value of 0 may cause too-small per-cpu and per-node
arrays to be allocated.  If they are instead set to the maximum possible
values, we only waste some memory for early boot users.
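
Concretely, the safe initial values amount to something like the following
(again a sketch: nr_cpu_ids is assumed to be defined in kernel/sched.c and
nr_node_ids in mm/page_alloc.c, and neither hunk is shown in the excerpt
below):

    /*
     * Worst-case defaults until the real counts are determined during boot.
     * Early users may waste a little memory, but never get arrays that are
     * too small.
     */
    int nr_cpu_ids __read_mostly = NR_CPUS;
    int nr_node_ids __read_mostly = MAX_NUMNODES;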

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 4 changed files with 9 additions and 9 deletions

1 /* 1 /*
2 * linux/init/main.c 2 * linux/init/main.c
3 * 3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * 5 *
6 * GK 2/5/95 - Changed to support mounting root fs via NFS 6 * GK 2/5/95 - Changed to support mounting root fs via NFS
7 * Added initrd & change_root: Werner Almesberger & Hans Lermen, Feb '96 7 * Added initrd & change_root: Werner Almesberger & Hans Lermen, Feb '96
8 * Moan early if gcc is old, avoiding bogus kernels - Paul Gortmaker, May '96 8 * Moan early if gcc is old, avoiding bogus kernels - Paul Gortmaker, May '96
9 * Simplified starting of init: Michael A. Griffith <grif@acm.org> 9 * Simplified starting of init: Michael A. Griffith <grif@acm.org>
10 */ 10 */
11 11
12 #include <linux/types.h> 12 #include <linux/types.h>
13 #include <linux/module.h> 13 #include <linux/module.h>
14 #include <linux/proc_fs.h> 14 #include <linux/proc_fs.h>
15 #include <linux/kernel.h> 15 #include <linux/kernel.h>
16 #include <linux/syscalls.h> 16 #include <linux/syscalls.h>
17 #include <linux/string.h> 17 #include <linux/string.h>
18 #include <linux/ctype.h> 18 #include <linux/ctype.h>
19 #include <linux/delay.h> 19 #include <linux/delay.h>
20 #include <linux/utsname.h> 20 #include <linux/utsname.h>
21 #include <linux/ioport.h> 21 #include <linux/ioport.h>
22 #include <linux/init.h> 22 #include <linux/init.h>
23 #include <linux/smp_lock.h> 23 #include <linux/smp_lock.h>
24 #include <linux/initrd.h> 24 #include <linux/initrd.h>
25 #include <linux/hdreg.h> 25 #include <linux/hdreg.h>
26 #include <linux/bootmem.h> 26 #include <linux/bootmem.h>
27 #include <linux/tty.h> 27 #include <linux/tty.h>
28 #include <linux/gfp.h> 28 #include <linux/gfp.h>
29 #include <linux/percpu.h> 29 #include <linux/percpu.h>
30 #include <linux/kmod.h> 30 #include <linux/kmod.h>
31 #include <linux/kernel_stat.h> 31 #include <linux/kernel_stat.h>
32 #include <linux/start_kernel.h> 32 #include <linux/start_kernel.h>
33 #include <linux/security.h> 33 #include <linux/security.h>
34 #include <linux/workqueue.h> 34 #include <linux/workqueue.h>
35 #include <linux/profile.h> 35 #include <linux/profile.h>
36 #include <linux/rcupdate.h> 36 #include <linux/rcupdate.h>
37 #include <linux/moduleparam.h> 37 #include <linux/moduleparam.h>
38 #include <linux/kallsyms.h> 38 #include <linux/kallsyms.h>
39 #include <linux/writeback.h> 39 #include <linux/writeback.h>
40 #include <linux/cpu.h> 40 #include <linux/cpu.h>
41 #include <linux/cpuset.h> 41 #include <linux/cpuset.h>
42 #include <linux/efi.h> 42 #include <linux/efi.h>
43 #include <linux/tick.h> 43 #include <linux/tick.h>
44 #include <linux/interrupt.h> 44 #include <linux/interrupt.h>
45 #include <linux/taskstats_kern.h> 45 #include <linux/taskstats_kern.h>
46 #include <linux/delayacct.h> 46 #include <linux/delayacct.h>
47 #include <linux/unistd.h> 47 #include <linux/unistd.h>
48 #include <linux/rmap.h> 48 #include <linux/rmap.h>
49 #include <linux/mempolicy.h> 49 #include <linux/mempolicy.h>
50 #include <linux/key.h> 50 #include <linux/key.h>
51 #include <linux/unwind.h> 51 #include <linux/unwind.h>
52 #include <linux/buffer_head.h> 52 #include <linux/buffer_head.h>
53 #include <linux/debug_locks.h> 53 #include <linux/debug_locks.h>
54 #include <linux/lockdep.h> 54 #include <linux/lockdep.h>
55 #include <linux/pid_namespace.h> 55 #include <linux/pid_namespace.h>
56 #include <linux/device.h> 56 #include <linux/device.h>
57 57
58 #include <asm/io.h> 58 #include <asm/io.h>
59 #include <asm/bugs.h> 59 #include <asm/bugs.h>
60 #include <asm/setup.h> 60 #include <asm/setup.h>
61 #include <asm/sections.h> 61 #include <asm/sections.h>
62 #include <asm/cacheflush.h> 62 #include <asm/cacheflush.h>
63 63
64 #ifdef CONFIG_X86_LOCAL_APIC 64 #ifdef CONFIG_X86_LOCAL_APIC
65 #include <asm/smp.h> 65 #include <asm/smp.h>
66 #endif 66 #endif
67 67
68 /* 68 /*
69 * This is one of the first .c files built. Error out early if we have compiler 69 * This is one of the first .c files built. Error out early if we have compiler
70 * trouble. 70 * trouble.
71 * 71 *
72 * Versions of gcc older than that listed below may actually compile and link 72 * Versions of gcc older than that listed below may actually compile and link
73 * okay, but the end product can have subtle run time bugs. To avoid associated 73 * okay, but the end product can have subtle run time bugs. To avoid associated
74 * bogus bug reports, we flatly refuse to compile with a gcc that is known to be 74 * bogus bug reports, we flatly refuse to compile with a gcc that is known to be
75 * too old from the very beginning. 75 * too old from the very beginning.
76 */ 76 */
77 #if (__GNUC__ < 3) || (__GNUC__ == 3 && __GNUC_MINOR__ < 2) 77 #if (__GNUC__ < 3) || (__GNUC__ == 3 && __GNUC_MINOR__ < 2)
78 #error Sorry, your GCC is too old. It builds incorrect kernels. 78 #error Sorry, your GCC is too old. It builds incorrect kernels.
79 #endif 79 #endif
80 80
81 #if __GNUC__ == 4 && __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ == 0 81 #if __GNUC__ == 4 && __GNUC_MINOR__ == 1 && __GNUC_PATCHLEVEL__ == 0
82 #warning gcc-4.1.0 is known to miscompile the kernel. A different compiler version is recommended. 82 #warning gcc-4.1.0 is known to miscompile the kernel. A different compiler version is recommended.
83 #endif 83 #endif
84 84
85 static int kernel_init(void *); 85 static int kernel_init(void *);
86 86
87 extern void init_IRQ(void); 87 extern void init_IRQ(void);
88 extern void fork_init(unsigned long); 88 extern void fork_init(unsigned long);
89 extern void mca_init(void); 89 extern void mca_init(void);
90 extern void sbus_init(void); 90 extern void sbus_init(void);
91 extern void signals_init(void); 91 extern void signals_init(void);
92 extern void pidhash_init(void); 92 extern void pidhash_init(void);
93 extern void pidmap_init(void); 93 extern void pidmap_init(void);
94 extern void prio_tree_init(void); 94 extern void prio_tree_init(void);
95 extern void radix_tree_init(void); 95 extern void radix_tree_init(void);
96 extern void free_initmem(void); 96 extern void free_initmem(void);
97 extern void prepare_namespace(void); 97 extern void prepare_namespace(void);
98 #ifdef CONFIG_ACPI 98 #ifdef CONFIG_ACPI
99 extern void acpi_early_init(void); 99 extern void acpi_early_init(void);
100 #else 100 #else
101 static inline void acpi_early_init(void) { } 101 static inline void acpi_early_init(void) { }
102 #endif 102 #endif
103 #ifndef CONFIG_DEBUG_RODATA 103 #ifndef CONFIG_DEBUG_RODATA
104 static inline void mark_rodata_ro(void) { } 104 static inline void mark_rodata_ro(void) { }
105 #endif 105 #endif
106 106
107 #ifdef CONFIG_TC 107 #ifdef CONFIG_TC
108 extern void tc_init(void); 108 extern void tc_init(void);
109 #endif 109 #endif
110 110
111 enum system_states system_state; 111 enum system_states system_state;
112 EXPORT_SYMBOL(system_state); 112 EXPORT_SYMBOL(system_state);
113 113
114 /* 114 /*
115 * Boot command-line arguments 115 * Boot command-line arguments
116 */ 116 */
117 #define MAX_INIT_ARGS CONFIG_INIT_ENV_ARG_LIMIT 117 #define MAX_INIT_ARGS CONFIG_INIT_ENV_ARG_LIMIT
118 #define MAX_INIT_ENVS CONFIG_INIT_ENV_ARG_LIMIT 118 #define MAX_INIT_ENVS CONFIG_INIT_ENV_ARG_LIMIT
119 119
120 extern void time_init(void); 120 extern void time_init(void);
121 /* Default late time init is NULL. archs can override this later. */ 121 /* Default late time init is NULL. archs can override this later. */
122 void (*late_time_init)(void); 122 void (*late_time_init)(void);
123 extern void softirq_init(void); 123 extern void softirq_init(void);
124 124
125 /* Untouched command line saved by arch-specific code. */ 125 /* Untouched command line saved by arch-specific code. */
126 char __initdata boot_command_line[COMMAND_LINE_SIZE]; 126 char __initdata boot_command_line[COMMAND_LINE_SIZE];
127 /* Untouched saved command line (eg. for /proc) */ 127 /* Untouched saved command line (eg. for /proc) */
128 char *saved_command_line; 128 char *saved_command_line;
129 /* Command line for parameter parsing */ 129 /* Command line for parameter parsing */
130 static char *static_command_line; 130 static char *static_command_line;
131 131
132 static char *execute_command; 132 static char *execute_command;
133 static char *ramdisk_execute_command; 133 static char *ramdisk_execute_command;
134 134
135 /* Setup configured maximum number of CPUs to activate */ 135 /* Setup configured maximum number of CPUs to activate */
136 static unsigned int max_cpus = NR_CPUS; 136 static unsigned int max_cpus = NR_CPUS;
137 137
138 /* 138 /*
139 * If set, this is an indication to the drivers that reset the underlying 139 * If set, this is an indication to the drivers that reset the underlying
140 * device before going ahead with the initialization otherwise driver might 140 * device before going ahead with the initialization otherwise driver might
141 * rely on the BIOS and skip the reset operation. 141 * rely on the BIOS and skip the reset operation.
142 * 142 *
143 * This is useful if kernel is booting in an unreliable environment. 143 * This is useful if kernel is booting in an unreliable environment.
144 * For ex. kdump situaiton where previous kernel has crashed, BIOS has been 144 * For ex. kdump situaiton where previous kernel has crashed, BIOS has been
145 * skipped and devices will be in unknown state. 145 * skipped and devices will be in unknown state.
146 */ 146 */
147 unsigned int reset_devices; 147 unsigned int reset_devices;
148 EXPORT_SYMBOL(reset_devices); 148 EXPORT_SYMBOL(reset_devices);
149 149
150 /* 150 /*
151 * Setup routine for controlling SMP activation 151 * Setup routine for controlling SMP activation
152 * 152 *
153 * Command-line option of "nosmp" or "maxcpus=0" will disable SMP 153 * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
154 * activation entirely (the MPS table probe still happens, though). 154 * activation entirely (the MPS table probe still happens, though).
155 * 155 *
156 * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer 156 * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
157 * greater than 0, limits the maximum number of CPUs activated in 157 * greater than 0, limits the maximum number of CPUs activated in
158 * SMP mode to <NUM>. 158 * SMP mode to <NUM>.
159 */ 159 */
160 static int __init nosmp(char *str) 160 static int __init nosmp(char *str)
161 { 161 {
162 max_cpus = 0; 162 max_cpus = 0;
163 return 1; 163 return 1;
164 } 164 }
165 165
166 __setup("nosmp", nosmp); 166 __setup("nosmp", nosmp);
167 167
168 static int __init maxcpus(char *str) 168 static int __init maxcpus(char *str)
169 { 169 {
170 get_option(&str, &max_cpus); 170 get_option(&str, &max_cpus);
171 return 1; 171 return 1;
172 } 172 }
173 173
174 __setup("maxcpus=", maxcpus); 174 __setup("maxcpus=", maxcpus);
175 175
176 static int __init set_reset_devices(char *str) 176 static int __init set_reset_devices(char *str)
177 { 177 {
178 reset_devices = 1; 178 reset_devices = 1;
179 return 1; 179 return 1;
180 } 180 }
181 181
182 __setup("reset_devices", set_reset_devices); 182 __setup("reset_devices", set_reset_devices);
183 183
184 static char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; 184 static char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, };
185 char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; 185 char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, };
186 static const char *panic_later, *panic_param; 186 static const char *panic_later, *panic_param;
187 187
188 extern struct obs_kernel_param __setup_start[], __setup_end[]; 188 extern struct obs_kernel_param __setup_start[], __setup_end[];
189 189
190 static int __init obsolete_checksetup(char *line) 190 static int __init obsolete_checksetup(char *line)
191 { 191 {
192 struct obs_kernel_param *p; 192 struct obs_kernel_param *p;
193 int had_early_param = 0; 193 int had_early_param = 0;
194 194
195 p = __setup_start; 195 p = __setup_start;
196 do { 196 do {
197 int n = strlen(p->str); 197 int n = strlen(p->str);
198 if (!strncmp(line, p->str, n)) { 198 if (!strncmp(line, p->str, n)) {
199 if (p->early) { 199 if (p->early) {
200 /* Already done in parse_early_param? 200 /* Already done in parse_early_param?
201 * (Needs exact match on param part). 201 * (Needs exact match on param part).
202 * Keep iterating, as we can have early 202 * Keep iterating, as we can have early
203 * params and __setups of same names 8( */ 203 * params and __setups of same names 8( */
204 if (line[n] == '\0' || line[n] == '=') 204 if (line[n] == '\0' || line[n] == '=')
205 had_early_param = 1; 205 had_early_param = 1;
206 } else if (!p->setup_func) { 206 } else if (!p->setup_func) {
207 printk(KERN_WARNING "Parameter %s is obsolete," 207 printk(KERN_WARNING "Parameter %s is obsolete,"
208 " ignored\n", p->str); 208 " ignored\n", p->str);
209 return 1; 209 return 1;
210 } else if (p->setup_func(line + n)) 210 } else if (p->setup_func(line + n))
211 return 1; 211 return 1;
212 } 212 }
213 p++; 213 p++;
214 } while (p < __setup_end); 214 } while (p < __setup_end);
215 215
216 return had_early_param; 216 return had_early_param;
217 } 217 }
218 218
219 /* 219 /*
220 * This should be approx 2 Bo*oMips to start (note initial shift), and will 220 * This should be approx 2 Bo*oMips to start (note initial shift), and will
221 * still work even if initially too large, it will just take slightly longer 221 * still work even if initially too large, it will just take slightly longer
222 */ 222 */
223 unsigned long loops_per_jiffy = (1<<12); 223 unsigned long loops_per_jiffy = (1<<12);
224 224
225 EXPORT_SYMBOL(loops_per_jiffy); 225 EXPORT_SYMBOL(loops_per_jiffy);
226 226
227 static int __init debug_kernel(char *str) 227 static int __init debug_kernel(char *str)
228 { 228 {
229 if (*str) 229 if (*str)
230 return 0; 230 return 0;
231 console_loglevel = 10; 231 console_loglevel = 10;
232 return 1; 232 return 1;
233 } 233 }
234 234
235 static int __init quiet_kernel(char *str) 235 static int __init quiet_kernel(char *str)
236 { 236 {
237 if (*str) 237 if (*str)
238 return 0; 238 return 0;
239 console_loglevel = 4; 239 console_loglevel = 4;
240 return 1; 240 return 1;
241 } 241 }
242 242
243 __setup("debug", debug_kernel); 243 __setup("debug", debug_kernel);
244 __setup("quiet", quiet_kernel); 244 __setup("quiet", quiet_kernel);
245 245
246 static int __init loglevel(char *str) 246 static int __init loglevel(char *str)
247 { 247 {
248 get_option(&str, &console_loglevel); 248 get_option(&str, &console_loglevel);
249 return 1; 249 return 1;
250 } 250 }
251 251
252 __setup("loglevel=", loglevel); 252 __setup("loglevel=", loglevel);
253 253
254 /* 254 /*
255 * Unknown boot options get handed to init, unless they look like 255 * Unknown boot options get handed to init, unless they look like
256 * failed parameters 256 * failed parameters
257 */ 257 */
258 static int __init unknown_bootoption(char *param, char *val) 258 static int __init unknown_bootoption(char *param, char *val)
259 { 259 {
260 /* Change NUL term back to "=", to make "param" the whole string. */ 260 /* Change NUL term back to "=", to make "param" the whole string. */
261 if (val) { 261 if (val) {
262 /* param=val or param="val"? */ 262 /* param=val or param="val"? */
263 if (val == param+strlen(param)+1) 263 if (val == param+strlen(param)+1)
264 val[-1] = '='; 264 val[-1] = '=';
265 else if (val == param+strlen(param)+2) { 265 else if (val == param+strlen(param)+2) {
266 val[-2] = '='; 266 val[-2] = '=';
267 memmove(val-1, val, strlen(val)+1); 267 memmove(val-1, val, strlen(val)+1);
268 val--; 268 val--;
269 } else 269 } else
270 BUG(); 270 BUG();
271 } 271 }
272 272
273 /* Handle obsolete-style parameters */ 273 /* Handle obsolete-style parameters */
274 if (obsolete_checksetup(param)) 274 if (obsolete_checksetup(param))
275 return 0; 275 return 0;
276 276
277 /* 277 /*
278 * Preemptive maintenance for "why didn't my mispelled command 278 * Preemptive maintenance for "why didn't my mispelled command
279 * line work?" 279 * line work?"
280 */ 280 */
281 if (strchr(param, '.') && (!val || strchr(param, '.') < val)) { 281 if (strchr(param, '.') && (!val || strchr(param, '.') < val)) {
282 printk(KERN_ERR "Unknown boot option `%s': ignoring\n", param); 282 printk(KERN_ERR "Unknown boot option `%s': ignoring\n", param);
283 return 0; 283 return 0;
284 } 284 }
285 285
286 if (panic_later) 286 if (panic_later)
287 return 0; 287 return 0;
288 288
289 if (val) { 289 if (val) {
290 /* Environment option */ 290 /* Environment option */
291 unsigned int i; 291 unsigned int i;
292 for (i = 0; envp_init[i]; i++) { 292 for (i = 0; envp_init[i]; i++) {
293 if (i == MAX_INIT_ENVS) { 293 if (i == MAX_INIT_ENVS) {
294 panic_later = "Too many boot env vars at `%s'"; 294 panic_later = "Too many boot env vars at `%s'";
295 panic_param = param; 295 panic_param = param;
296 } 296 }
297 if (!strncmp(param, envp_init[i], val - param)) 297 if (!strncmp(param, envp_init[i], val - param))
298 break; 298 break;
299 } 299 }
300 envp_init[i] = param; 300 envp_init[i] = param;
301 } else { 301 } else {
302 /* Command line option */ 302 /* Command line option */
303 unsigned int i; 303 unsigned int i;
304 for (i = 0; argv_init[i]; i++) { 304 for (i = 0; argv_init[i]; i++) {
305 if (i == MAX_INIT_ARGS) { 305 if (i == MAX_INIT_ARGS) {
306 panic_later = "Too many boot init vars at `%s'"; 306 panic_later = "Too many boot init vars at `%s'";
307 panic_param = param; 307 panic_param = param;
308 } 308 }
309 } 309 }
310 argv_init[i] = param; 310 argv_init[i] = param;
311 } 311 }
312 return 0; 312 return 0;
313 } 313 }
314 314
315 static int __init init_setup(char *str) 315 static int __init init_setup(char *str)
316 { 316 {
317 unsigned int i; 317 unsigned int i;
318 318
319 execute_command = str; 319 execute_command = str;
320 /* 320 /*
321 * In case LILO is going to boot us with default command line, 321 * In case LILO is going to boot us with default command line,
322 * it prepends "auto" before the whole cmdline which makes 322 * it prepends "auto" before the whole cmdline which makes
323 * the shell think it should execute a script with such name. 323 * the shell think it should execute a script with such name.
324 * So we ignore all arguments entered _before_ init=... [MJ] 324 * So we ignore all arguments entered _before_ init=... [MJ]
325 */ 325 */
326 for (i = 1; i < MAX_INIT_ARGS; i++) 326 for (i = 1; i < MAX_INIT_ARGS; i++)
327 argv_init[i] = NULL; 327 argv_init[i] = NULL;
328 return 1; 328 return 1;
329 } 329 }
330 __setup("init=", init_setup); 330 __setup("init=", init_setup);
331 331
332 static int __init rdinit_setup(char *str) 332 static int __init rdinit_setup(char *str)
333 { 333 {
334 unsigned int i; 334 unsigned int i;
335 335
336 ramdisk_execute_command = str; 336 ramdisk_execute_command = str;
337 /* See "auto" comment in init_setup */ 337 /* See "auto" comment in init_setup */
338 for (i = 1; i < MAX_INIT_ARGS; i++) 338 for (i = 1; i < MAX_INIT_ARGS; i++)
339 argv_init[i] = NULL; 339 argv_init[i] = NULL;
340 return 1; 340 return 1;
341 } 341 }
342 __setup("rdinit=", rdinit_setup); 342 __setup("rdinit=", rdinit_setup);
343 343
344 #ifndef CONFIG_SMP 344 #ifndef CONFIG_SMP
345 345
346 #ifdef CONFIG_X86_LOCAL_APIC 346 #ifdef CONFIG_X86_LOCAL_APIC
347 static void __init smp_init(void) 347 static void __init smp_init(void)
348 { 348 {
349 APIC_init_uniprocessor(); 349 APIC_init_uniprocessor();
350 } 350 }
351 #else 351 #else
352 #define smp_init() do { } while (0) 352 #define smp_init() do { } while (0)
353 #endif 353 #endif
354 354
355 static inline void setup_per_cpu_areas(void) { } 355 static inline void setup_per_cpu_areas(void) { }
356 static inline void smp_prepare_cpus(unsigned int maxcpus) { } 356 static inline void smp_prepare_cpus(unsigned int maxcpus) { }
357 357
358 #else 358 #else
359 359
360 #ifdef __GENERIC_PER_CPU 360 #ifdef __GENERIC_PER_CPU
361 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; 361 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
362 362
363 EXPORT_SYMBOL(__per_cpu_offset); 363 EXPORT_SYMBOL(__per_cpu_offset);
364 364
365 static void __init setup_per_cpu_areas(void) 365 static void __init setup_per_cpu_areas(void)
366 { 366 {
367 unsigned long size, i; 367 unsigned long size, i;
368 char *ptr; 368 char *ptr;
369 unsigned long nr_possible_cpus = num_possible_cpus(); 369 unsigned long nr_possible_cpus = num_possible_cpus();
370 370
371 /* Copy section for each CPU (we discard the original) */ 371 /* Copy section for each CPU (we discard the original) */
372 size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); 372 size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
373 ptr = alloc_bootmem_pages(size * nr_possible_cpus); 373 ptr = alloc_bootmem_pages(size * nr_possible_cpus);
374 374
375 for_each_possible_cpu(i) { 375 for_each_possible_cpu(i) {
376 __per_cpu_offset[i] = ptr - __per_cpu_start; 376 __per_cpu_offset[i] = ptr - __per_cpu_start;
377 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); 377 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
378 ptr += size; 378 ptr += size;
379 } 379 }
380 } 380 }
381 #endif /* !__GENERIC_PER_CPU */ 381 #endif /* !__GENERIC_PER_CPU */
382 382
383 /* Called by boot processor to activate the rest. */ 383 /* Called by boot processor to activate the rest. */
384 static void __init smp_init(void) 384 static void __init smp_init(void)
385 { 385 {
386 unsigned int cpu; 386 unsigned int cpu;
387 unsigned highest = 0;
388
389 for_each_cpu_mask(cpu, cpu_possible_map)
390 highest = cpu;
391 nr_cpu_ids = highest + 1;
392 387
393 /* FIXME: This should be done in userspace --RR */ 388 /* FIXME: This should be done in userspace --RR */
394 for_each_present_cpu(cpu) { 389 for_each_present_cpu(cpu) {
395 if (num_online_cpus() >= max_cpus) 390 if (num_online_cpus() >= max_cpus)
396 break; 391 break;
397 if (!cpu_online(cpu)) 392 if (!cpu_online(cpu))
398 cpu_up(cpu); 393 cpu_up(cpu);
399 } 394 }
400 395
401 /* Any cleanup work */ 396 /* Any cleanup work */
402 printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); 397 printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
403 smp_cpus_done(max_cpus); 398 smp_cpus_done(max_cpus);
404 } 399 }
405 400
406 #endif 401 #endif
407 402
408 /* 403 /*
409 * We need to store the untouched command line for future reference. 404 * We need to store the untouched command line for future reference.
410 * We also need to store the touched command line since the parameter 405 * We also need to store the touched command line since the parameter
411 * parsing is performed in place, and we should allow a component to 406 * parsing is performed in place, and we should allow a component to
412 * store reference of name/value for future reference. 407 * store reference of name/value for future reference.
413 */ 408 */
414 static void __init setup_command_line(char *command_line) 409 static void __init setup_command_line(char *command_line)
415 { 410 {
416 saved_command_line = alloc_bootmem(strlen (boot_command_line)+1); 411 saved_command_line = alloc_bootmem(strlen (boot_command_line)+1);
417 static_command_line = alloc_bootmem(strlen (command_line)+1); 412 static_command_line = alloc_bootmem(strlen (command_line)+1);
418 strcpy (saved_command_line, boot_command_line); 413 strcpy (saved_command_line, boot_command_line);
419 strcpy (static_command_line, command_line); 414 strcpy (static_command_line, command_line);
420 } 415 }
421 416
422 /* 417 /*
423 * We need to finalize in a non-__init function or else race conditions 418 * We need to finalize in a non-__init function or else race conditions
424 * between the root thread and the init thread may cause start_kernel to 419 * between the root thread and the init thread may cause start_kernel to
425 * be reaped by free_initmem before the root thread has proceeded to 420 * be reaped by free_initmem before the root thread has proceeded to
426 * cpu_idle. 421 * cpu_idle.
427 * 422 *
428 * gcc-3.4 accidentally inlines this function, so use noinline. 423 * gcc-3.4 accidentally inlines this function, so use noinline.
429 */ 424 */
430 425
431 static void noinline rest_init(void) 426 static void noinline rest_init(void)
432 __releases(kernel_lock) 427 __releases(kernel_lock)
433 { 428 {
434 kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND); 429 kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND);
435 numa_default_policy(); 430 numa_default_policy();
436 unlock_kernel(); 431 unlock_kernel();
437 432
438 /* 433 /*
439 * The boot idle thread must execute schedule() 434 * The boot idle thread must execute schedule()
440 * at least one to get things moving: 435 * at least one to get things moving:
441 */ 436 */
442 preempt_enable_no_resched(); 437 preempt_enable_no_resched();
443 schedule(); 438 schedule();
444 preempt_disable(); 439 preempt_disable();
445 440
446 /* Call into cpu_idle with preempt disabled */ 441 /* Call into cpu_idle with preempt disabled */
447 cpu_idle(); 442 cpu_idle();
448 } 443 }
449 444
450 /* Check for early params. */ 445 /* Check for early params. */
451 static int __init do_early_param(char *param, char *val) 446 static int __init do_early_param(char *param, char *val)
452 { 447 {
453 struct obs_kernel_param *p; 448 struct obs_kernel_param *p;
454 449
455 for (p = __setup_start; p < __setup_end; p++) { 450 for (p = __setup_start; p < __setup_end; p++) {
456 if (p->early && strcmp(param, p->str) == 0) { 451 if (p->early && strcmp(param, p->str) == 0) {
457 if (p->setup_func(val) != 0) 452 if (p->setup_func(val) != 0)
458 printk(KERN_WARNING 453 printk(KERN_WARNING
459 "Malformed early option '%s'\n", param); 454 "Malformed early option '%s'\n", param);
460 } 455 }
461 } 456 }
462 /* We accept everything at this stage. */ 457 /* We accept everything at this stage. */
463 return 0; 458 return 0;
464 } 459 }
465 460
466 /* Arch code calls this early on, or if not, just before other parsing. */ 461 /* Arch code calls this early on, or if not, just before other parsing. */
467 void __init parse_early_param(void) 462 void __init parse_early_param(void)
468 { 463 {
469 static __initdata int done = 0; 464 static __initdata int done = 0;
470 static __initdata char tmp_cmdline[COMMAND_LINE_SIZE]; 465 static __initdata char tmp_cmdline[COMMAND_LINE_SIZE];
471 466
472 if (done) 467 if (done)
473 return; 468 return;
474 469
475 /* All fall through to do_early_param. */ 470 /* All fall through to do_early_param. */
476 strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE); 471 strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE);
477 parse_args("early options", tmp_cmdline, NULL, 0, do_early_param); 472 parse_args("early options", tmp_cmdline, NULL, 0, do_early_param);
478 done = 1; 473 done = 1;
479 } 474 }
480 475
481 /* 476 /*
482 * Activate the first processor. 477 * Activate the first processor.
483 */ 478 */
484 479
485 static void __init boot_cpu_init(void) 480 static void __init boot_cpu_init(void)
486 { 481 {
487 int cpu = smp_processor_id(); 482 int cpu = smp_processor_id();
488 /* Mark the boot cpu "present", "online" etc for SMP and UP case */ 483 /* Mark the boot cpu "present", "online" etc for SMP and UP case */
489 cpu_set(cpu, cpu_online_map); 484 cpu_set(cpu, cpu_online_map);
490 cpu_set(cpu, cpu_present_map); 485 cpu_set(cpu, cpu_present_map);
491 cpu_set(cpu, cpu_possible_map); 486 cpu_set(cpu, cpu_possible_map);
492 } 487 }
493 488
494 void __init __attribute__((weak)) smp_setup_processor_id(void) 489 void __init __attribute__((weak)) smp_setup_processor_id(void)
495 { 490 {
496 } 491 }
497 492
498 asmlinkage void __init start_kernel(void) 493 asmlinkage void __init start_kernel(void)
499 { 494 {
500 char * command_line; 495 char * command_line;
501 extern struct kernel_param __start___param[], __stop___param[]; 496 extern struct kernel_param __start___param[], __stop___param[];
502 497
503 smp_setup_processor_id(); 498 smp_setup_processor_id();
504 499
505 /* 500 /*
506 * Need to run as early as possible, to initialize the 501 * Need to run as early as possible, to initialize the
507 * lockdep hash: 502 * lockdep hash:
508 */ 503 */
509 unwind_init(); 504 unwind_init();
510 lockdep_init(); 505 lockdep_init();
511 506
512 local_irq_disable(); 507 local_irq_disable();
513 early_boot_irqs_off(); 508 early_boot_irqs_off();
514 early_init_irq_lock_class(); 509 early_init_irq_lock_class();
515 510
516 /* 511 /*
517 * Interrupts are still disabled. Do necessary setups, then 512 * Interrupts are still disabled. Do necessary setups, then
518 * enable them 513 * enable them
519 */ 514 */
520 lock_kernel(); 515 lock_kernel();
521 tick_init(); 516 tick_init();
522 boot_cpu_init(); 517 boot_cpu_init();
523 page_address_init(); 518 page_address_init();
524 printk(KERN_NOTICE); 519 printk(KERN_NOTICE);
525 printk(linux_banner); 520 printk(linux_banner);
526 setup_arch(&command_line); 521 setup_arch(&command_line);
527 setup_command_line(command_line); 522 setup_command_line(command_line);
528 unwind_setup(); 523 unwind_setup();
529 setup_per_cpu_areas(); 524 setup_per_cpu_areas();
530 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ 525 smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
531 526
532 /* 527 /*
533 * Set up the scheduler prior starting any interrupts (such as the 528 * Set up the scheduler prior starting any interrupts (such as the
534 * timer interrupt). Full topology setup happens at smp_init() 529 * timer interrupt). Full topology setup happens at smp_init()
535 * time - but meanwhile we still have a functioning scheduler. 530 * time - but meanwhile we still have a functioning scheduler.
536 */ 531 */
537 sched_init(); 532 sched_init();
538 /* 533 /*
539 * Disable preemption - early bootup scheduling is extremely 534 * Disable preemption - early bootup scheduling is extremely
540 * fragile until we cpu_idle() for the first time. 535 * fragile until we cpu_idle() for the first time.
541 */ 536 */
542 preempt_disable(); 537 preempt_disable();
543 build_all_zonelists(); 538 build_all_zonelists();
544 page_alloc_init(); 539 page_alloc_init();
545 printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); 540 printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
546 parse_early_param(); 541 parse_early_param();
547 parse_args("Booting kernel", static_command_line, __start___param, 542 parse_args("Booting kernel", static_command_line, __start___param,
548 __stop___param - __start___param, 543 __stop___param - __start___param,
549 &unknown_bootoption); 544 &unknown_bootoption);
550 if (!irqs_disabled()) { 545 if (!irqs_disabled()) {
551 printk(KERN_WARNING "start_kernel(): bug: interrupts were " 546 printk(KERN_WARNING "start_kernel(): bug: interrupts were "
552 "enabled *very* early, fixing it\n"); 547 "enabled *very* early, fixing it\n");
553 local_irq_disable(); 548 local_irq_disable();
554 } 549 }
555 sort_main_extable(); 550 sort_main_extable();
556 trap_init(); 551 trap_init();
557 rcu_init(); 552 rcu_init();
558 init_IRQ(); 553 init_IRQ();
559 pidhash_init(); 554 pidhash_init();
560 init_timers(); 555 init_timers();
561 hrtimers_init(); 556 hrtimers_init();
562 softirq_init(); 557 softirq_init();
563 timekeeping_init(); 558 timekeeping_init();
564 time_init(); 559 time_init();
565 profile_init(); 560 profile_init();
566 if (!irqs_disabled()) 561 if (!irqs_disabled())
567 printk("start_kernel(): bug: interrupts were enabled early\n"); 562 printk("start_kernel(): bug: interrupts were enabled early\n");
568 early_boot_irqs_on(); 563 early_boot_irqs_on();
569 local_irq_enable(); 564 local_irq_enable();
570 565
571 /* 566 /*
572 * HACK ALERT! This is early. We're enabling the console before 567 * HACK ALERT! This is early. We're enabling the console before
573 * we've done PCI setups etc, and console_init() must be aware of 568 * we've done PCI setups etc, and console_init() must be aware of
574 * this. But we do want output early, in case something goes wrong. 569 * this. But we do want output early, in case something goes wrong.
575 */ 570 */
576 console_init(); 571 console_init();
577 if (panic_later) 572 if (panic_later)
578 panic(panic_later, panic_param); 573 panic(panic_later, panic_param);
579 574
580 lockdep_info(); 575 lockdep_info();
581 576
582 /* 577 /*
583 * Need to run this when irqs are enabled, because it wants 578 * Need to run this when irqs are enabled, because it wants
584 * to self-test [hard/soft]-irqs on/off lock inversion bugs 579 * to self-test [hard/soft]-irqs on/off lock inversion bugs
585 * too: 580 * too:
586 */ 581 */
587 locking_selftest(); 582 locking_selftest();
588 583
589 #ifdef CONFIG_BLK_DEV_INITRD 584 #ifdef CONFIG_BLK_DEV_INITRD
590 if (initrd_start && !initrd_below_start_ok && 585 if (initrd_start && !initrd_below_start_ok &&
591 initrd_start < min_low_pfn << PAGE_SHIFT) { 586 initrd_start < min_low_pfn << PAGE_SHIFT) {
592 printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - " 587 printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - "
593 "disabling it.\n",initrd_start,min_low_pfn << PAGE_SHIFT); 588 "disabling it.\n",initrd_start,min_low_pfn << PAGE_SHIFT);
594 initrd_start = 0; 589 initrd_start = 0;
595 } 590 }
596 #endif 591 #endif
597 vfs_caches_init_early(); 592 vfs_caches_init_early();
598 cpuset_init_early(); 593 cpuset_init_early();
599 mem_init(); 594 mem_init();
600 kmem_cache_init(); 595 kmem_cache_init();
601 setup_per_cpu_pageset(); 596 setup_per_cpu_pageset();
602 numa_policy_init(); 597 numa_policy_init();
603 if (late_time_init) 598 if (late_time_init)
604 late_time_init(); 599 late_time_init();
605 calibrate_delay(); 600 calibrate_delay();
606 pidmap_init(); 601 pidmap_init();
607 pgtable_cache_init(); 602 pgtable_cache_init();
608 prio_tree_init(); 603 prio_tree_init();
609 anon_vma_init(); 604 anon_vma_init();
610 #ifdef CONFIG_X86 605 #ifdef CONFIG_X86
611 if (efi_enabled) 606 if (efi_enabled)
612 efi_enter_virtual_mode(); 607 efi_enter_virtual_mode();
613 #endif 608 #endif
614 fork_init(num_physpages); 609 fork_init(num_physpages);
615 proc_caches_init(); 610 proc_caches_init();
616 buffer_init(); 611 buffer_init();
617 unnamed_dev_init(); 612 unnamed_dev_init();
618 key_init(); 613 key_init();
619 security_init(); 614 security_init();
620 vfs_caches_init(num_physpages); 615 vfs_caches_init(num_physpages);
621 radix_tree_init(); 616 radix_tree_init();
622 signals_init(); 617 signals_init();
623 /* rootfs populating might need page-writeback */ 618 /* rootfs populating might need page-writeback */
624 page_writeback_init(); 619 page_writeback_init();
625 #ifdef CONFIG_PROC_FS 620 #ifdef CONFIG_PROC_FS
626 proc_root_init(); 621 proc_root_init();
627 #endif 622 #endif
628 cpuset_init(); 623 cpuset_init();
629 taskstats_init_early(); 624 taskstats_init_early();
630 delayacct_init(); 625 delayacct_init();
631 626
632 check_bugs(); 627 check_bugs();
633 628
634 acpi_early_init(); /* before LAPIC and SMP init */ 629 acpi_early_init(); /* before LAPIC and SMP init */
635 630
636 /* Do the rest non-__init'ed, we're now alive */ 631 /* Do the rest non-__init'ed, we're now alive */
637 rest_init(); 632 rest_init();
638 } 633 }
639 634
640 static int __initdata initcall_debug; 635 static int __initdata initcall_debug;
641 636
642 static int __init initcall_debug_setup(char *str) 637 static int __init initcall_debug_setup(char *str)
643 { 638 {
644 initcall_debug = 1; 639 initcall_debug = 1;
645 return 1; 640 return 1;
646 } 641 }
647 __setup("initcall_debug", initcall_debug_setup); 642 __setup("initcall_debug", initcall_debug_setup);
648 643
649 extern initcall_t __initcall_start[], __initcall_end[]; 644 extern initcall_t __initcall_start[], __initcall_end[];
650 645
651 static void __init do_initcalls(void) 646 static void __init do_initcalls(void)
652 { 647 {
653 initcall_t *call; 648 initcall_t *call;
654 int count = preempt_count(); 649 int count = preempt_count();
655 650
656 for (call = __initcall_start; call < __initcall_end; call++) { 651 for (call = __initcall_start; call < __initcall_end; call++) {
657 char *msg = NULL; 652 char *msg = NULL;
658 char msgbuf[40]; 653 char msgbuf[40];
659 int result; 654 int result;
660 655
661 if (initcall_debug) { 656 if (initcall_debug) {
662 printk("Calling initcall 0x%p", *call); 657 printk("Calling initcall 0x%p", *call);
663 print_fn_descriptor_symbol(": %s()", 658 print_fn_descriptor_symbol(": %s()",
664 (unsigned long) *call); 659 (unsigned long) *call);
665 printk("\n"); 660 printk("\n");
666 } 661 }
667 662
668 result = (*call)(); 663 result = (*call)();
669 664
670 if (result && result != -ENODEV && initcall_debug) { 665 if (result && result != -ENODEV && initcall_debug) {
671 sprintf(msgbuf, "error code %d", result); 666 sprintf(msgbuf, "error code %d", result);
672 msg = msgbuf; 667 msg = msgbuf;
673 } 668 }
674 if (preempt_count() != count) { 669 if (preempt_count() != count) {
675 msg = "preemption imbalance"; 670 msg = "preemption imbalance";
676 preempt_count() = count; 671 preempt_count() = count;
677 } 672 }
678 if (irqs_disabled()) { 673 if (irqs_disabled()) {
679 msg = "disabled interrupts"; 674 msg = "disabled interrupts";
680 local_irq_enable(); 675 local_irq_enable();
681 } 676 }
682 if (msg) { 677 if (msg) {
683 printk(KERN_WARNING "initcall at 0x%p", *call); 678 printk(KERN_WARNING "initcall at 0x%p", *call);
684 print_fn_descriptor_symbol(": %s()", 679 print_fn_descriptor_symbol(": %s()",
685 (unsigned long) *call); 680 (unsigned long) *call);
686 printk(": returned with %s\n", msg); 681 printk(": returned with %s\n", msg);
687 } 682 }
688 } 683 }
689 684
690 /* Make sure there is no pending stuff from the initcall sequence */ 685 /* Make sure there is no pending stuff from the initcall sequence */
691 flush_scheduled_work(); 686 flush_scheduled_work();
692 } 687 }
693 688
694 /* 689 /*
695 * Ok, the machine is now initialized. None of the devices 690 * Ok, the machine is now initialized. None of the devices
696 * have been touched yet, but the CPU subsystem is up and 691 * have been touched yet, but the CPU subsystem is up and
697 * running, and memory and process management works. 692 * running, and memory and process management works.
698 * 693 *
699 * Now we can finally start doing some real work.. 694 * Now we can finally start doing some real work..
700 */ 695 */
701 static void __init do_basic_setup(void) 696 static void __init do_basic_setup(void)
702 { 697 {
703 /* drivers will send hotplug events */ 698 /* drivers will send hotplug events */
704 init_workqueues(); 699 init_workqueues();
705 usermodehelper_init(); 700 usermodehelper_init();
706 driver_init(); 701 driver_init();
707 init_irq_proc(); 702 init_irq_proc();
708 do_initcalls(); 703 do_initcalls();
709 } 704 }
710 705
711 static void __init do_pre_smp_initcalls(void) 706 static void __init do_pre_smp_initcalls(void)
712 { 707 {
713 extern int spawn_ksoftirqd(void); 708 extern int spawn_ksoftirqd(void);
714 #ifdef CONFIG_SMP 709 #ifdef CONFIG_SMP
715 extern int migration_init(void); 710 extern int migration_init(void);
716 711
717 migration_init(); 712 migration_init();
718 #endif 713 #endif
719 spawn_ksoftirqd(); 714 spawn_ksoftirqd();
720 spawn_softlockup_task(); 715 spawn_softlockup_task();
721 } 716 }
722 717
723 static void run_init_process(char *init_filename) 718 static void run_init_process(char *init_filename)
724 { 719 {
725 argv_init[0] = init_filename; 720 argv_init[0] = init_filename;
726 kernel_execve(init_filename, argv_init, envp_init); 721 kernel_execve(init_filename, argv_init, envp_init);
727 } 722 }
728 723
729 /* This is a non __init function. Force it to be noinline otherwise gcc 724 /* This is a non __init function. Force it to be noinline otherwise gcc
730 * makes it inline to init() and it becomes part of init.text section 725 * makes it inline to init() and it becomes part of init.text section
731 */ 726 */
732 static int noinline init_post(void) 727 static int noinline init_post(void)
733 { 728 {
734 free_initmem(); 729 free_initmem();
735 unlock_kernel(); 730 unlock_kernel();
736 mark_rodata_ro(); 731 mark_rodata_ro();
737 system_state = SYSTEM_RUNNING; 732 system_state = SYSTEM_RUNNING;
738 numa_default_policy(); 733 numa_default_policy();
739 734
740 if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) 735 if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
741 printk(KERN_WARNING "Warning: unable to open an initial console.\n"); 736 printk(KERN_WARNING "Warning: unable to open an initial console.\n");
742 737
743 (void) sys_dup(0); 738 (void) sys_dup(0);
744 (void) sys_dup(0); 739 (void) sys_dup(0);
745 740
746 if (ramdisk_execute_command) { 741 if (ramdisk_execute_command) {
747 run_init_process(ramdisk_execute_command); 742 run_init_process(ramdisk_execute_command);
748 printk(KERN_WARNING "Failed to execute %s\n", 743 printk(KERN_WARNING "Failed to execute %s\n",
749 ramdisk_execute_command); 744 ramdisk_execute_command);
750 } 745 }
751 746
752 /* 747 /*
753 * We try each of these until one succeeds. 748 * We try each of these until one succeeds.
754 * 749 *
755 * The Bourne shell can be used instead of init if we are 750 * The Bourne shell can be used instead of init if we are
756 * trying to recover a really broken machine. 751 * trying to recover a really broken machine.
757 */ 752 */
758 if (execute_command) { 753 if (execute_command) {
759 run_init_process(execute_command); 754 run_init_process(execute_command);
760 printk(KERN_WARNING "Failed to execute %s. Attempting " 755 printk(KERN_WARNING "Failed to execute %s. Attempting "
761 "defaults...\n", execute_command); 756 "defaults...\n", execute_command);
762 } 757 }
763 run_init_process("/sbin/init"); 758 run_init_process("/sbin/init");
764 run_init_process("/etc/init"); 759 run_init_process("/etc/init");
765 run_init_process("/bin/init"); 760 run_init_process("/bin/init");
766 run_init_process("/bin/sh"); 761 run_init_process("/bin/sh");
767 762
768 panic("No init found. Try passing init= option to kernel."); 763 panic("No init found. Try passing init= option to kernel.");
769 } 764 }
770 765
771 static int __init kernel_init(void * unused) 766 static int __init kernel_init(void * unused)
772 { 767 {
773 lock_kernel(); 768 lock_kernel();
774 /* 769 /*
775 * init can run on any cpu. 770 * init can run on any cpu.
776 */ 771 */
777 set_cpus_allowed(current, CPU_MASK_ALL); 772 set_cpus_allowed(current, CPU_MASK_ALL);
778 /* 773 /*
779 * Tell the world that we're going to be the grim 774 * Tell the world that we're going to be the grim
780 * reaper of innocent orphaned children. 775 * reaper of innocent orphaned children.
781 * 776 *
782 * We don't want people to have to make incorrect 777 * We don't want people to have to make incorrect
783 * assumptions about where in the task array this 778 * assumptions about where in the task array this
784 * can be found. 779 * can be found.
785 */ 780 */
786 init_pid_ns.child_reaper = current; 781 init_pid_ns.child_reaper = current;
787 782
788 cad_pid = task_pid(current); 783 cad_pid = task_pid(current);
789 784
790 smp_prepare_cpus(max_cpus); 785 smp_prepare_cpus(max_cpus);
791 786
792 do_pre_smp_initcalls(); 787 do_pre_smp_initcalls();
793 788
794 smp_init(); 789 smp_init();
795 sched_init_smp(); 790 sched_init_smp();
796 791
797 cpuset_init_smp(); 792 cpuset_init_smp();
798 793
799 do_basic_setup(); 794 do_basic_setup();
800 795
801 /* 796 /*
802 * check if there is an early userspace init. If yes, let it do all 797 * check if there is an early userspace init. If yes, let it do all
803 * the work 798 * the work
804 */ 799 */
805 800
806 if (!ramdisk_execute_command) 801 if (!ramdisk_execute_command)
807 ramdisk_execute_command = "/init"; 802 ramdisk_execute_command = "/init";
808 803
809 if (sys_access((const char __user *) ramdisk_execute_command, 0) != 0) { 804 if (sys_access((const char __user *) ramdisk_execute_command, 0) != 0) {
810 ramdisk_execute_command = NULL; 805 ramdisk_execute_command = NULL;
811 prepare_namespace(); 806 prepare_namespace();
812 } 807 }
813 808
814 /* 809 /*
815 * Ok, we have completed the initial bootup, and 810 * Ok, we have completed the initial bootup, and
816 * we're essentially up and running. Get rid of the 811 * we're essentially up and running. Get rid of the
817 * initmem segments and start the user-mode stuff.. 812 * initmem segments and start the user-mode stuff..
818 */ 813 */
819 init_post(); 814 init_post();
820 return 0; 815 return 0;
821 } 816 }
822 817
1 /* 1 /*
2 * kernel/sched.c 2 * kernel/sched.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
6 * Copyright (C) 1991-2002 Linus Torvalds 6 * Copyright (C) 1991-2002 Linus Torvalds
7 * 7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and 8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe 9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff 10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli 11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: 12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with 13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices 14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions 15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love. 16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas. 17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin 18 * 2004-04-02 Scheduler domains code by Nick Piggin
19 */ 19 */
20 20
21 #include <linux/mm.h> 21 #include <linux/mm.h>
22 #include <linux/module.h> 22 #include <linux/module.h>
23 #include <linux/nmi.h> 23 #include <linux/nmi.h>
24 #include <linux/init.h> 24 #include <linux/init.h>
25 #include <asm/uaccess.h> 25 #include <asm/uaccess.h>
26 #include <linux/highmem.h> 26 #include <linux/highmem.h>
27 #include <linux/smp_lock.h> 27 #include <linux/smp_lock.h>
28 #include <asm/mmu_context.h> 28 #include <asm/mmu_context.h>
29 #include <linux/interrupt.h> 29 #include <linux/interrupt.h>
30 #include <linux/capability.h> 30 #include <linux/capability.h>
31 #include <linux/completion.h> 31 #include <linux/completion.h>
32 #include <linux/kernel_stat.h> 32 #include <linux/kernel_stat.h>
33 #include <linux/debug_locks.h> 33 #include <linux/debug_locks.h>
34 #include <linux/security.h> 34 #include <linux/security.h>
35 #include <linux/notifier.h> 35 #include <linux/notifier.h>
36 #include <linux/profile.h> 36 #include <linux/profile.h>
37 #include <linux/freezer.h> 37 #include <linux/freezer.h>
38 #include <linux/vmalloc.h> 38 #include <linux/vmalloc.h>
39 #include <linux/blkdev.h> 39 #include <linux/blkdev.h>
40 #include <linux/delay.h> 40 #include <linux/delay.h>
41 #include <linux/smp.h> 41 #include <linux/smp.h>
42 #include <linux/threads.h> 42 #include <linux/threads.h>
43 #include <linux/timer.h> 43 #include <linux/timer.h>
44 #include <linux/rcupdate.h> 44 #include <linux/rcupdate.h>
45 #include <linux/cpu.h> 45 #include <linux/cpu.h>
46 #include <linux/cpuset.h> 46 #include <linux/cpuset.h>
47 #include <linux/percpu.h> 47 #include <linux/percpu.h>
48 #include <linux/kthread.h> 48 #include <linux/kthread.h>
49 #include <linux/seq_file.h> 49 #include <linux/seq_file.h>
50 #include <linux/syscalls.h> 50 #include <linux/syscalls.h>
51 #include <linux/times.h> 51 #include <linux/times.h>
52 #include <linux/tsacct_kern.h> 52 #include <linux/tsacct_kern.h>
53 #include <linux/kprobes.h> 53 #include <linux/kprobes.h>
54 #include <linux/delayacct.h> 54 #include <linux/delayacct.h>
55 #include <asm/tlb.h> 55 #include <asm/tlb.h>
56 56
57 #include <asm/unistd.h> 57 #include <asm/unistd.h>
58 58
59 /* 59 /*
60 * Scheduler clock - returns current time in nanosec units. 60 * Scheduler clock - returns current time in nanosec units.
61 * This is default implementation. 61 * This is default implementation.
62 * Architectures and sub-architectures can override this. 62 * Architectures and sub-architectures can override this.
63 */ 63 */
64 unsigned long long __attribute__((weak)) sched_clock(void) 64 unsigned long long __attribute__((weak)) sched_clock(void)
65 { 65 {
66 return (unsigned long long)jiffies * (1000000000 / HZ); 66 return (unsigned long long)jiffies * (1000000000 / HZ);
67 } 67 }
68 68
69 /* 69 /*
70 * Convert user-nice values [ -20 ... 0 ... 19 ] 70 * Convert user-nice values [ -20 ... 0 ... 19 ]
71 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 71 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
72 * and back. 72 * and back.
73 */ 73 */
74 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) 74 #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
75 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) 75 #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
76 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) 76 #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
77 77
78 /* 78 /*
79 * 'User priority' is the nice value converted to something we 79 * 'User priority' is the nice value converted to something we
80 * can work with better when scaling various scheduler parameters, 80 * can work with better when scaling various scheduler parameters,
81 * it's a [ 0 ... 39 ] range. 81 * it's a [ 0 ... 39 ] range.
82 */ 82 */
83 #define USER_PRIO(p) ((p)-MAX_RT_PRIO) 83 #define USER_PRIO(p) ((p)-MAX_RT_PRIO)
84 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) 84 #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
85 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) 85 #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
86 86
87 /* 87 /*
88 * Some helpers for converting nanosecond timing to jiffy resolution 88 * Some helpers for converting nanosecond timing to jiffy resolution
89 */ 89 */
90 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) 90 #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
91 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) 91 #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
92 92
93 /* 93 /*
94 * These are the 'tuning knobs' of the scheduler: 94 * These are the 'tuning knobs' of the scheduler:
95 * 95 *
96 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), 96 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
97 * default timeslice is 100 msecs, maximum timeslice is 800 msecs. 97 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
98 * Timeslices get refilled after they expire. 98 * Timeslices get refilled after they expire.
99 */ 99 */
100 #define MIN_TIMESLICE max(5 * HZ / 1000, 1) 100 #define MIN_TIMESLICE max(5 * HZ / 1000, 1)
101 #define DEF_TIMESLICE (100 * HZ / 1000) 101 #define DEF_TIMESLICE (100 * HZ / 1000)
102 #define ON_RUNQUEUE_WEIGHT 30 102 #define ON_RUNQUEUE_WEIGHT 30
103 #define CHILD_PENALTY 95 103 #define CHILD_PENALTY 95
104 #define PARENT_PENALTY 100 104 #define PARENT_PENALTY 100
105 #define EXIT_WEIGHT 3 105 #define EXIT_WEIGHT 3
106 #define PRIO_BONUS_RATIO 25 106 #define PRIO_BONUS_RATIO 25
107 #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) 107 #define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
108 #define INTERACTIVE_DELTA 2 108 #define INTERACTIVE_DELTA 2
109 #define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) 109 #define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
110 #define STARVATION_LIMIT (MAX_SLEEP_AVG) 110 #define STARVATION_LIMIT (MAX_SLEEP_AVG)
111 #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) 111 #define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
112 112
113 /* 113 /*
114 * If a task is 'interactive' then we reinsert it in the active 114 * If a task is 'interactive' then we reinsert it in the active
115 * array after it has expired its current timeslice. (it will not 115 * array after it has expired its current timeslice. (it will not
116 * continue to run immediately, it will still roundrobin with 116 * continue to run immediately, it will still roundrobin with
117 * other interactive tasks.) 117 * other interactive tasks.)
118 * 118 *
119 * This part scales the interactivity limit depending on niceness. 119 * This part scales the interactivity limit depending on niceness.
120 * 120 *
121 * We scale it linearly, offset by the INTERACTIVE_DELTA delta. 121 * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
122 * Here are a few examples of different nice levels: 122 * Here are a few examples of different nice levels:
123 * 123 *
124 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] 124 * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
125 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] 125 * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
126 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] 126 * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
127 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] 127 * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
128 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] 128 * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
129 * 129 *
130 * (the X axis represents the possible -5 ... 0 ... +5 dynamic 130 * (the X axis represents the possible -5 ... 0 ... +5 dynamic
131 * priority range a task can explore, a value of '1' means the 131 * priority range a task can explore, a value of '1' means the
132 * task is rated interactive.) 132 * task is rated interactive.)
133 * 133 *
134 * Ie. nice +19 tasks can never get 'interactive' enough to be 134 * Ie. nice +19 tasks can never get 'interactive' enough to be
135 * reinserted into the active array. And only heavily CPU-hog nice -20 135 * reinserted into the active array. And only heavily CPU-hog nice -20
136 * tasks will be expired. Default nice 0 tasks are somewhere between, 136 * tasks will be expired. Default nice 0 tasks are somewhere between,
137 * it takes some effort for them to get interactive, but it's not 137 * it takes some effort for them to get interactive, but it's not
138 * too hard. 138 * too hard.
139 */ 139 */
140 140
141 #define CURRENT_BONUS(p) \ 141 #define CURRENT_BONUS(p) \
142 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ 142 (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
143 MAX_SLEEP_AVG) 143 MAX_SLEEP_AVG)
144 144
145 #define GRANULARITY (10 * HZ / 1000 ? : 1) 145 #define GRANULARITY (10 * HZ / 1000 ? : 1)
146 146
147 #ifdef CONFIG_SMP 147 #ifdef CONFIG_SMP
148 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ 148 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
149 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ 149 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
150 num_online_cpus()) 150 num_online_cpus())
151 #else 151 #else
152 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \ 152 #define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
153 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) 153 (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
154 #endif 154 #endif
155 155
156 #define SCALE(v1,v1_max,v2_max) \ 156 #define SCALE(v1,v1_max,v2_max) \
157 (v1) * (v2_max) / (v1_max) 157 (v1) * (v2_max) / (v1_max)
158 158
159 #define DELTA(p) \ 159 #define DELTA(p) \
160 (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \ 160 (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
161 INTERACTIVE_DELTA) 161 INTERACTIVE_DELTA)
162 162
163 #define TASK_INTERACTIVE(p) \ 163 #define TASK_INTERACTIVE(p) \
164 ((p)->prio <= (p)->static_prio - DELTA(p)) 164 ((p)->prio <= (p)->static_prio - DELTA(p))
165 165
166 #define INTERACTIVE_SLEEP(p) \ 166 #define INTERACTIVE_SLEEP(p) \
167 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ 167 (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
168 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) 168 (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
169 169
170 #define TASK_PREEMPTS_CURR(p, rq) \ 170 #define TASK_PREEMPTS_CURR(p, rq) \
171 ((p)->prio < (rq)->curr->prio) 171 ((p)->prio < (rq)->curr->prio)
172 172
173 #define SCALE_PRIO(x, prio) \ 173 #define SCALE_PRIO(x, prio) \
174 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) 174 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
175 175
176 static unsigned int static_prio_timeslice(int static_prio) 176 static unsigned int static_prio_timeslice(int static_prio)
177 { 177 {
178 if (static_prio < NICE_TO_PRIO(0)) 178 if (static_prio < NICE_TO_PRIO(0))
179 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); 179 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
180 else 180 else
181 return SCALE_PRIO(DEF_TIMESLICE, static_prio); 181 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
182 } 182 }
183 183
184 /* 184 /*
185 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] 185 * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
186 * to time slice values: [800ms ... 100ms ... 5ms] 186 * to time slice values: [800ms ... 100ms ... 5ms]
187 * 187 *
188 * The higher a thread's priority, the bigger timeslices 188 * The higher a thread's priority, the bigger timeslices
189 * it gets during one round of execution. But even the lowest 189 * it gets during one round of execution. But even the lowest
190 * priority thread gets MIN_TIMESLICE worth of execution time. 190 * priority thread gets MIN_TIMESLICE worth of execution time.
191 */ 191 */
192 192
193 static inline unsigned int task_timeslice(struct task_struct *p) 193 static inline unsigned int task_timeslice(struct task_struct *p)
194 { 194 {
195 return static_prio_timeslice(p->static_prio); 195 return static_prio_timeslice(p->static_prio);
196 } 196 }
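
A hedged userspace sketch, not part of this patch, of the nice-to-timeslice mapping described in the comment above. MAX_PRIO = 140, MAX_USER_PRIO = 40, MAX_RT_PRIO = 100, DEF_TIMESLICE = 100, MIN_TIMESLICE = 5 and HZ = 1000 (so jiffies read as milliseconds) are assumed values; with them the output matches the [800ms ... 100ms ... 5ms] range quoted above.

#include <stdio.h>

#define MAX_RT_PRIO     100
#define MAX_PRIO        140
#define MAX_USER_PRIO   40
#define DEF_TIMESLICE   100
#define MIN_TIMESLICE   5
#define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
#define MAX(a, b)               ((a) > (b) ? (a) : (b))
#define SCALE_PRIO(x, prio) \
        MAX((x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)

static unsigned int timeslice(int static_prio)
{
        /* Same shape as static_prio_timeslice() above. */
        if (static_prio < NICE_TO_PRIO(0))
                return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
        return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}

int main(void)
{
        printf("nice -20 -> %u ms\n", timeslice(NICE_TO_PRIO(-20))); /* 800 */
        printf("nice   0 -> %u ms\n", timeslice(NICE_TO_PRIO(0)));   /* 100 */
        printf("nice +19 -> %u ms\n", timeslice(NICE_TO_PRIO(19)));  /*   5 */
        return 0;
}
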
197 197
198 /* 198 /*
199 * These are the runqueue data structures: 199 * These are the runqueue data structures:
200 */ 200 */
201 201
202 struct prio_array { 202 struct prio_array {
203 unsigned int nr_active; 203 unsigned int nr_active;
204 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ 204 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
205 struct list_head queue[MAX_PRIO]; 205 struct list_head queue[MAX_PRIO];
206 }; 206 };
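
The bitmap plus per-priority list layout is what makes picking the next task O(1): a find-first-bit scan lands on the highest-priority non-empty queue, and the delimiter bit guarantees the scan terminates even when nothing is runnable. The toy below, not part of this patch and not the kernel's implementation, illustrates the idea with a plain unsigned int bitmap and __builtin_ctz() standing in for the kernel's bitmap helpers; all names are illustrative.

#include <stdio.h>

#define NPRIO 8                         /* tiny stand-in for MAX_PRIO */

struct toy_prio_array {
        unsigned int nr_active;
        unsigned int bitmap;            /* bit p set => queue[p] non-empty */
        int queue_head[NPRIO];          /* stand-in for the list heads */
};

static int pick_next(const struct toy_prio_array *a)
{
        /* OR in the delimiter bit so the scan always finds something. */
        unsigned int map = a->bitmap | (1u << NPRIO);
        int prio = __builtin_ctz(map);  /* lowest set bit == highest priority */

        return prio < NPRIO ? a->queue_head[prio] : -1;         /* -1 == idle */
}

int main(void)
{
        struct toy_prio_array a = { .nr_active = 1, .bitmap = 1u << 5 };

        a.queue_head[5] = 42;                           /* one task, prio 5 */
        printf("next task: %d\n", pick_next(&a));       /* 42 */
        a.bitmap = 0;                                   /* nothing queued */
        printf("next task: %d\n", pick_next(&a));       /* -1 (idle) */
        return 0;
}
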
207 207
208 /* 208 /*
209 * This is the main, per-CPU runqueue data structure. 209 * This is the main, per-CPU runqueue data structure.
210 * 210 *
211 * Locking rule: in those places that want to lock multiple runqueues 211 * Locking rule: in those places that want to lock multiple runqueues
212 * (such as the load balancing or the thread migration code), lock 212 * (such as the load balancing or the thread migration code), lock
213 * acquire operations must be ordered by ascending runqueue address. 213 * acquire operations must be ordered by ascending runqueue address.
214 */ 214 */
215 struct rq { 215 struct rq {
216 spinlock_t lock; 216 spinlock_t lock;
217 217
218 /* 218 /*
219 * nr_running and cpu_load should be in the same cacheline because 219 * nr_running and cpu_load should be in the same cacheline because
220 * remote CPUs use both these fields when doing load calculation. 220 * remote CPUs use both these fields when doing load calculation.
221 */ 221 */
222 unsigned long nr_running; 222 unsigned long nr_running;
223 unsigned long raw_weighted_load; 223 unsigned long raw_weighted_load;
224 #ifdef CONFIG_SMP 224 #ifdef CONFIG_SMP
225 unsigned long cpu_load[3]; 225 unsigned long cpu_load[3];
226 #endif 226 #endif
227 unsigned long long nr_switches; 227 unsigned long long nr_switches;
228 228
229 /* 229 /*
230 * This is part of a global counter where only the total sum 230 * This is part of a global counter where only the total sum
231 * over all CPUs matters. A task can increase this counter on 231 * over all CPUs matters. A task can increase this counter on
232 * one CPU and if it got migrated afterwards it may decrease 232 * one CPU and if it got migrated afterwards it may decrease
233 * it on another CPU. Always updated under the runqueue lock: 233 * it on another CPU. Always updated under the runqueue lock:
234 */ 234 */
235 unsigned long nr_uninterruptible; 235 unsigned long nr_uninterruptible;
236 236
237 unsigned long expired_timestamp; 237 unsigned long expired_timestamp;
238 /* Cached timestamp set by update_cpu_clock() */ 238 /* Cached timestamp set by update_cpu_clock() */
239 unsigned long long most_recent_timestamp; 239 unsigned long long most_recent_timestamp;
240 struct task_struct *curr, *idle; 240 struct task_struct *curr, *idle;
241 unsigned long next_balance; 241 unsigned long next_balance;
242 struct mm_struct *prev_mm; 242 struct mm_struct *prev_mm;
243 struct prio_array *active, *expired, arrays[2]; 243 struct prio_array *active, *expired, arrays[2];
244 int best_expired_prio; 244 int best_expired_prio;
245 atomic_t nr_iowait; 245 atomic_t nr_iowait;
246 246
247 #ifdef CONFIG_SMP 247 #ifdef CONFIG_SMP
248 struct sched_domain *sd; 248 struct sched_domain *sd;
249 249
250 /* For active balancing */ 250 /* For active balancing */
251 int active_balance; 251 int active_balance;
252 int push_cpu; 252 int push_cpu;
253 int cpu; /* cpu of this runqueue */ 253 int cpu; /* cpu of this runqueue */
254 254
255 struct task_struct *migration_thread; 255 struct task_struct *migration_thread;
256 struct list_head migration_queue; 256 struct list_head migration_queue;
257 #endif 257 #endif
258 258
259 #ifdef CONFIG_SCHEDSTATS 259 #ifdef CONFIG_SCHEDSTATS
260 /* latency stats */ 260 /* latency stats */
261 struct sched_info rq_sched_info; 261 struct sched_info rq_sched_info;
262 262
263 /* sys_sched_yield() stats */ 263 /* sys_sched_yield() stats */
264 unsigned long yld_exp_empty; 264 unsigned long yld_exp_empty;
265 unsigned long yld_act_empty; 265 unsigned long yld_act_empty;
266 unsigned long yld_both_empty; 266 unsigned long yld_both_empty;
267 unsigned long yld_cnt; 267 unsigned long yld_cnt;
268 268
269 /* schedule() stats */ 269 /* schedule() stats */
270 unsigned long sched_switch; 270 unsigned long sched_switch;
271 unsigned long sched_cnt; 271 unsigned long sched_cnt;
272 unsigned long sched_goidle; 272 unsigned long sched_goidle;
273 273
274 /* try_to_wake_up() stats */ 274 /* try_to_wake_up() stats */
275 unsigned long ttwu_cnt; 275 unsigned long ttwu_cnt;
276 unsigned long ttwu_local; 276 unsigned long ttwu_local;
277 #endif 277 #endif
278 struct lock_class_key rq_lock_key; 278 struct lock_class_key rq_lock_key;
279 }; 279 };
280 280
281 static DEFINE_PER_CPU(struct rq, runqueues); 281 static DEFINE_PER_CPU(struct rq, runqueues);
282 282
283 static inline int cpu_of(struct rq *rq) 283 static inline int cpu_of(struct rq *rq)
284 { 284 {
285 #ifdef CONFIG_SMP 285 #ifdef CONFIG_SMP
286 return rq->cpu; 286 return rq->cpu;
287 #else 287 #else
288 return 0; 288 return 0;
289 #endif 289 #endif
290 } 290 }
291 291
292 /* 292 /*
293 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 293 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
294 * See detach_destroy_domains: synchronize_sched for details. 294 * See detach_destroy_domains: synchronize_sched for details.
295 * 295 *
296 * The domain tree of any CPU may only be accessed from within 296 * The domain tree of any CPU may only be accessed from within
297 * preempt-disabled sections. 297 * preempt-disabled sections.
298 */ 298 */
299 #define for_each_domain(cpu, __sd) \ 299 #define for_each_domain(cpu, __sd) \
300 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 300 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
301 301
302 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 302 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
303 #define this_rq() (&__get_cpu_var(runqueues)) 303 #define this_rq() (&__get_cpu_var(runqueues))
304 #define task_rq(p) cpu_rq(task_cpu(p)) 304 #define task_rq(p) cpu_rq(task_cpu(p))
305 #define cpu_curr(cpu) (cpu_rq(cpu)->curr) 305 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
306 306
307 #ifndef prepare_arch_switch 307 #ifndef prepare_arch_switch
308 # define prepare_arch_switch(next) do { } while (0) 308 # define prepare_arch_switch(next) do { } while (0)
309 #endif 309 #endif
310 #ifndef finish_arch_switch 310 #ifndef finish_arch_switch
311 # define finish_arch_switch(prev) do { } while (0) 311 # define finish_arch_switch(prev) do { } while (0)
312 #endif 312 #endif
313 313
314 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 314 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
315 static inline int task_running(struct rq *rq, struct task_struct *p) 315 static inline int task_running(struct rq *rq, struct task_struct *p)
316 { 316 {
317 return rq->curr == p; 317 return rq->curr == p;
318 } 318 }
319 319
320 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 320 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
321 { 321 {
322 } 322 }
323 323
324 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 324 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
325 { 325 {
326 #ifdef CONFIG_DEBUG_SPINLOCK 326 #ifdef CONFIG_DEBUG_SPINLOCK
327 /* this is a valid case when another task releases the spinlock */ 327 /* this is a valid case when another task releases the spinlock */
328 rq->lock.owner = current; 328 rq->lock.owner = current;
329 #endif 329 #endif
330 /* 330 /*
331 * If we are tracking spinlock dependencies then we have to 331 * If we are tracking spinlock dependencies then we have to
332 * fix up the runqueue lock - which gets 'carried over' from 332 * fix up the runqueue lock - which gets 'carried over' from
333 * prev into current: 333 * prev into current:
334 */ 334 */
335 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 335 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
336 336
337 spin_unlock_irq(&rq->lock); 337 spin_unlock_irq(&rq->lock);
338 } 338 }
339 339
340 #else /* __ARCH_WANT_UNLOCKED_CTXSW */ 340 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
341 static inline int task_running(struct rq *rq, struct task_struct *p) 341 static inline int task_running(struct rq *rq, struct task_struct *p)
342 { 342 {
343 #ifdef CONFIG_SMP 343 #ifdef CONFIG_SMP
344 return p->oncpu; 344 return p->oncpu;
345 #else 345 #else
346 return rq->curr == p; 346 return rq->curr == p;
347 #endif 347 #endif
348 } 348 }
349 349
350 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 350 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
351 { 351 {
352 #ifdef CONFIG_SMP 352 #ifdef CONFIG_SMP
353 /* 353 /*
354 * We can optimise this out completely for !SMP, because the 354 * We can optimise this out completely for !SMP, because the
355 * SMP rebalancing from interrupt is the only thing that cares 355 * SMP rebalancing from interrupt is the only thing that cares
356 * here. 356 * here.
357 */ 357 */
358 next->oncpu = 1; 358 next->oncpu = 1;
359 #endif 359 #endif
360 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 360 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
361 spin_unlock_irq(&rq->lock); 361 spin_unlock_irq(&rq->lock);
362 #else 362 #else
363 spin_unlock(&rq->lock); 363 spin_unlock(&rq->lock);
364 #endif 364 #endif
365 } 365 }
366 366
367 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 367 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
368 { 368 {
369 #ifdef CONFIG_SMP 369 #ifdef CONFIG_SMP
370 /* 370 /*
371 * After ->oncpu is cleared, the task can be moved to a different CPU. 371 * After ->oncpu is cleared, the task can be moved to a different CPU.
372 * We must ensure this doesn't happen until the switch is completely 372 * We must ensure this doesn't happen until the switch is completely
373 * finished. 373 * finished.
374 */ 374 */
375 smp_wmb(); 375 smp_wmb();
376 prev->oncpu = 0; 376 prev->oncpu = 0;
377 #endif 377 #endif
378 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 378 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
379 local_irq_enable(); 379 local_irq_enable();
380 #endif 380 #endif
381 } 381 }
382 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 382 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
383 383
384 /* 384 /*
385 * __task_rq_lock - lock the runqueue a given task resides on. 385 * __task_rq_lock - lock the runqueue a given task resides on.
386 * Must be called with interrupts disabled. 386 * Must be called with interrupts disabled.
387 */ 387 */
388 static inline struct rq *__task_rq_lock(struct task_struct *p) 388 static inline struct rq *__task_rq_lock(struct task_struct *p)
389 __acquires(rq->lock) 389 __acquires(rq->lock)
390 { 390 {
391 struct rq *rq; 391 struct rq *rq;
392 392
393 repeat_lock_task: 393 repeat_lock_task:
394 rq = task_rq(p); 394 rq = task_rq(p);
395 spin_lock(&rq->lock); 395 spin_lock(&rq->lock);
396 if (unlikely(rq != task_rq(p))) { 396 if (unlikely(rq != task_rq(p))) {
397 spin_unlock(&rq->lock); 397 spin_unlock(&rq->lock);
398 goto repeat_lock_task; 398 goto repeat_lock_task;
399 } 399 }
400 return rq; 400 return rq;
401 } 401 }
402 402
403 /* 403 /*
404 * task_rq_lock - lock the runqueue a given task resides on and disable 404 * task_rq_lock - lock the runqueue a given task resides on and disable
405 * interrupts. Note the ordering: we can safely look up the task_rq without 405 * interrupts. Note the ordering: we can safely look up the task_rq without
406 * explicitly disabling preemption. 406 * explicitly disabling preemption.
407 */ 407 */
408 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 408 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
409 __acquires(rq->lock) 409 __acquires(rq->lock)
410 { 410 {
411 struct rq *rq; 411 struct rq *rq;
412 412
413 repeat_lock_task: 413 repeat_lock_task:
414 local_irq_save(*flags); 414 local_irq_save(*flags);
415 rq = task_rq(p); 415 rq = task_rq(p);
416 spin_lock(&rq->lock); 416 spin_lock(&rq->lock);
417 if (unlikely(rq != task_rq(p))) { 417 if (unlikely(rq != task_rq(p))) {
418 spin_unlock_irqrestore(&rq->lock, *flags); 418 spin_unlock_irqrestore(&rq->lock, *flags);
419 goto repeat_lock_task; 419 goto repeat_lock_task;
420 } 420 }
421 return rq; 421 return rq;
422 } 422 }
423 423
424 static inline void __task_rq_unlock(struct rq *rq) 424 static inline void __task_rq_unlock(struct rq *rq)
425 __releases(rq->lock) 425 __releases(rq->lock)
426 { 426 {
427 spin_unlock(&rq->lock); 427 spin_unlock(&rq->lock);
428 } 428 }
429 429
430 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 430 static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
431 __releases(rq->lock) 431 __releases(rq->lock)
432 { 432 {
433 spin_unlock_irqrestore(&rq->lock, *flags); 433 spin_unlock_irqrestore(&rq->lock, *flags);
434 } 434 }
435 435
436 #ifdef CONFIG_SCHEDSTATS 436 #ifdef CONFIG_SCHEDSTATS
437 /* 437 /*
438 * bump this up when changing the output format or the meaning of an existing 438 * bump this up when changing the output format or the meaning of an existing
439 * format, so that tools can adapt (or abort) 439 * format, so that tools can adapt (or abort)
440 */ 440 */
441 #define SCHEDSTAT_VERSION 14 441 #define SCHEDSTAT_VERSION 14
442 442
443 static int show_schedstat(struct seq_file *seq, void *v) 443 static int show_schedstat(struct seq_file *seq, void *v)
444 { 444 {
445 int cpu; 445 int cpu;
446 446
447 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); 447 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
448 seq_printf(seq, "timestamp %lu\n", jiffies); 448 seq_printf(seq, "timestamp %lu\n", jiffies);
449 for_each_online_cpu(cpu) { 449 for_each_online_cpu(cpu) {
450 struct rq *rq = cpu_rq(cpu); 450 struct rq *rq = cpu_rq(cpu);
451 #ifdef CONFIG_SMP 451 #ifdef CONFIG_SMP
452 struct sched_domain *sd; 452 struct sched_domain *sd;
453 int dcnt = 0; 453 int dcnt = 0;
454 #endif 454 #endif
455 455
456 /* runqueue-specific stats */ 456 /* runqueue-specific stats */
457 seq_printf(seq, 457 seq_printf(seq,
458 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", 458 "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
459 cpu, rq->yld_both_empty, 459 cpu, rq->yld_both_empty,
460 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, 460 rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
461 rq->sched_switch, rq->sched_cnt, rq->sched_goidle, 461 rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
462 rq->ttwu_cnt, rq->ttwu_local, 462 rq->ttwu_cnt, rq->ttwu_local,
463 rq->rq_sched_info.cpu_time, 463 rq->rq_sched_info.cpu_time,
464 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); 464 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
465 465
466 seq_printf(seq, "\n"); 466 seq_printf(seq, "\n");
467 467
468 #ifdef CONFIG_SMP 468 #ifdef CONFIG_SMP
469 /* domain-specific stats */ 469 /* domain-specific stats */
470 preempt_disable(); 470 preempt_disable();
471 for_each_domain(cpu, sd) { 471 for_each_domain(cpu, sd) {
472 enum idle_type itype; 472 enum idle_type itype;
473 char mask_str[NR_CPUS]; 473 char mask_str[NR_CPUS];
474 474
475 cpumask_scnprintf(mask_str, NR_CPUS, sd->span); 475 cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
476 seq_printf(seq, "domain%d %s", dcnt++, mask_str); 476 seq_printf(seq, "domain%d %s", dcnt++, mask_str);
477 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; 477 for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
478 itype++) { 478 itype++) {
479 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " 479 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
480 "%lu", 480 "%lu",
481 sd->lb_cnt[itype], 481 sd->lb_cnt[itype],
482 sd->lb_balanced[itype], 482 sd->lb_balanced[itype],
483 sd->lb_failed[itype], 483 sd->lb_failed[itype],
484 sd->lb_imbalance[itype], 484 sd->lb_imbalance[itype],
485 sd->lb_gained[itype], 485 sd->lb_gained[itype],
486 sd->lb_hot_gained[itype], 486 sd->lb_hot_gained[itype],
487 sd->lb_nobusyq[itype], 487 sd->lb_nobusyq[itype],
488 sd->lb_nobusyg[itype]); 488 sd->lb_nobusyg[itype]);
489 } 489 }
490 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" 490 seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
491 " %lu %lu %lu\n", 491 " %lu %lu %lu\n",
492 sd->alb_cnt, sd->alb_failed, sd->alb_pushed, 492 sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
493 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, 493 sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
494 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, 494 sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
495 sd->ttwu_wake_remote, sd->ttwu_move_affine, 495 sd->ttwu_wake_remote, sd->ttwu_move_affine,
496 sd->ttwu_move_balance); 496 sd->ttwu_move_balance);
497 } 497 }
498 preempt_enable(); 498 preempt_enable();
499 #endif 499 #endif
500 } 500 }
501 return 0; 501 return 0;
502 } 502 }
503 503
504 static int schedstat_open(struct inode *inode, struct file *file) 504 static int schedstat_open(struct inode *inode, struct file *file)
505 { 505 {
506 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); 506 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
507 char *buf = kmalloc(size, GFP_KERNEL); 507 char *buf = kmalloc(size, GFP_KERNEL);
508 struct seq_file *m; 508 struct seq_file *m;
509 int res; 509 int res;
510 510
511 if (!buf) 511 if (!buf)
512 return -ENOMEM; 512 return -ENOMEM;
513 res = single_open(file, show_schedstat, NULL); 513 res = single_open(file, show_schedstat, NULL);
514 if (!res) { 514 if (!res) {
515 m = file->private_data; 515 m = file->private_data;
516 m->buf = buf; 516 m->buf = buf;
517 m->size = size; 517 m->size = size;
518 } else 518 } else
519 kfree(buf); 519 kfree(buf);
520 return res; 520 return res;
521 } 521 }
522 522
523 const struct file_operations proc_schedstat_operations = { 523 const struct file_operations proc_schedstat_operations = {
524 .open = schedstat_open, 524 .open = schedstat_open,
525 .read = seq_read, 525 .read = seq_read,
526 .llseek = seq_lseek, 526 .llseek = seq_lseek,
527 .release = single_release, 527 .release = single_release,
528 }; 528 };
529 529
530 /* 530 /*
531 * Expects runqueue lock to be held for atomicity of update 531 * Expects runqueue lock to be held for atomicity of update
532 */ 532 */
533 static inline void 533 static inline void
534 rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) 534 rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
535 { 535 {
536 if (rq) { 536 if (rq) {
537 rq->rq_sched_info.run_delay += delta_jiffies; 537 rq->rq_sched_info.run_delay += delta_jiffies;
538 rq->rq_sched_info.pcnt++; 538 rq->rq_sched_info.pcnt++;
539 } 539 }
540 } 540 }
541 541
542 /* 542 /*
543 * Expects runqueue lock to be held for atomicity of update 543 * Expects runqueue lock to be held for atomicity of update
544 */ 544 */
545 static inline void 545 static inline void
546 rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) 546 rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
547 { 547 {
548 if (rq) 548 if (rq)
549 rq->rq_sched_info.cpu_time += delta_jiffies; 549 rq->rq_sched_info.cpu_time += delta_jiffies;
550 } 550 }
551 # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) 551 # define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
552 # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) 552 # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
553 #else /* !CONFIG_SCHEDSTATS */ 553 #else /* !CONFIG_SCHEDSTATS */
554 static inline void 554 static inline void
555 rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) 555 rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
556 {} 556 {}
557 static inline void 557 static inline void
558 rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) 558 rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
559 {} 559 {}
560 # define schedstat_inc(rq, field) do { } while (0) 560 # define schedstat_inc(rq, field) do { } while (0)
561 # define schedstat_add(rq, field, amt) do { } while (0) 561 # define schedstat_add(rq, field, amt) do { } while (0)
562 #endif 562 #endif
563 563
564 /* 564 /*
565 * this_rq_lock - lock this runqueue and disable interrupts. 565 * this_rq_lock - lock this runqueue and disable interrupts.
566 */ 566 */
567 static inline struct rq *this_rq_lock(void) 567 static inline struct rq *this_rq_lock(void)
568 __acquires(rq->lock) 568 __acquires(rq->lock)
569 { 569 {
570 struct rq *rq; 570 struct rq *rq;
571 571
572 local_irq_disable(); 572 local_irq_disable();
573 rq = this_rq(); 573 rq = this_rq();
574 spin_lock(&rq->lock); 574 spin_lock(&rq->lock);
575 575
576 return rq; 576 return rq;
577 } 577 }
578 578
579 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 579 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
580 /* 580 /*
581 * Called when a process is dequeued from the active array and given 581 * Called when a process is dequeued from the active array and given
582 * the cpu. We should note that with the exception of interactive 582 * the cpu. We should note that with the exception of interactive
583 * tasks, the expired queue will become the active queue after the active 583 * tasks, the expired queue will become the active queue after the active
584 * queue is empty, without explicitly dequeuing and requeuing tasks in the 584 * queue is empty, without explicitly dequeuing and requeuing tasks in the
585 * expired queue. (Interactive tasks may be requeued directly to the 585 * expired queue. (Interactive tasks may be requeued directly to the
586 * active queue, thus delaying tasks in the expired queue from running; 586 * active queue, thus delaying tasks in the expired queue from running;
587 * see scheduler_tick()). 587 * see scheduler_tick()).
588 * 588 *
589 * This function is only called from sched_info_arrive(), rather than 589 * This function is only called from sched_info_arrive(), rather than
590 * dequeue_task(). Even though a task may be queued and dequeued multiple 590 * dequeue_task(). Even though a task may be queued and dequeued multiple
591 * times as it is shuffled about, we're really interested in knowing how 591 * times as it is shuffled about, we're really interested in knowing how
592 * long it was from the *first* time it was queued to the time that it 592 * long it was from the *first* time it was queued to the time that it
593 * finally hit a cpu. 593 * finally hit a cpu.
594 */ 594 */
595 static inline void sched_info_dequeued(struct task_struct *t) 595 static inline void sched_info_dequeued(struct task_struct *t)
596 { 596 {
597 t->sched_info.last_queued = 0; 597 t->sched_info.last_queued = 0;
598 } 598 }
599 599
600 /* 600 /*
601 * Called when a task finally hits the cpu. We can now calculate how 601 * Called when a task finally hits the cpu. We can now calculate how
602 * long it was waiting to run. We also note when it began so that we 602 * long it was waiting to run. We also note when it began so that we
603 * can keep stats on how long its timeslice is. 603 * can keep stats on how long its timeslice is.
604 */ 604 */
605 static void sched_info_arrive(struct task_struct *t) 605 static void sched_info_arrive(struct task_struct *t)
606 { 606 {
607 unsigned long now = jiffies, delta_jiffies = 0; 607 unsigned long now = jiffies, delta_jiffies = 0;
608 608
609 if (t->sched_info.last_queued) 609 if (t->sched_info.last_queued)
610 delta_jiffies = now - t->sched_info.last_queued; 610 delta_jiffies = now - t->sched_info.last_queued;
611 sched_info_dequeued(t); 611 sched_info_dequeued(t);
612 t->sched_info.run_delay += delta_jiffies; 612 t->sched_info.run_delay += delta_jiffies;
613 t->sched_info.last_arrival = now; 613 t->sched_info.last_arrival = now;
614 t->sched_info.pcnt++; 614 t->sched_info.pcnt++;
615 615
616 rq_sched_info_arrive(task_rq(t), delta_jiffies); 616 rq_sched_info_arrive(task_rq(t), delta_jiffies);
617 } 617 }
618 618
619 /* 619 /*
620 * Called when a process is queued into either the active or expired 620 * Called when a process is queued into either the active or expired
621 * array. The time is noted and later used to determine how long the task 621 * array. The time is noted and later used to determine how long the task
622 * had to wait to reach the cpu. Since the expired queue will 622 * had to wait to reach the cpu. Since the expired queue will
623 * become the active queue after active queue is empty, without dequeuing 623 * become the active queue after active queue is empty, without dequeuing
624 * and requeuing any tasks, we are interested in queuing to either. It 624 * and requeuing any tasks, we are interested in queuing to either. It
625 * is unusual but not impossible for tasks to be dequeued and immediately 625 * is unusual but not impossible for tasks to be dequeued and immediately
626 * requeued in the same or another array: this can happen in sched_yield(), 626 * requeued in the same or another array: this can happen in sched_yield(),
627 * set_user_nice(), and even load_balance() as it moves tasks from runqueue 627 * set_user_nice(), and even load_balance() as it moves tasks from runqueue
628 * to runqueue. 628 * to runqueue.
629 * 629 *
630 * This function is only called from enqueue_task(), but also only updates 630 * This function is only called from enqueue_task(), but also only updates
631 * the timestamp if it is not already set. It's assumed that 631 * the timestamp if it is not already set. It's assumed that
632 * sched_info_dequeued() will clear that stamp when appropriate. 632 * sched_info_dequeued() will clear that stamp when appropriate.
633 */ 633 */
634 static inline void sched_info_queued(struct task_struct *t) 634 static inline void sched_info_queued(struct task_struct *t)
635 { 635 {
636 if (unlikely(sched_info_on())) 636 if (unlikely(sched_info_on()))
637 if (!t->sched_info.last_queued) 637 if (!t->sched_info.last_queued)
638 t->sched_info.last_queued = jiffies; 638 t->sched_info.last_queued = jiffies;
639 } 639 }
640 640
641 /* 641 /*
642 * Called when a process ceases being the active-running process, either 642 * Called when a process ceases being the active-running process, either
643 * voluntarily or involuntarily. Now we can calculate how long we ran. 643 * voluntarily or involuntarily. Now we can calculate how long we ran.
644 */ 644 */
645 static inline void sched_info_depart(struct task_struct *t) 645 static inline void sched_info_depart(struct task_struct *t)
646 { 646 {
647 unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; 647 unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival;
648 648
649 t->sched_info.cpu_time += delta_jiffies; 649 t->sched_info.cpu_time += delta_jiffies;
650 rq_sched_info_depart(task_rq(t), delta_jiffies); 650 rq_sched_info_depart(task_rq(t), delta_jiffies);
651 } 651 }
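
A small userspace model, not part of this patch, of the bookkeeping done by sched_info_queued()/sched_info_arrive()/sched_info_depart() above, driven by a fake jiffies counter. The struct and function names are illustrative and the numbers are made up.

#include <stdio.h>

struct toy_sched_info {
        unsigned long last_queued;      /* when we first entered a runqueue */
        unsigned long last_arrival;     /* when we last got the cpu */
        unsigned long run_delay;        /* total time spent waiting to run */
        unsigned long cpu_time;         /* total time spent on the cpu */
        unsigned long pcnt;             /* number of times we got the cpu */
};

static void info_queued(struct toy_sched_info *t, unsigned long now)
{
        if (!t->last_queued)            /* keep the *first* queueing time */
                t->last_queued = now;
}

static void info_arrive(struct toy_sched_info *t, unsigned long now)
{
        if (t->last_queued)
                t->run_delay += now - t->last_queued;
        t->last_queued = 0;             /* what sched_info_dequeued() does */
        t->last_arrival = now;
        t->pcnt++;
}

static void info_depart(struct toy_sched_info *t, unsigned long now)
{
        t->cpu_time += now - t->last_arrival;
}

int main(void)
{
        struct toy_sched_info t = { 0 };

        info_queued(&t, 100);           /* woken up, placed on a runqueue */
        info_arrive(&t, 103);           /* got the cpu: waited 3 jiffies */
        info_depart(&t, 110);           /* switched out: ran for 7 jiffies */
        printf("run_delay=%lu cpu_time=%lu pcnt=%lu\n",
               t.run_delay, t.cpu_time, t.pcnt);        /* 3 7 1 */
        return 0;
}
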
652 652
653 /* 653 /*
654 * Called when tasks are switched involuntarily, typically due to expiring 654 * Called when tasks are switched involuntarily, typically due to expiring
655 * their time slice. (This may also be called when switching to or from 655 * their time slice. (This may also be called when switching to or from
656 * the idle task.) We are only called when prev != next. 656 * the idle task.) We are only called when prev != next.
657 */ 657 */
658 static inline void 658 static inline void
659 __sched_info_switch(struct task_struct *prev, struct task_struct *next) 659 __sched_info_switch(struct task_struct *prev, struct task_struct *next)
660 { 660 {
661 struct rq *rq = task_rq(prev); 661 struct rq *rq = task_rq(prev);
662 662
663 /* 663 /*
664 * prev now departs the cpu. It's not interesting to record 664 * prev now departs the cpu. It's not interesting to record
665 * stats about how efficient we were at scheduling the idle 665 * stats about how efficient we were at scheduling the idle
666 * process, however. 666 * process, however.
667 */ 667 */
668 if (prev != rq->idle) 668 if (prev != rq->idle)
669 sched_info_depart(prev); 669 sched_info_depart(prev);
670 670
671 if (next != rq->idle) 671 if (next != rq->idle)
672 sched_info_arrive(next); 672 sched_info_arrive(next);
673 } 673 }
674 static inline void 674 static inline void
675 sched_info_switch(struct task_struct *prev, struct task_struct *next) 675 sched_info_switch(struct task_struct *prev, struct task_struct *next)
676 { 676 {
677 if (unlikely(sched_info_on())) 677 if (unlikely(sched_info_on()))
678 __sched_info_switch(prev, next); 678 __sched_info_switch(prev, next);
679 } 679 }
680 #else 680 #else
681 #define sched_info_queued(t) do { } while (0) 681 #define sched_info_queued(t) do { } while (0)
682 #define sched_info_switch(t, next) do { } while (0) 682 #define sched_info_switch(t, next) do { } while (0)
683 #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ 683 #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
684 684
685 /* 685 /*
686 * Adding/removing a task to/from a priority array: 686 * Adding/removing a task to/from a priority array:
687 */ 687 */
688 static void dequeue_task(struct task_struct *p, struct prio_array *array) 688 static void dequeue_task(struct task_struct *p, struct prio_array *array)
689 { 689 {
690 array->nr_active--; 690 array->nr_active--;
691 list_del(&p->run_list); 691 list_del(&p->run_list);
692 if (list_empty(array->queue + p->prio)) 692 if (list_empty(array->queue + p->prio))
693 __clear_bit(p->prio, array->bitmap); 693 __clear_bit(p->prio, array->bitmap);
694 } 694 }
695 695
696 static void enqueue_task(struct task_struct *p, struct prio_array *array) 696 static void enqueue_task(struct task_struct *p, struct prio_array *array)
697 { 697 {
698 sched_info_queued(p); 698 sched_info_queued(p);
699 list_add_tail(&p->run_list, array->queue + p->prio); 699 list_add_tail(&p->run_list, array->queue + p->prio);
700 __set_bit(p->prio, array->bitmap); 700 __set_bit(p->prio, array->bitmap);
701 array->nr_active++; 701 array->nr_active++;
702 p->array = array; 702 p->array = array;
703 } 703 }
704 704
705 /* 705 /*
706 * Put task to the end of the run list without the overhead of dequeue 706 * Put task to the end of the run list without the overhead of dequeue
707 * followed by enqueue. 707 * followed by enqueue.
708 */ 708 */
709 static void requeue_task(struct task_struct *p, struct prio_array *array) 709 static void requeue_task(struct task_struct *p, struct prio_array *array)
710 { 710 {
711 list_move_tail(&p->run_list, array->queue + p->prio); 711 list_move_tail(&p->run_list, array->queue + p->prio);
712 } 712 }
713 713
714 static inline void 714 static inline void
715 enqueue_task_head(struct task_struct *p, struct prio_array *array) 715 enqueue_task_head(struct task_struct *p, struct prio_array *array)
716 { 716 {
717 list_add(&p->run_list, array->queue + p->prio); 717 list_add(&p->run_list, array->queue + p->prio);
718 __set_bit(p->prio, array->bitmap); 718 __set_bit(p->prio, array->bitmap);
719 array->nr_active++; 719 array->nr_active++;
720 p->array = array; 720 p->array = array;
721 } 721 }
722 722
723 /* 723 /*
724 * __normal_prio - return the priority that is based on the static 724 * __normal_prio - return the priority that is based on the static
725 * priority but is modified by bonuses/penalties. 725 * priority but is modified by bonuses/penalties.
726 * 726 *
727 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] 727 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
728 * into the -5 ... 0 ... +5 bonus/penalty range. 728 * into the -5 ... 0 ... +5 bonus/penalty range.
729 * 729 *
730 * We use 25% of the full 0...39 priority range so that: 730 * We use 25% of the full 0...39 priority range so that:
731 * 731 *
732 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. 732 * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
733 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. 733 * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
734 * 734 *
735 * Both properties are important to certain workloads. 735 * Both properties are important to certain workloads.
736 */ 736 */
737 737
738 static inline int __normal_prio(struct task_struct *p) 738 static inline int __normal_prio(struct task_struct *p)
739 { 739 {
740 int bonus, prio; 740 int bonus, prio;
741 741
742 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; 742 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
743 743
744 prio = p->static_prio - bonus; 744 prio = p->static_prio - bonus;
745 if (prio < MAX_RT_PRIO) 745 if (prio < MAX_RT_PRIO)
746 prio = MAX_RT_PRIO; 746 prio = MAX_RT_PRIO;
747 if (prio > MAX_PRIO-1) 747 if (prio > MAX_PRIO-1)
748 prio = MAX_PRIO-1; 748 prio = MAX_PRIO-1;
749 return prio; 749 return prio;
750 } 750 }
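
A worked example, not part of this patch, of the -5 ... +5 bonus applied by __normal_prio(). MAX_BONUS = 10, MAX_RT_PRIO = 100, MAX_PRIO = 140 and a nice-0 static priority of 120 are assumed values; bonus_in stands for CURRENT_BONUS(p), i.e. the sleep average already scaled into 0..MAX_BONUS.

#include <stdio.h>

#define MAX_BONUS       10
#define MAX_RT_PRIO     100
#define MAX_PRIO        140

static int toy_normal_prio(int static_prio, int bonus_in)
{
        /* bonus_in plays the role of CURRENT_BONUS(p): 0..MAX_BONUS. */
        int prio = static_prio - (bonus_in - MAX_BONUS / 2);

        if (prio < MAX_RT_PRIO)
                prio = MAX_RT_PRIO;
        if (prio > MAX_PRIO - 1)
                prio = MAX_PRIO - 1;
        return prio;
}

int main(void)
{
        printf("cpu hog : %d\n", toy_normal_prio(120, 0));             /* 125 */
        printf("average : %d\n", toy_normal_prio(120, MAX_BONUS / 2)); /* 120 */
        printf("sleeper : %d\n", toy_normal_prio(120, MAX_BONUS));     /* 115 */
        return 0;
}
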
751 751
752 /* 752 /*
753 * To aid in avoiding the subversion of "niceness" due to uneven distribution 753 * To aid in avoiding the subversion of "niceness" due to uneven distribution
754 * of tasks with abnormal "nice" values across CPUs, the contribution that 754 * of tasks with abnormal "nice" values across CPUs, the contribution that
755 * each task makes to its run queue's load is weighted according to its 755 * each task makes to its run queue's load is weighted according to its
756 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a 756 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
757 * scaled version of the new time slice allocation that they receive on time 757 * scaled version of the new time slice allocation that they receive on time
758 * slice expiry etc. 758 * slice expiry etc.
759 */ 759 */
760 760
761 /* 761 /*
762 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE 762 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
763 * If static_prio_timeslice() is ever changed to break this assumption then 763 * If static_prio_timeslice() is ever changed to break this assumption then
764 * this code will need modification 764 * this code will need modification
765 */ 765 */
766 #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE 766 #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
767 #define LOAD_WEIGHT(lp) \ 767 #define LOAD_WEIGHT(lp) \
768 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) 768 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
769 #define PRIO_TO_LOAD_WEIGHT(prio) \ 769 #define PRIO_TO_LOAD_WEIGHT(prio) \
770 LOAD_WEIGHT(static_prio_timeslice(prio)) 770 LOAD_WEIGHT(static_prio_timeslice(prio))
771 #define RTPRIO_TO_LOAD_WEIGHT(rp) \ 771 #define RTPRIO_TO_LOAD_WEIGHT(rp) \
772 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) 772 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
773 773
774 static void set_load_weight(struct task_struct *p) 774 static void set_load_weight(struct task_struct *p)
775 { 775 {
776 if (has_rt_policy(p)) { 776 if (has_rt_policy(p)) {
777 #ifdef CONFIG_SMP 777 #ifdef CONFIG_SMP
778 if (p == task_rq(p)->migration_thread) 778 if (p == task_rq(p)->migration_thread)
779 /* 779 /*
780 * The migration thread does the actual balancing. 780 * The migration thread does the actual balancing.
781 * Giving its load any weight will skew balancing 781 * Giving its load any weight will skew balancing
782 * adversely. 782 * adversely.
783 */ 783 */
784 p->load_weight = 0; 784 p->load_weight = 0;
785 else 785 else
786 #endif 786 #endif
787 p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); 787 p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
788 } else 788 } else
789 p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); 789 p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
790 } 790 }
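
Rough numbers, not part of this patch, for the load weights defined above, assuming SCHED_LOAD_SCALE = 128, HZ = 1000 and the 800/100/5 ms timeslices for nice -20/0/+19 quoted earlier. A nice-0 task then weighs exactly SCHED_LOAD_SCALE, a nice -20 task counts as eight of them, and a nice +19 task is nearly invisible to the load balancer.

#include <stdio.h>

#define SCHED_LOAD_SCALE        128UL
#define TIME_SLICE_NICE_ZERO    100     /* DEF_TIMESLICE in jiffies at HZ=1000 */
#define LOAD_WEIGHT(lp)         (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)

int main(void)
{
        printf("nice -20: %lu\n", LOAD_WEIGHT(800));    /* 1024 */
        printf("nice   0: %lu\n", LOAD_WEIGHT(100));    /*  128 */
        printf("nice +19: %lu\n", LOAD_WEIGHT(5));      /*    6 */
        return 0;
}
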
791 791
792 static inline void 792 static inline void
793 inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) 793 inc_raw_weighted_load(struct rq *rq, const struct task_struct *p)
794 { 794 {
795 rq->raw_weighted_load += p->load_weight; 795 rq->raw_weighted_load += p->load_weight;
796 } 796 }
797 797
798 static inline void 798 static inline void
799 dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) 799 dec_raw_weighted_load(struct rq *rq, const struct task_struct *p)
800 { 800 {
801 rq->raw_weighted_load -= p->load_weight; 801 rq->raw_weighted_load -= p->load_weight;
802 } 802 }
803 803
804 static inline void inc_nr_running(struct task_struct *p, struct rq *rq) 804 static inline void inc_nr_running(struct task_struct *p, struct rq *rq)
805 { 805 {
806 rq->nr_running++; 806 rq->nr_running++;
807 inc_raw_weighted_load(rq, p); 807 inc_raw_weighted_load(rq, p);
808 } 808 }
809 809
810 static inline void dec_nr_running(struct task_struct *p, struct rq *rq) 810 static inline void dec_nr_running(struct task_struct *p, struct rq *rq)
811 { 811 {
812 rq->nr_running--; 812 rq->nr_running--;
813 dec_raw_weighted_load(rq, p); 813 dec_raw_weighted_load(rq, p);
814 } 814 }
815 815
816 /* 816 /*
817 * Calculate the expected normal priority: i.e. priority 817 * Calculate the expected normal priority: i.e. priority
818 * without taking RT-inheritance into account. Might be 818 * without taking RT-inheritance into account. Might be
819 * boosted by interactivity modifiers. Changes upon fork, 819 * boosted by interactivity modifiers. Changes upon fork,
820 * setprio syscalls, and whenever the interactivity 820 * setprio syscalls, and whenever the interactivity
821 * estimator recalculates. 821 * estimator recalculates.
822 */ 822 */
823 static inline int normal_prio(struct task_struct *p) 823 static inline int normal_prio(struct task_struct *p)
824 { 824 {
825 int prio; 825 int prio;
826 826
827 if (has_rt_policy(p)) 827 if (has_rt_policy(p))
828 prio = MAX_RT_PRIO-1 - p->rt_priority; 828 prio = MAX_RT_PRIO-1 - p->rt_priority;
829 else 829 else
830 prio = __normal_prio(p); 830 prio = __normal_prio(p);
831 return prio; 831 return prio;
832 } 832 }
833 833
834 /* 834 /*
835 * Calculate the current priority, i.e. the priority 835 * Calculate the current priority, i.e. the priority
836 * taken into account by the scheduler. This value might 836 * taken into account by the scheduler. This value might
837 * be boosted by RT tasks, or might be boosted by 837 * be boosted by RT tasks, or might be boosted by
838 * interactivity modifiers. Will be RT if the task got 838 * interactivity modifiers. Will be RT if the task got
839 * RT-boosted. If not then it returns p->normal_prio. 839 * RT-boosted. If not then it returns p->normal_prio.
840 */ 840 */
841 static int effective_prio(struct task_struct *p) 841 static int effective_prio(struct task_struct *p)
842 { 842 {
843 p->normal_prio = normal_prio(p); 843 p->normal_prio = normal_prio(p);
844 /* 844 /*
845 * If we are RT tasks or we were boosted to RT priority, 845 * If we are RT tasks or we were boosted to RT priority,
846 * keep the priority unchanged. Otherwise, update priority 846 * keep the priority unchanged. Otherwise, update priority
847 * to the normal priority: 847 * to the normal priority:
848 */ 848 */
849 if (!rt_prio(p->prio)) 849 if (!rt_prio(p->prio))
850 return p->normal_prio; 850 return p->normal_prio;
851 return p->prio; 851 return p->prio;
852 } 852 }
853 853
854 /* 854 /*
855 * __activate_task - move a task to the runqueue. 855 * __activate_task - move a task to the runqueue.
856 */ 856 */
857 static void __activate_task(struct task_struct *p, struct rq *rq) 857 static void __activate_task(struct task_struct *p, struct rq *rq)
858 { 858 {
859 struct prio_array *target = rq->active; 859 struct prio_array *target = rq->active;
860 860
861 if (batch_task(p)) 861 if (batch_task(p))
862 target = rq->expired; 862 target = rq->expired;
863 enqueue_task(p, target); 863 enqueue_task(p, target);
864 inc_nr_running(p, rq); 864 inc_nr_running(p, rq);
865 } 865 }
866 866
867 /* 867 /*
868 * __activate_idle_task - move idle task to the _front_ of runqueue. 868 * __activate_idle_task - move idle task to the _front_ of runqueue.
869 */ 869 */
870 static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) 870 static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
871 { 871 {
872 enqueue_task_head(p, rq->active); 872 enqueue_task_head(p, rq->active);
873 inc_nr_running(p, rq); 873 inc_nr_running(p, rq);
874 } 874 }
875 875
876 /* 876 /*
877 * Recalculate p->normal_prio and p->prio after having slept, 877 * Recalculate p->normal_prio and p->prio after having slept,
878 * updating the sleep-average too: 878 * updating the sleep-average too:
879 */ 879 */
880 static int recalc_task_prio(struct task_struct *p, unsigned long long now) 880 static int recalc_task_prio(struct task_struct *p, unsigned long long now)
881 { 881 {
882 /* Caller must always ensure 'now >= p->timestamp' */ 882 /* Caller must always ensure 'now >= p->timestamp' */
883 unsigned long sleep_time = now - p->timestamp; 883 unsigned long sleep_time = now - p->timestamp;
884 884
885 if (batch_task(p)) 885 if (batch_task(p))
886 sleep_time = 0; 886 sleep_time = 0;
887 887
888 if (likely(sleep_time > 0)) { 888 if (likely(sleep_time > 0)) {
889 /* 889 /*
890 * This ceiling is set to the lowest priority that would allow 890 * This ceiling is set to the lowest priority that would allow
891 * a task to be reinserted into the active array on timeslice 891 * a task to be reinserted into the active array on timeslice
892 * completion. 892 * completion.
893 */ 893 */
894 unsigned long ceiling = INTERACTIVE_SLEEP(p); 894 unsigned long ceiling = INTERACTIVE_SLEEP(p);
895 895
896 if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { 896 if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
897 /* 897 /*
898 * Prevents user tasks from achieving best priority 898 * Prevents user tasks from achieving best priority
899 * with one single large enough sleep. 899 * with one single large enough sleep.
900 */ 900 */
901 p->sleep_avg = ceiling; 901 p->sleep_avg = ceiling;
902 /* 902 /*
903 * Using INTERACTIVE_SLEEP() as a ceiling places a 903 * Using INTERACTIVE_SLEEP() as a ceiling places a
904 * nice(0) task 1ms sleep away from promotion, and 904 * nice(0) task 1ms sleep away from promotion, and
905 * gives it 700ms to round-robin with no chance of 905 * gives it 700ms to round-robin with no chance of
906 * being demoted. This is more than generous, so 906 * being demoted. This is more than generous, so
907 * mark this sleep as non-interactive to prevent the 907 * mark this sleep as non-interactive to prevent the
908 * on-runqueue bonus logic from intervening should 908 * on-runqueue bonus logic from intervening should
909 * this task not receive cpu immediately. 909 * this task not receive cpu immediately.
910 */ 910 */
911 p->sleep_type = SLEEP_NONINTERACTIVE; 911 p->sleep_type = SLEEP_NONINTERACTIVE;
912 } else { 912 } else {
913 /* 913 /*
914 * Tasks waking from uninterruptible sleep are 914 * Tasks waking from uninterruptible sleep are
915 * limited in their sleep_avg rise as they 915 * limited in their sleep_avg rise as they
916 * are likely to be waiting on I/O 916 * are likely to be waiting on I/O
917 */ 917 */
918 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { 918 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
919 if (p->sleep_avg >= ceiling) 919 if (p->sleep_avg >= ceiling)
920 sleep_time = 0; 920 sleep_time = 0;
921 else if (p->sleep_avg + sleep_time >= 921 else if (p->sleep_avg + sleep_time >=
922 ceiling) { 922 ceiling) {
923 p->sleep_avg = ceiling; 923 p->sleep_avg = ceiling;
924 sleep_time = 0; 924 sleep_time = 0;
925 } 925 }
926 } 926 }
927 927
928 /* 928 /*
929 * This code gives a bonus to interactive tasks. 929 * This code gives a bonus to interactive tasks.
930 * 930 *
931 * The boost works by updating the 'average sleep time' 931 * The boost works by updating the 'average sleep time'
932 * value here, based on ->timestamp. The more time a 932 * value here, based on ->timestamp. The more time a
933 * task spends sleeping, the higher the average gets - 933 * task spends sleeping, the higher the average gets -
934 * and the higher the priority boost gets as well. 934 * and the higher the priority boost gets as well.
935 */ 935 */
936 p->sleep_avg += sleep_time; 936 p->sleep_avg += sleep_time;
937 937
938 } 938 }
939 if (p->sleep_avg > NS_MAX_SLEEP_AVG) 939 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
940 p->sleep_avg = NS_MAX_SLEEP_AVG; 940 p->sleep_avg = NS_MAX_SLEEP_AVG;
941 } 941 }
942 942
943 return effective_prio(p); 943 return effective_prio(p);
944 } 944 }
945 945
946 /* 946 /*
947 * activate_task - move a task to the runqueue and do priority recalculation 947 * activate_task - move a task to the runqueue and do priority recalculation
948 * 948 *
949 * Update all the scheduling statistics stuff. (sleep average 949 * Update all the scheduling statistics stuff. (sleep average
950 * calculation, priority modifiers, etc.) 950 * calculation, priority modifiers, etc.)
951 */ 951 */
952 static void activate_task(struct task_struct *p, struct rq *rq, int local) 952 static void activate_task(struct task_struct *p, struct rq *rq, int local)
953 { 953 {
954 unsigned long long now; 954 unsigned long long now;
955 955
956 if (rt_task(p)) 956 if (rt_task(p))
957 goto out; 957 goto out;
958 958
959 now = sched_clock(); 959 now = sched_clock();
960 #ifdef CONFIG_SMP 960 #ifdef CONFIG_SMP
961 if (!local) { 961 if (!local) {
962 /* Compensate for drifting sched_clock */ 962 /* Compensate for drifting sched_clock */
963 struct rq *this_rq = this_rq(); 963 struct rq *this_rq = this_rq();
964 now = (now - this_rq->most_recent_timestamp) 964 now = (now - this_rq->most_recent_timestamp)
965 + rq->most_recent_timestamp; 965 + rq->most_recent_timestamp;
966 } 966 }
967 #endif 967 #endif
968 968
969 /* 969 /*
970 * Sleep time is in units of nanosecs, so shift by 20 to get a 970 * Sleep time is in units of nanosecs, so shift by 20 to get a
971 * milliseconds-range estimation of the amount of time that the task 971 * milliseconds-range estimation of the amount of time that the task
972 * spent sleeping: 972 * spent sleeping:
973 */ 973 */
974 if (unlikely(prof_on == SLEEP_PROFILING)) { 974 if (unlikely(prof_on == SLEEP_PROFILING)) {
975 if (p->state == TASK_UNINTERRUPTIBLE) 975 if (p->state == TASK_UNINTERRUPTIBLE)
976 profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), 976 profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
977 (now - p->timestamp) >> 20); 977 (now - p->timestamp) >> 20);
978 } 978 }
979 979
980 p->prio = recalc_task_prio(p, now); 980 p->prio = recalc_task_prio(p, now);
981 981
982 /* 982 /*
983 * This checks to make sure it's not an uninterruptible task 983 * This checks to make sure it's not an uninterruptible task
984 * that is now waking up. 984 * that is now waking up.
985 */ 985 */
986 if (p->sleep_type == SLEEP_NORMAL) { 986 if (p->sleep_type == SLEEP_NORMAL) {
987 /* 987 /*
988 * Tasks which were woken up by interrupts (i.e. hw events) 988 * Tasks which were woken up by interrupts (i.e. hw events)
989 * are most likely of interactive nature. So we give them 989 * are most likely of interactive nature. So we give them
990 * the credit of extending their sleep time to the period 990 * the credit of extending their sleep time to the period
991 * of time they spend on the runqueue, waiting for execution 991 * of time they spend on the runqueue, waiting for execution
992 * on a CPU, first time around: 992 * on a CPU, first time around:
993 */ 993 */
994 if (in_interrupt()) 994 if (in_interrupt())
995 p->sleep_type = SLEEP_INTERRUPTED; 995 p->sleep_type = SLEEP_INTERRUPTED;
996 else { 996 else {
997 /* 997 /*
998 * Normal first-time wakeups get a credit too for 998 * Normal first-time wakeups get a credit too for
999 * on-runqueue time, but it will be weighted down: 999 * on-runqueue time, but it will be weighted down:
1000 */ 1000 */
1001 p->sleep_type = SLEEP_INTERACTIVE; 1001 p->sleep_type = SLEEP_INTERACTIVE;
1002 } 1002 }
1003 } 1003 }
1004 p->timestamp = now; 1004 p->timestamp = now;
1005 out: 1005 out:
1006 __activate_task(p, rq); 1006 __activate_task(p, rq);
1007 } 1007 }
1008 1008
1009 /* 1009 /*
1010 * deactivate_task - remove a task from the runqueue. 1010 * deactivate_task - remove a task from the runqueue.
1011 */ 1011 */
1012 static void deactivate_task(struct task_struct *p, struct rq *rq) 1012 static void deactivate_task(struct task_struct *p, struct rq *rq)
1013 { 1013 {
1014 dec_nr_running(p, rq); 1014 dec_nr_running(p, rq);
1015 dequeue_task(p, p->array); 1015 dequeue_task(p, p->array);
1016 p->array = NULL; 1016 p->array = NULL;
1017 } 1017 }
1018 1018
1019 /* 1019 /*
1020 * resched_task - mark a task 'to be rescheduled now'. 1020 * resched_task - mark a task 'to be rescheduled now'.
1021 * 1021 *
1022 * On UP this means the setting of the need_resched flag, on SMP it 1022 * On UP this means the setting of the need_resched flag, on SMP it
1023 * might also involve a cross-CPU call to trigger the scheduler on 1023 * might also involve a cross-CPU call to trigger the scheduler on
1024 * the target CPU. 1024 * the target CPU.
1025 */ 1025 */
1026 #ifdef CONFIG_SMP 1026 #ifdef CONFIG_SMP
1027 1027
1028 #ifndef tsk_is_polling 1028 #ifndef tsk_is_polling
1029 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 1029 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1030 #endif 1030 #endif
1031 1031
1032 static void resched_task(struct task_struct *p) 1032 static void resched_task(struct task_struct *p)
1033 { 1033 {
1034 int cpu; 1034 int cpu;
1035 1035
1036 assert_spin_locked(&task_rq(p)->lock); 1036 assert_spin_locked(&task_rq(p)->lock);
1037 1037
1038 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) 1038 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
1039 return; 1039 return;
1040 1040
1041 set_tsk_thread_flag(p, TIF_NEED_RESCHED); 1041 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
1042 1042
1043 cpu = task_cpu(p); 1043 cpu = task_cpu(p);
1044 if (cpu == smp_processor_id()) 1044 if (cpu == smp_processor_id())
1045 return; 1045 return;
1046 1046
1047 /* NEED_RESCHED must be visible before we test polling */ 1047 /* NEED_RESCHED must be visible before we test polling */
1048 smp_mb(); 1048 smp_mb();
1049 if (!tsk_is_polling(p)) 1049 if (!tsk_is_polling(p))
1050 smp_send_reschedule(cpu); 1050 smp_send_reschedule(cpu);
1051 } 1051 }
1052 #else 1052 #else
1053 static inline void resched_task(struct task_struct *p) 1053 static inline void resched_task(struct task_struct *p)
1054 { 1054 {
1055 assert_spin_locked(&task_rq(p)->lock); 1055 assert_spin_locked(&task_rq(p)->lock);
1056 set_tsk_need_resched(p); 1056 set_tsk_need_resched(p);
1057 } 1057 }
1058 #endif 1058 #endif
1059 1059
1060 /** 1060 /**
1061 * task_curr - is this task currently executing on a CPU? 1061 * task_curr - is this task currently executing on a CPU?
1062 * @p: the task in question. 1062 * @p: the task in question.
1063 */ 1063 */
1064 inline int task_curr(const struct task_struct *p) 1064 inline int task_curr(const struct task_struct *p)
1065 { 1065 {
1066 return cpu_curr(task_cpu(p)) == p; 1066 return cpu_curr(task_cpu(p)) == p;
1067 } 1067 }
1068 1068
1069 /* Used instead of source_load when we know the type == 0 */ 1069 /* Used instead of source_load when we know the type == 0 */
1070 unsigned long weighted_cpuload(const int cpu) 1070 unsigned long weighted_cpuload(const int cpu)
1071 { 1071 {
1072 return cpu_rq(cpu)->raw_weighted_load; 1072 return cpu_rq(cpu)->raw_weighted_load;
1073 } 1073 }
1074 1074
1075 #ifdef CONFIG_SMP 1075 #ifdef CONFIG_SMP
1076 struct migration_req { 1076 struct migration_req {
1077 struct list_head list; 1077 struct list_head list;
1078 1078
1079 struct task_struct *task; 1079 struct task_struct *task;
1080 int dest_cpu; 1080 int dest_cpu;
1081 1081
1082 struct completion done; 1082 struct completion done;
1083 }; 1083 };
1084 1084
1085 /* 1085 /*
1086 * The task's runqueue lock must be held. 1086 * The task's runqueue lock must be held.
1087 * Returns true if you have to wait for migration thread. 1087 * Returns true if you have to wait for migration thread.
1088 */ 1088 */
1089 static int 1089 static int
1090 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) 1090 migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1091 { 1091 {
1092 struct rq *rq = task_rq(p); 1092 struct rq *rq = task_rq(p);
1093 1093
1094 /* 1094 /*
1095 * If the task is not on a runqueue (and not running), then 1095 * If the task is not on a runqueue (and not running), then
1096 * it is sufficient to simply update the task's cpu field. 1096 * it is sufficient to simply update the task's cpu field.
1097 */ 1097 */
1098 if (!p->array && !task_running(rq, p)) { 1098 if (!p->array && !task_running(rq, p)) {
1099 set_task_cpu(p, dest_cpu); 1099 set_task_cpu(p, dest_cpu);
1100 return 0; 1100 return 0;
1101 } 1101 }
1102 1102
1103 init_completion(&req->done); 1103 init_completion(&req->done);
1104 req->task = p; 1104 req->task = p;
1105 req->dest_cpu = dest_cpu; 1105 req->dest_cpu = dest_cpu;
1106 list_add(&req->list, &rq->migration_queue); 1106 list_add(&req->list, &rq->migration_queue);
1107 1107
1108 return 1; 1108 return 1;
1109 } 1109 }
1110 1110
1111 /* 1111 /*
1112 * wait_task_inactive - wait for a thread to unschedule. 1112 * wait_task_inactive - wait for a thread to unschedule.
1113 * 1113 *
1114 * The caller must ensure that the task *will* unschedule sometime soon, 1114 * The caller must ensure that the task *will* unschedule sometime soon,
1115 * else this function might spin for a *long* time. This function can't 1115 * else this function might spin for a *long* time. This function can't
1116 * be called with interrupts off, or it may introduce deadlock with 1116 * be called with interrupts off, or it may introduce deadlock with
1117 * smp_call_function() if an IPI is sent by the same process we are 1117 * smp_call_function() if an IPI is sent by the same process we are
1118 * waiting to become inactive. 1118 * waiting to become inactive.
1119 */ 1119 */
1120 void wait_task_inactive(struct task_struct *p) 1120 void wait_task_inactive(struct task_struct *p)
1121 { 1121 {
1122 unsigned long flags; 1122 unsigned long flags;
1123 struct rq *rq; 1123 struct rq *rq;
1124 int preempted; 1124 int preempted;
1125 1125
1126 repeat: 1126 repeat:
1127 rq = task_rq_lock(p, &flags); 1127 rq = task_rq_lock(p, &flags);
1128 /* Must be off runqueue entirely, not preempted. */ 1128 /* Must be off runqueue entirely, not preempted. */
1129 if (unlikely(p->array || task_running(rq, p))) { 1129 if (unlikely(p->array || task_running(rq, p))) {
1130 /* If it's preempted, we yield. It could be a while. */ 1130 /* If it's preempted, we yield. It could be a while. */
1131 preempted = !task_running(rq, p); 1131 preempted = !task_running(rq, p);
1132 task_rq_unlock(rq, &flags); 1132 task_rq_unlock(rq, &flags);
1133 cpu_relax(); 1133 cpu_relax();
1134 if (preempted) 1134 if (preempted)
1135 yield(); 1135 yield();
1136 goto repeat; 1136 goto repeat;
1137 } 1137 }
1138 task_rq_unlock(rq, &flags); 1138 task_rq_unlock(rq, &flags);
1139 } 1139 }
1140 1140
1141 /*** 1141 /***
1142 * kick_process - kick a running thread to enter/exit the kernel 1142 * kick_process - kick a running thread to enter/exit the kernel
1143 * @p: the to-be-kicked thread 1143 * @p: the to-be-kicked thread
1144 * 1144 *
1145 * Cause a process which is running on another CPU to enter 1145 * Cause a process which is running on another CPU to enter
1146 * kernel-mode, without any delay. (to get signals handled.) 1146 * kernel-mode, without any delay. (to get signals handled.)
1147 * 1147 *
1148 * NOTE: this function doesn't have to take the runqueue lock, 1148 * NOTE: this function doesn't have to take the runqueue lock,
1149 * because all it wants to ensure is that the remote task enters 1149 * because all it wants to ensure is that the remote task enters
1150 * the kernel. If the IPI races and the task has been migrated 1150 * the kernel. If the IPI races and the task has been migrated
1151 * to another CPU then no harm is done and the purpose has been 1151 * to another CPU then no harm is done and the purpose has been
1152 * achieved as well. 1152 * achieved as well.
1153 */ 1153 */
1154 void kick_process(struct task_struct *p) 1154 void kick_process(struct task_struct *p)
1155 { 1155 {
1156 int cpu; 1156 int cpu;
1157 1157
1158 preempt_disable(); 1158 preempt_disable();
1159 cpu = task_cpu(p); 1159 cpu = task_cpu(p);
1160 if ((cpu != smp_processor_id()) && task_curr(p)) 1160 if ((cpu != smp_processor_id()) && task_curr(p))
1161 smp_send_reschedule(cpu); 1161 smp_send_reschedule(cpu);
1162 preempt_enable(); 1162 preempt_enable();
1163 } 1163 }
1164 1164
1165 /* 1165 /*
1166 * Return a low guess at the load of a migration-source cpu weighted 1166 * Return a low guess at the load of a migration-source cpu weighted
1167 * according to the scheduling class and "nice" value. 1167 * according to the scheduling class and "nice" value.
1168 * 1168 *
1169 * We want to under-estimate the load of migration sources, to 1169 * We want to under-estimate the load of migration sources, to
1170 * balance conservatively. 1170 * balance conservatively.
1171 */ 1171 */
1172 static inline unsigned long source_load(int cpu, int type) 1172 static inline unsigned long source_load(int cpu, int type)
1173 { 1173 {
1174 struct rq *rq = cpu_rq(cpu); 1174 struct rq *rq = cpu_rq(cpu);
1175 1175
1176 if (type == 0) 1176 if (type == 0)
1177 return rq->raw_weighted_load; 1177 return rq->raw_weighted_load;
1178 1178
1179 return min(rq->cpu_load[type-1], rq->raw_weighted_load); 1179 return min(rq->cpu_load[type-1], rq->raw_weighted_load);
1180 } 1180 }
1181 1181
1182 /* 1182 /*
1183 * Return a high guess at the load of a migration-target cpu weighted 1183 * Return a high guess at the load of a migration-target cpu weighted
1184 * according to the scheduling class and "nice" value. 1184 * according to the scheduling class and "nice" value.
1185 */ 1185 */
1186 static inline unsigned long target_load(int cpu, int type) 1186 static inline unsigned long target_load(int cpu, int type)
1187 { 1187 {
1188 struct rq *rq = cpu_rq(cpu); 1188 struct rq *rq = cpu_rq(cpu);
1189 1189
1190 if (type == 0) 1190 if (type == 0)
1191 return rq->raw_weighted_load; 1191 return rq->raw_weighted_load;
1192 1192
1193 return max(rq->cpu_load[type-1], rq->raw_weighted_load); 1193 return max(rq->cpu_load[type-1], rq->raw_weighted_load);
1194 } 1194 }
1195 1195
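/*
 * Worked example (hypothetical numbers): suppose the history figure
 * cpu_load[idx-1] is 2048 while raw_weighted_load has just dropped to 1024.
 * Then source_load() returns min(2048, 1024) = 1024 and target_load()
 * returns max(2048, 1024) = 2048.  The source looks lighter and the target
 * looks heavier than the history alone suggests, so the balancer errs on
 * the side of not moving tasks.
 */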
1196 /* 1196 /*
1197 * Return the average load per task on the cpu's run queue 1197 * Return the average load per task on the cpu's run queue
1198 */ 1198 */
1199 static inline unsigned long cpu_avg_load_per_task(int cpu) 1199 static inline unsigned long cpu_avg_load_per_task(int cpu)
1200 { 1200 {
1201 struct rq *rq = cpu_rq(cpu); 1201 struct rq *rq = cpu_rq(cpu);
1202 unsigned long n = rq->nr_running; 1202 unsigned long n = rq->nr_running;
1203 1203
1204 return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; 1204 return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
1205 } 1205 }
1206 1206
1207 /* 1207 /*
1208 * find_idlest_group finds and returns the least busy CPU group within the 1208 * find_idlest_group finds and returns the least busy CPU group within the
1209 * domain. 1209 * domain.
1210 */ 1210 */
1211 static struct sched_group * 1211 static struct sched_group *
1212 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) 1212 find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1213 { 1213 {
1214 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; 1214 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1215 unsigned long min_load = ULONG_MAX, this_load = 0; 1215 unsigned long min_load = ULONG_MAX, this_load = 0;
1216 int load_idx = sd->forkexec_idx; 1216 int load_idx = sd->forkexec_idx;
1217 int imbalance = 100 + (sd->imbalance_pct-100)/2; 1217 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1218 1218
1219 do { 1219 do {
1220 unsigned long load, avg_load; 1220 unsigned long load, avg_load;
1221 int local_group; 1221 int local_group;
1222 int i; 1222 int i;
1223 1223
1224 /* Skip over this group if it has no CPUs allowed */ 1224 /* Skip over this group if it has no CPUs allowed */
1225 if (!cpus_intersects(group->cpumask, p->cpus_allowed)) 1225 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1226 goto nextgroup; 1226 goto nextgroup;
1227 1227
1228 local_group = cpu_isset(this_cpu, group->cpumask); 1228 local_group = cpu_isset(this_cpu, group->cpumask);
1229 1229
1230 /* Tally up the load of all CPUs in the group */ 1230 /* Tally up the load of all CPUs in the group */
1231 avg_load = 0; 1231 avg_load = 0;
1232 1232
1233 for_each_cpu_mask(i, group->cpumask) { 1233 for_each_cpu_mask(i, group->cpumask) {
1234 /* Bias balancing toward cpus of our domain */ 1234 /* Bias balancing toward cpus of our domain */
1235 if (local_group) 1235 if (local_group)
1236 load = source_load(i, load_idx); 1236 load = source_load(i, load_idx);
1237 else 1237 else
1238 load = target_load(i, load_idx); 1238 load = target_load(i, load_idx);
1239 1239
1240 avg_load += load; 1240 avg_load += load;
1241 } 1241 }
1242 1242
1243 /* Adjust by relative CPU power of the group */ 1243 /* Adjust by relative CPU power of the group */
1244 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 1244 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1245 1245
1246 if (local_group) { 1246 if (local_group) {
1247 this_load = avg_load; 1247 this_load = avg_load;
1248 this = group; 1248 this = group;
1249 } else if (avg_load < min_load) { 1249 } else if (avg_load < min_load) {
1250 min_load = avg_load; 1250 min_load = avg_load;
1251 idlest = group; 1251 idlest = group;
1252 } 1252 }
1253 nextgroup: 1253 nextgroup:
1254 group = group->next; 1254 group = group->next;
1255 } while (group != sd->groups); 1255 } while (group != sd->groups);
1256 1256
1257 if (!idlest || 100*this_load < imbalance*min_load) 1257 if (!idlest || 100*this_load < imbalance*min_load)
1258 return NULL; 1258 return NULL;
1259 return idlest; 1259 return idlest;
1260 } 1260 }
1261 1261
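/*
 * Worked example of the imbalance check above (hypothetical numbers): with
 * sd->imbalance_pct = 125, imbalance = 100 + (125 - 100)/2 = 112.  The
 * remote group is only returned when 100*this_load >= 112*min_load, i.e.
 * when the local group carries at least ~12% more cpu_power-scaled load
 * than the idlest group; otherwise NULL is returned and the task stays in
 * the local group.
 */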
1262 /* 1262 /*
1263 * find_idlest_cpu - find the idlest cpu among the cpus in group. 1263 * find_idlest_cpu - find the idlest cpu among the cpus in group.
1264 */ 1264 */
1265 static int 1265 static int
1266 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 1266 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1267 { 1267 {
1268 cpumask_t tmp; 1268 cpumask_t tmp;
1269 unsigned long load, min_load = ULONG_MAX; 1269 unsigned long load, min_load = ULONG_MAX;
1270 int idlest = -1; 1270 int idlest = -1;
1271 int i; 1271 int i;
1272 1272
1273 /* Traverse only the allowed CPUs */ 1273 /* Traverse only the allowed CPUs */
1274 cpus_and(tmp, group->cpumask, p->cpus_allowed); 1274 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1275 1275
1276 for_each_cpu_mask(i, tmp) { 1276 for_each_cpu_mask(i, tmp) {
1277 load = weighted_cpuload(i); 1277 load = weighted_cpuload(i);
1278 1278
1279 if (load < min_load || (load == min_load && i == this_cpu)) { 1279 if (load < min_load || (load == min_load && i == this_cpu)) {
1280 min_load = load; 1280 min_load = load;
1281 idlest = i; 1281 idlest = i;
1282 } 1282 }
1283 } 1283 }
1284 1284
1285 return idlest; 1285 return idlest;
1286 } 1286 }
1287 1287
1288 /* 1288 /*
1289 * sched_balance_self: balance the current task (running on cpu) in domains 1289 * sched_balance_self: balance the current task (running on cpu) in domains
1290 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and 1290 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1291 * SD_BALANCE_EXEC. 1291 * SD_BALANCE_EXEC.
1292 * 1292 *
1293 * Balance, i.e. select the least loaded group. 1293 * Balance, i.e. select the least loaded group.
1294 * 1294 *
1295 * Returns the target CPU number, or the same CPU if no balancing is needed. 1295 * Returns the target CPU number, or the same CPU if no balancing is needed.
1296 * 1296 *
1297 * preempt must be disabled. 1297 * preempt must be disabled.
1298 */ 1298 */
1299 static int sched_balance_self(int cpu, int flag) 1299 static int sched_balance_self(int cpu, int flag)
1300 { 1300 {
1301 struct task_struct *t = current; 1301 struct task_struct *t = current;
1302 struct sched_domain *tmp, *sd = NULL; 1302 struct sched_domain *tmp, *sd = NULL;
1303 1303
1304 for_each_domain(cpu, tmp) { 1304 for_each_domain(cpu, tmp) {
1305 /* 1305 /*
1306 * If power savings logic is enabled for a domain, stop there. 1306 * If power savings logic is enabled for a domain, stop there.
1307 */ 1307 */
1308 if (tmp->flags & SD_POWERSAVINGS_BALANCE) 1308 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1309 break; 1309 break;
1310 if (tmp->flags & flag) 1310 if (tmp->flags & flag)
1311 sd = tmp; 1311 sd = tmp;
1312 } 1312 }
1313 1313
1314 while (sd) { 1314 while (sd) {
1315 cpumask_t span; 1315 cpumask_t span;
1316 struct sched_group *group; 1316 struct sched_group *group;
1317 int new_cpu, weight; 1317 int new_cpu, weight;
1318 1318
1319 if (!(sd->flags & flag)) { 1319 if (!(sd->flags & flag)) {
1320 sd = sd->child; 1320 sd = sd->child;
1321 continue; 1321 continue;
1322 } 1322 }
1323 1323
1324 span = sd->span; 1324 span = sd->span;
1325 group = find_idlest_group(sd, t, cpu); 1325 group = find_idlest_group(sd, t, cpu);
1326 if (!group) { 1326 if (!group) {
1327 sd = sd->child; 1327 sd = sd->child;
1328 continue; 1328 continue;
1329 } 1329 }
1330 1330
1331 new_cpu = find_idlest_cpu(group, t, cpu); 1331 new_cpu = find_idlest_cpu(group, t, cpu);
1332 if (new_cpu == -1 || new_cpu == cpu) { 1332 if (new_cpu == -1 || new_cpu == cpu) {
1333 /* Now try balancing at a lower domain level of cpu */ 1333 /* Now try balancing at a lower domain level of cpu */
1334 sd = sd->child; 1334 sd = sd->child;
1335 continue; 1335 continue;
1336 } 1336 }
1337 1337
1338 /* Now try balancing at a lower domain level of new_cpu */ 1338 /* Now try balancing at a lower domain level of new_cpu */
1339 cpu = new_cpu; 1339 cpu = new_cpu;
1340 sd = NULL; 1340 sd = NULL;
1341 weight = cpus_weight(span); 1341 weight = cpus_weight(span);
1342 for_each_domain(cpu, tmp) { 1342 for_each_domain(cpu, tmp) {
1343 if (weight <= cpus_weight(tmp->span)) 1343 if (weight <= cpus_weight(tmp->span))
1344 break; 1344 break;
1345 if (tmp->flags & flag) 1345 if (tmp->flags & flag)
1346 sd = tmp; 1346 sd = tmp;
1347 } 1347 }
1348 /* while loop will break here if sd == NULL */ 1348 /* while loop will break here if sd == NULL */
1349 } 1349 }
1350 1350
1351 return cpu; 1351 return cpu;
1352 } 1352 }
1353 1353
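/*
 * Illustrative walk through sched_balance_self() (hypothetical two-level
 * topology): the first loop remembers the widest domain of the
 * forking/execing cpu that has the requested flag set, stopping early if a
 * power-savings domain is hit.  Each pass of the while loop then picks the
 * least loaded group and cpu inside that domain, jumps to the new cpu, and
 * continues from the largest domain of the new cpu whose span is strictly
 * smaller than the span already balanced, so the search narrows level by
 * level until sd becomes NULL.
 */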
1354 #endif /* CONFIG_SMP */ 1354 #endif /* CONFIG_SMP */
1355 1355
1356 /* 1356 /*
1357 * wake_idle() will wake a task on an idle cpu if task->cpu is 1357 * wake_idle() will wake a task on an idle cpu if task->cpu is
1358 * not idle and an idle cpu is available. The span of cpus to 1358 * not idle and an idle cpu is available. The span of cpus to
1359 * search starts with cpus closest then further out as needed, 1359 * search starts with cpus closest then further out as needed,
1360 * so we always favor a closer, idle cpu. 1360 * so we always favor a closer, idle cpu.
1361 * 1361 *
1362 * Returns the CPU we should wake onto. 1362 * Returns the CPU we should wake onto.
1363 */ 1363 */
1364 #if defined(ARCH_HAS_SCHED_WAKE_IDLE) 1364 #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1365 static int wake_idle(int cpu, struct task_struct *p) 1365 static int wake_idle(int cpu, struct task_struct *p)
1366 { 1366 {
1367 cpumask_t tmp; 1367 cpumask_t tmp;
1368 struct sched_domain *sd; 1368 struct sched_domain *sd;
1369 int i; 1369 int i;
1370 1370
1371 if (idle_cpu(cpu)) 1371 if (idle_cpu(cpu))
1372 return cpu; 1372 return cpu;
1373 1373
1374 for_each_domain(cpu, sd) { 1374 for_each_domain(cpu, sd) {
1375 if (sd->flags & SD_WAKE_IDLE) { 1375 if (sd->flags & SD_WAKE_IDLE) {
1376 cpus_and(tmp, sd->span, p->cpus_allowed); 1376 cpus_and(tmp, sd->span, p->cpus_allowed);
1377 for_each_cpu_mask(i, tmp) { 1377 for_each_cpu_mask(i, tmp) {
1378 if (idle_cpu(i)) 1378 if (idle_cpu(i))
1379 return i; 1379 return i;
1380 } 1380 }
1381 } 1381 }
1382 else 1382 else
1383 break; 1383 break;
1384 } 1384 }
1385 return cpu; 1385 return cpu;
1386 } 1386 }
1387 #else 1387 #else
1388 static inline int wake_idle(int cpu, struct task_struct *p) 1388 static inline int wake_idle(int cpu, struct task_struct *p)
1389 { 1389 {
1390 return cpu; 1390 return cpu;
1391 } 1391 }
1392 #endif 1392 #endif
1393 1393
1394 /*** 1394 /***
1395 * try_to_wake_up - wake up a thread 1395 * try_to_wake_up - wake up a thread
1396 * @p: the to-be-woken-up thread 1396 * @p: the to-be-woken-up thread
1397 * @state: the mask of task states that can be woken 1397 * @state: the mask of task states that can be woken
1398 * @sync: do a synchronous wakeup? 1398 * @sync: do a synchronous wakeup?
1399 * 1399 *
1400 * Put it on the run-queue if it's not already there. The "current" 1400 * Put it on the run-queue if it's not already there. The "current"
1401 * thread is always on the run-queue (except when the actual 1401 * thread is always on the run-queue (except when the actual
1402 * re-schedule is in progress), and as such you're allowed to do 1402 * re-schedule is in progress), and as such you're allowed to do
1403 * the simpler "current->state = TASK_RUNNING" to mark yourself 1403 * the simpler "current->state = TASK_RUNNING" to mark yourself
1404 * runnable without the overhead of this. 1404 * runnable without the overhead of this.
1405 * 1405 *
1406 * returns failure only if the task is already active. 1406 * returns failure only if the task is already active.
1407 */ 1407 */
1408 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) 1408 static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1409 { 1409 {
1410 int cpu, this_cpu, success = 0; 1410 int cpu, this_cpu, success = 0;
1411 unsigned long flags; 1411 unsigned long flags;
1412 long old_state; 1412 long old_state;
1413 struct rq *rq; 1413 struct rq *rq;
1414 #ifdef CONFIG_SMP 1414 #ifdef CONFIG_SMP
1415 struct sched_domain *sd, *this_sd = NULL; 1415 struct sched_domain *sd, *this_sd = NULL;
1416 unsigned long load, this_load; 1416 unsigned long load, this_load;
1417 int new_cpu; 1417 int new_cpu;
1418 #endif 1418 #endif
1419 1419
1420 rq = task_rq_lock(p, &flags); 1420 rq = task_rq_lock(p, &flags);
1421 old_state = p->state; 1421 old_state = p->state;
1422 if (!(old_state & state)) 1422 if (!(old_state & state))
1423 goto out; 1423 goto out;
1424 1424
1425 if (p->array) 1425 if (p->array)
1426 goto out_running; 1426 goto out_running;
1427 1427
1428 cpu = task_cpu(p); 1428 cpu = task_cpu(p);
1429 this_cpu = smp_processor_id(); 1429 this_cpu = smp_processor_id();
1430 1430
1431 #ifdef CONFIG_SMP 1431 #ifdef CONFIG_SMP
1432 if (unlikely(task_running(rq, p))) 1432 if (unlikely(task_running(rq, p)))
1433 goto out_activate; 1433 goto out_activate;
1434 1434
1435 new_cpu = cpu; 1435 new_cpu = cpu;
1436 1436
1437 schedstat_inc(rq, ttwu_cnt); 1437 schedstat_inc(rq, ttwu_cnt);
1438 if (cpu == this_cpu) { 1438 if (cpu == this_cpu) {
1439 schedstat_inc(rq, ttwu_local); 1439 schedstat_inc(rq, ttwu_local);
1440 goto out_set_cpu; 1440 goto out_set_cpu;
1441 } 1441 }
1442 1442
1443 for_each_domain(this_cpu, sd) { 1443 for_each_domain(this_cpu, sd) {
1444 if (cpu_isset(cpu, sd->span)) { 1444 if (cpu_isset(cpu, sd->span)) {
1445 schedstat_inc(sd, ttwu_wake_remote); 1445 schedstat_inc(sd, ttwu_wake_remote);
1446 this_sd = sd; 1446 this_sd = sd;
1447 break; 1447 break;
1448 } 1448 }
1449 } 1449 }
1450 1450
1451 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) 1451 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1452 goto out_set_cpu; 1452 goto out_set_cpu;
1453 1453
1454 /* 1454 /*
1455 * Check for affine wakeup and passive balancing possibilities. 1455 * Check for affine wakeup and passive balancing possibilities.
1456 */ 1456 */
1457 if (this_sd) { 1457 if (this_sd) {
1458 int idx = this_sd->wake_idx; 1458 int idx = this_sd->wake_idx;
1459 unsigned int imbalance; 1459 unsigned int imbalance;
1460 1460
1461 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; 1461 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1462 1462
1463 load = source_load(cpu, idx); 1463 load = source_load(cpu, idx);
1464 this_load = target_load(this_cpu, idx); 1464 this_load = target_load(this_cpu, idx);
1465 1465
1466 new_cpu = this_cpu; /* Wake to this CPU if we can */ 1466 new_cpu = this_cpu; /* Wake to this CPU if we can */
1467 1467
1468 if (this_sd->flags & SD_WAKE_AFFINE) { 1468 if (this_sd->flags & SD_WAKE_AFFINE) {
1469 unsigned long tl = this_load; 1469 unsigned long tl = this_load;
1470 unsigned long tl_per_task; 1470 unsigned long tl_per_task;
1471 1471
1472 tl_per_task = cpu_avg_load_per_task(this_cpu); 1472 tl_per_task = cpu_avg_load_per_task(this_cpu);
1473 1473
1474 /* 1474 /*
1475 * If sync wakeup then subtract the (maximum possible) 1475 * If sync wakeup then subtract the (maximum possible)
1476 * effect of the currently running task from the load 1476 * effect of the currently running task from the load
1477 * of the current CPU: 1477 * of the current CPU:
1478 */ 1478 */
1479 if (sync) 1479 if (sync)
1480 tl -= current->load_weight; 1480 tl -= current->load_weight;
1481 1481
1482 if ((tl <= load && 1482 if ((tl <= load &&
1483 tl + target_load(cpu, idx) <= tl_per_task) || 1483 tl + target_load(cpu, idx) <= tl_per_task) ||
1484 100*(tl + p->load_weight) <= imbalance*load) { 1484 100*(tl + p->load_weight) <= imbalance*load) {
1485 /* 1485 /*
1486 * This domain has SD_WAKE_AFFINE and 1486 * This domain has SD_WAKE_AFFINE and
1487 * p is cache cold in this domain, and 1487 * p is cache cold in this domain, and
1488 * there is no bad imbalance. 1488 * there is no bad imbalance.
1489 */ 1489 */
1490 schedstat_inc(this_sd, ttwu_move_affine); 1490 schedstat_inc(this_sd, ttwu_move_affine);
1491 goto out_set_cpu; 1491 goto out_set_cpu;
1492 } 1492 }
1493 } 1493 }
1494 1494
1495 /* 1495 /*
1496 * Start passive balancing when half the imbalance_pct 1496 * Start passive balancing when half the imbalance_pct
1497 * limit is reached. 1497 * limit is reached.
1498 */ 1498 */
1499 if (this_sd->flags & SD_WAKE_BALANCE) { 1499 if (this_sd->flags & SD_WAKE_BALANCE) {
1500 if (imbalance*this_load <= 100*load) { 1500 if (imbalance*this_load <= 100*load) {
1501 schedstat_inc(this_sd, ttwu_move_balance); 1501 schedstat_inc(this_sd, ttwu_move_balance);
1502 goto out_set_cpu; 1502 goto out_set_cpu;
1503 } 1503 }
1504 } 1504 }
1505 } 1505 }
1506 1506
1507 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ 1507 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1508 out_set_cpu: 1508 out_set_cpu:
1509 new_cpu = wake_idle(new_cpu, p); 1509 new_cpu = wake_idle(new_cpu, p);
1510 if (new_cpu != cpu) { 1510 if (new_cpu != cpu) {
1511 set_task_cpu(p, new_cpu); 1511 set_task_cpu(p, new_cpu);
1512 task_rq_unlock(rq, &flags); 1512 task_rq_unlock(rq, &flags);
1513 /* might preempt at this point */ 1513 /* might preempt at this point */
1514 rq = task_rq_lock(p, &flags); 1514 rq = task_rq_lock(p, &flags);
1515 old_state = p->state; 1515 old_state = p->state;
1516 if (!(old_state & state)) 1516 if (!(old_state & state))
1517 goto out; 1517 goto out;
1518 if (p->array) 1518 if (p->array)
1519 goto out_running; 1519 goto out_running;
1520 1520
1521 this_cpu = smp_processor_id(); 1521 this_cpu = smp_processor_id();
1522 cpu = task_cpu(p); 1522 cpu = task_cpu(p);
1523 } 1523 }
1524 1524
1525 out_activate: 1525 out_activate:
1526 #endif /* CONFIG_SMP */ 1526 #endif /* CONFIG_SMP */
1527 if (old_state == TASK_UNINTERRUPTIBLE) { 1527 if (old_state == TASK_UNINTERRUPTIBLE) {
1528 rq->nr_uninterruptible--; 1528 rq->nr_uninterruptible--;
1529 /* 1529 /*
1530 * Tasks on involuntary sleep don't earn 1530 * Tasks on involuntary sleep don't earn
1531 * sleep_avg beyond just interactive state. 1531 * sleep_avg beyond just interactive state.
1532 */ 1532 */
1533 p->sleep_type = SLEEP_NONINTERACTIVE; 1533 p->sleep_type = SLEEP_NONINTERACTIVE;
1534 } else 1534 } else
1535 1535
1536 /* 1536 /*
1537 * Tasks that have marked their sleep as noninteractive get 1537 * Tasks that have marked their sleep as noninteractive get
1538 * woken up with their sleep average not weighted in an 1538 * woken up with their sleep average not weighted in an
1539 * interactive way. 1539 * interactive way.
1540 */ 1540 */
1541 if (old_state & TASK_NONINTERACTIVE) 1541 if (old_state & TASK_NONINTERACTIVE)
1542 p->sleep_type = SLEEP_NONINTERACTIVE; 1542 p->sleep_type = SLEEP_NONINTERACTIVE;
1543 1543
1544 1544
1545 activate_task(p, rq, cpu == this_cpu); 1545 activate_task(p, rq, cpu == this_cpu);
1546 /* 1546 /*
1547 * Sync wakeups (i.e. those types of wakeups where the waker 1547 * Sync wakeups (i.e. those types of wakeups where the waker
1548 * has indicated that it will leave the CPU in short order) 1548 * has indicated that it will leave the CPU in short order)
1549 * don't trigger a preemption, if the woken up task will run on 1549 * don't trigger a preemption, if the woken up task will run on
1550 * this cpu. (in this case the 'I will reschedule' promise of 1550 * this cpu. (in this case the 'I will reschedule' promise of
1551 * the waker guarantees that the freshly woken up task is going 1551 * the waker guarantees that the freshly woken up task is going
1552 * to be considered on this CPU.) 1552 * to be considered on this CPU.)
1553 */ 1553 */
1554 if (!sync || cpu != this_cpu) { 1554 if (!sync || cpu != this_cpu) {
1555 if (TASK_PREEMPTS_CURR(p, rq)) 1555 if (TASK_PREEMPTS_CURR(p, rq))
1556 resched_task(rq->curr); 1556 resched_task(rq->curr);
1557 } 1557 }
1558 success = 1; 1558 success = 1;
1559 1559
1560 out_running: 1560 out_running:
1561 p->state = TASK_RUNNING; 1561 p->state = TASK_RUNNING;
1562 out: 1562 out:
1563 task_rq_unlock(rq, &flags); 1563 task_rq_unlock(rq, &flags);
1564 1564
1565 return success; 1565 return success;
1566 } 1566 }
1567 1567
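/*
 * Worked example of the wakeup heuristics in try_to_wake_up() (all numbers
 * hypothetical): with imbalance_pct = 125, imbalance = 112.  Say
 * source_load(cpu, idx) = 300, this_load = tl = 100 (no sync adjustment)
 * and p->load_weight = 100.  The affine wakeup fires when either
 * tl <= load && tl + target_load(cpu, idx) <= tl_per_task, or
 * 100*(tl + p->load_weight) <= imbalance*load; here the second condition
 * reads 100*200 = 20000 <= 112*300 = 33600, so the task is woken on the
 * waking CPU (subject to wake_idle()).  Passive balancing via
 * SD_WAKE_BALANCE instead fires when imbalance*this_load <= 100*load, i.e.
 * when the task's old CPU is roughly 12% busier than this one.
 */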
1568 int fastcall wake_up_process(struct task_struct *p) 1568 int fastcall wake_up_process(struct task_struct *p)
1569 { 1569 {
1570 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | 1570 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1571 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); 1571 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1572 } 1572 }
1573 EXPORT_SYMBOL(wake_up_process); 1573 EXPORT_SYMBOL(wake_up_process);
1574 1574
1575 int fastcall wake_up_state(struct task_struct *p, unsigned int state) 1575 int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1576 { 1576 {
1577 return try_to_wake_up(p, state, 0); 1577 return try_to_wake_up(p, state, 0);
1578 } 1578 }
1579 1579
1580 static void task_running_tick(struct rq *rq, struct task_struct *p); 1580 static void task_running_tick(struct rq *rq, struct task_struct *p);
1581 /* 1581 /*
1582 * Perform scheduler related setup for a newly forked process p. 1582 * Perform scheduler related setup for a newly forked process p.
1583 * p is forked by current. 1583 * p is forked by current.
1584 */ 1584 */
1585 void fastcall sched_fork(struct task_struct *p, int clone_flags) 1585 void fastcall sched_fork(struct task_struct *p, int clone_flags)
1586 { 1586 {
1587 int cpu = get_cpu(); 1587 int cpu = get_cpu();
1588 1588
1589 #ifdef CONFIG_SMP 1589 #ifdef CONFIG_SMP
1590 cpu = sched_balance_self(cpu, SD_BALANCE_FORK); 1590 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1591 #endif 1591 #endif
1592 set_task_cpu(p, cpu); 1592 set_task_cpu(p, cpu);
1593 1593
1594 /* 1594 /*
1595 * We mark the process as running here, but have not actually 1595 * We mark the process as running here, but have not actually
1596 * inserted it onto the runqueue yet. This guarantees that 1596 * inserted it onto the runqueue yet. This guarantees that
1597 * nobody will actually run it, and a signal or other external 1597 * nobody will actually run it, and a signal or other external
1598 * event cannot wake it up and insert it on the runqueue either. 1598 * event cannot wake it up and insert it on the runqueue either.
1599 */ 1599 */
1600 p->state = TASK_RUNNING; 1600 p->state = TASK_RUNNING;
1601 1601
1602 /* 1602 /*
1603 * Make sure we do not leak PI boosting priority to the child: 1603 * Make sure we do not leak PI boosting priority to the child:
1604 */ 1604 */
1605 p->prio = current->normal_prio; 1605 p->prio = current->normal_prio;
1606 1606
1607 INIT_LIST_HEAD(&p->run_list); 1607 INIT_LIST_HEAD(&p->run_list);
1608 p->array = NULL; 1608 p->array = NULL;
1609 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1609 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1610 if (unlikely(sched_info_on())) 1610 if (unlikely(sched_info_on()))
1611 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1611 memset(&p->sched_info, 0, sizeof(p->sched_info));
1612 #endif 1612 #endif
1613 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 1613 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1614 p->oncpu = 0; 1614 p->oncpu = 0;
1615 #endif 1615 #endif
1616 #ifdef CONFIG_PREEMPT 1616 #ifdef CONFIG_PREEMPT
1617 /* Want to start with kernel preemption disabled. */ 1617 /* Want to start with kernel preemption disabled. */
1618 task_thread_info(p)->preempt_count = 1; 1618 task_thread_info(p)->preempt_count = 1;
1619 #endif 1619 #endif
1620 /* 1620 /*
1621 * Share the timeslice between parent and child, thus the 1621 * Share the timeslice between parent and child, thus the
1622 * total amount of pending timeslices in the system doesn't change, 1622 * total amount of pending timeslices in the system doesn't change,
1623 * resulting in more scheduling fairness. 1623 * resulting in more scheduling fairness.
1624 */ 1624 */
1625 local_irq_disable(); 1625 local_irq_disable();
1626 p->time_slice = (current->time_slice + 1) >> 1; 1626 p->time_slice = (current->time_slice + 1) >> 1;
1627 /* 1627 /*
1628 * The remainder of the first timeslice might be recovered by 1628 * The remainder of the first timeslice might be recovered by
1629 * the parent if the child exits early enough. 1629 * the parent if the child exits early enough.
1630 */ 1630 */
1631 p->first_time_slice = 1; 1631 p->first_time_slice = 1;
1632 current->time_slice >>= 1; 1632 current->time_slice >>= 1;
1633 p->timestamp = sched_clock(); 1633 p->timestamp = sched_clock();
1634 if (unlikely(!current->time_slice)) { 1634 if (unlikely(!current->time_slice)) {
1635 /* 1635 /*
1636 * This case is rare, it happens when the parent has only 1636 * This case is rare, it happens when the parent has only
1637 * a single jiffy left from its timeslice. Taking the 1637 * a single jiffy left from its timeslice. Taking the
1638 * runqueue lock is not a problem. 1638 * runqueue lock is not a problem.
1639 */ 1639 */
1640 current->time_slice = 1; 1640 current->time_slice = 1;
1641 task_running_tick(cpu_rq(cpu), current); 1641 task_running_tick(cpu_rq(cpu), current);
1642 } 1642 }
1643 local_irq_enable(); 1643 local_irq_enable();
1644 put_cpu(); 1644 put_cpu();
1645 } 1645 }
1646 1646
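/*
 * Worked example of the timeslice split in sched_fork() (values purely
 * illustrative): if the parent has 7 ticks left, the child gets
 * (7 + 1) >> 1 = 4 and the parent keeps 7 >> 1 = 3, so the total of 7 is
 * conserved.  With only a single tick left the parent drops to 0, which is
 * the rare case handled above by handing it one tick back and immediately
 * running task_running_tick() on it.
 */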
1647 /* 1647 /*
1648 * wake_up_new_task - wake up a newly created task for the first time. 1648 * wake_up_new_task - wake up a newly created task for the first time.
1649 * 1649 *
1650 * This function will do some initial scheduler statistics housekeeping 1650 * This function will do some initial scheduler statistics housekeeping
1651 * that must be done for every newly created context, then puts the task 1651 * that must be done for every newly created context, then puts the task
1652 * on the runqueue and wakes it. 1652 * on the runqueue and wakes it.
1653 */ 1653 */
1654 void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 1654 void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1655 { 1655 {
1656 struct rq *rq, *this_rq; 1656 struct rq *rq, *this_rq;
1657 unsigned long flags; 1657 unsigned long flags;
1658 int this_cpu, cpu; 1658 int this_cpu, cpu;
1659 1659
1660 rq = task_rq_lock(p, &flags); 1660 rq = task_rq_lock(p, &flags);
1661 BUG_ON(p->state != TASK_RUNNING); 1661 BUG_ON(p->state != TASK_RUNNING);
1662 this_cpu = smp_processor_id(); 1662 this_cpu = smp_processor_id();
1663 cpu = task_cpu(p); 1663 cpu = task_cpu(p);
1664 1664
1665 /* 1665 /*
1666 * We decrease the sleep average of forking parents 1666 * We decrease the sleep average of forking parents
1667 * and children as well, to keep max-interactive tasks 1667 * and children as well, to keep max-interactive tasks
1668 * from forking tasks that are max-interactive. The parent 1668 * from forking tasks that are max-interactive. The parent
1669 * (current) is done further down, under its lock. 1669 * (current) is done further down, under its lock.
1670 */ 1670 */
1671 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * 1671 p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
1672 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); 1672 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1673 1673
1674 p->prio = effective_prio(p); 1674 p->prio = effective_prio(p);
1675 1675
1676 if (likely(cpu == this_cpu)) { 1676 if (likely(cpu == this_cpu)) {
1677 if (!(clone_flags & CLONE_VM)) { 1677 if (!(clone_flags & CLONE_VM)) {
1678 /* 1678 /*
1679 * The VM isn't cloned, so we're in a good position to 1679 * The VM isn't cloned, so we're in a good position to
1680 * do child-runs-first in anticipation of an exec. This 1680 * do child-runs-first in anticipation of an exec. This
1681 * usually avoids a lot of COW overhead. 1681 * usually avoids a lot of COW overhead.
1682 */ 1682 */
1683 if (unlikely(!current->array)) 1683 if (unlikely(!current->array))
1684 __activate_task(p, rq); 1684 __activate_task(p, rq);
1685 else { 1685 else {
1686 p->prio = current->prio; 1686 p->prio = current->prio;
1687 p->normal_prio = current->normal_prio; 1687 p->normal_prio = current->normal_prio;
1688 list_add_tail(&p->run_list, &current->run_list); 1688 list_add_tail(&p->run_list, &current->run_list);
1689 p->array = current->array; 1689 p->array = current->array;
1690 p->array->nr_active++; 1690 p->array->nr_active++;
1691 inc_nr_running(p, rq); 1691 inc_nr_running(p, rq);
1692 } 1692 }
1693 set_need_resched(); 1693 set_need_resched();
1694 } else 1694 } else
1695 /* Run child last */ 1695 /* Run child last */
1696 __activate_task(p, rq); 1696 __activate_task(p, rq);
1697 /* 1697 /*
1698 * We skip the following code due to cpu == this_cpu 1698 * We skip the following code due to cpu == this_cpu
1699 * 1699 *
1700 * task_rq_unlock(rq, &flags); 1700 * task_rq_unlock(rq, &flags);
1701 * this_rq = task_rq_lock(current, &flags); 1701 * this_rq = task_rq_lock(current, &flags);
1702 */ 1702 */
1703 this_rq = rq; 1703 this_rq = rq;
1704 } else { 1704 } else {
1705 this_rq = cpu_rq(this_cpu); 1705 this_rq = cpu_rq(this_cpu);
1706 1706
1707 /* 1707 /*
1708 * Not the local CPU - must adjust timestamp. This should 1708 * Not the local CPU - must adjust timestamp. This should
1709 * get optimised away in the !CONFIG_SMP case. 1709 * get optimised away in the !CONFIG_SMP case.
1710 */ 1710 */
1711 p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) 1711 p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
1712 + rq->most_recent_timestamp; 1712 + rq->most_recent_timestamp;
1713 __activate_task(p, rq); 1713 __activate_task(p, rq);
1714 if (TASK_PREEMPTS_CURR(p, rq)) 1714 if (TASK_PREEMPTS_CURR(p, rq))
1715 resched_task(rq->curr); 1715 resched_task(rq->curr);
1716 1716
1717 /* 1717 /*
1718 * Parent and child are on different CPUs, now get the 1718 * Parent and child are on different CPUs, now get the
1719 * parent runqueue to update the parent's ->sleep_avg: 1719 * parent runqueue to update the parent's ->sleep_avg:
1720 */ 1720 */
1721 task_rq_unlock(rq, &flags); 1721 task_rq_unlock(rq, &flags);
1722 this_rq = task_rq_lock(current, &flags); 1722 this_rq = task_rq_lock(current, &flags);
1723 } 1723 }
1724 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * 1724 current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
1725 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); 1725 PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
1726 task_rq_unlock(this_rq, &flags); 1726 task_rq_unlock(this_rq, &flags);
1727 } 1727 }
1728 1728
1729 /* 1729 /*
1730 * Potentially available exiting-child timeslices are 1730 * Potentially available exiting-child timeslices are
1731 * retrieved here - this way the parent does not get 1731 * retrieved here - this way the parent does not get
1732 * penalized for creating too many threads. 1732 * penalized for creating too many threads.
1733 * 1733 *
1734 * (this cannot be used to 'generate' timeslices 1734 * (this cannot be used to 'generate' timeslices
1735 * artificially, because any timeslice recovered here 1735 * artificially, because any timeslice recovered here
1736 * was given away by the parent in the first place.) 1736 * was given away by the parent in the first place.)
1737 */ 1737 */
1738 void fastcall sched_exit(struct task_struct *p) 1738 void fastcall sched_exit(struct task_struct *p)
1739 { 1739 {
1740 unsigned long flags; 1740 unsigned long flags;
1741 struct rq *rq; 1741 struct rq *rq;
1742 1742
1743 /* 1743 /*
1744 * If the child was a (relative-) CPU hog then decrease 1744 * If the child was a (relative-) CPU hog then decrease
1745 * the sleep_avg of the parent as well. 1745 * the sleep_avg of the parent as well.
1746 */ 1746 */
1747 rq = task_rq_lock(p->parent, &flags); 1747 rq = task_rq_lock(p->parent, &flags);
1748 if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { 1748 if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
1749 p->parent->time_slice += p->time_slice; 1749 p->parent->time_slice += p->time_slice;
1750 if (unlikely(p->parent->time_slice > task_timeslice(p))) 1750 if (unlikely(p->parent->time_slice > task_timeslice(p)))
1751 p->parent->time_slice = task_timeslice(p); 1751 p->parent->time_slice = task_timeslice(p);
1752 } 1752 }
1753 if (p->sleep_avg < p->parent->sleep_avg) 1753 if (p->sleep_avg < p->parent->sleep_avg)
1754 p->parent->sleep_avg = p->parent->sleep_avg / 1754 p->parent->sleep_avg = p->parent->sleep_avg /
1755 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / 1755 (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
1756 (EXIT_WEIGHT + 1); 1756 (EXIT_WEIGHT + 1);
1757 task_rq_unlock(rq, &flags); 1757 task_rq_unlock(rq, &flags);
1758 } 1758 }
1759 1759
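/*
 * Worked example of the sleep_avg adjustment in sched_exit() (EXIT_WEIGHT
 * is defined earlier in this file; 3 is used below purely for
 * illustration): with a parent sleep_avg of 800 and a CPU-hog child at 400,
 * the parent ends up with 800/4*3 + 400/4 = 600 + 100 = 700, an
 * (EXIT_WEIGHT : 1) weighted average that is only applied when the child's
 * average is lower than the parent's.
 */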
1760 /** 1760 /**
1761 * prepare_task_switch - prepare to switch tasks 1761 * prepare_task_switch - prepare to switch tasks
1762 * @rq: the runqueue preparing to switch 1762 * @rq: the runqueue preparing to switch
1763 * @next: the task we are going to switch to. 1763 * @next: the task we are going to switch to.
1764 * 1764 *
1765 * This is called with the rq lock held and interrupts off. It must 1765 * This is called with the rq lock held and interrupts off. It must
1766 * be paired with a subsequent finish_task_switch after the context 1766 * be paired with a subsequent finish_task_switch after the context
1767 * switch. 1767 * switch.
1768 * 1768 *
1769 * prepare_task_switch sets up locking and calls architecture specific 1769 * prepare_task_switch sets up locking and calls architecture specific
1770 * hooks. 1770 * hooks.
1771 */ 1771 */
1772 static inline void prepare_task_switch(struct rq *rq, struct task_struct *next) 1772 static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
1773 { 1773 {
1774 prepare_lock_switch(rq, next); 1774 prepare_lock_switch(rq, next);
1775 prepare_arch_switch(next); 1775 prepare_arch_switch(next);
1776 } 1776 }
1777 1777
1778 /** 1778 /**
1779 * finish_task_switch - clean up after a task-switch 1779 * finish_task_switch - clean up after a task-switch
1780 * @rq: runqueue associated with task-switch 1780 * @rq: runqueue associated with task-switch
1781 * @prev: the thread we just switched away from. 1781 * @prev: the thread we just switched away from.
1782 * 1782 *
1783 * finish_task_switch must be called after the context switch, paired 1783 * finish_task_switch must be called after the context switch, paired
1784 * with a prepare_task_switch call before the context switch. 1784 * with a prepare_task_switch call before the context switch.
1785 * finish_task_switch will reconcile locking set up by prepare_task_switch, 1785 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1786 * and do any other architecture-specific cleanup actions. 1786 * and do any other architecture-specific cleanup actions.
1787 * 1787 *
1788 * Note that we may have delayed dropping an mm in context_switch(). If 1788 * Note that we may have delayed dropping an mm in context_switch(). If
1789 * so, we finish that here outside of the runqueue lock. (Doing it 1789 * so, we finish that here outside of the runqueue lock. (Doing it
1790 * with the lock held can cause deadlocks; see schedule() for 1790 * with the lock held can cause deadlocks; see schedule() for
1791 * details.) 1791 * details.)
1792 */ 1792 */
1793 static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) 1793 static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1794 __releases(rq->lock) 1794 __releases(rq->lock)
1795 { 1795 {
1796 struct mm_struct *mm = rq->prev_mm; 1796 struct mm_struct *mm = rq->prev_mm;
1797 long prev_state; 1797 long prev_state;
1798 1798
1799 rq->prev_mm = NULL; 1799 rq->prev_mm = NULL;
1800 1800
1801 /* 1801 /*
1802 * A task struct has one reference for the use as "current". 1802 * A task struct has one reference for the use as "current".
1803 * If a task dies, then it sets TASK_DEAD in tsk->state and calls 1803 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1804 * schedule one last time. The schedule call will never return, and 1804 * schedule one last time. The schedule call will never return, and
1805 * the scheduled task must drop that reference. 1805 * the scheduled task must drop that reference.
1806 * The test for TASK_DEAD must occur while the runqueue locks are 1806 * The test for TASK_DEAD must occur while the runqueue locks are
1807 * still held, otherwise prev could be scheduled on another cpu, die 1807 * still held, otherwise prev could be scheduled on another cpu, die
1808 * there before we look at prev->state, and then the reference would 1808 * there before we look at prev->state, and then the reference would
1809 * be dropped twice. 1809 * be dropped twice.
1810 * Manfred Spraul <manfred@colorfullife.com> 1810 * Manfred Spraul <manfred@colorfullife.com>
1811 */ 1811 */
1812 prev_state = prev->state; 1812 prev_state = prev->state;
1813 finish_arch_switch(prev); 1813 finish_arch_switch(prev);
1814 finish_lock_switch(rq, prev); 1814 finish_lock_switch(rq, prev);
1815 if (mm) 1815 if (mm)
1816 mmdrop(mm); 1816 mmdrop(mm);
1817 if (unlikely(prev_state == TASK_DEAD)) { 1817 if (unlikely(prev_state == TASK_DEAD)) {
1818 /* 1818 /*
1819 * Remove function-return probe instances associated with this 1819 * Remove function-return probe instances associated with this
1820 * task and put them back on the free list. 1820 * task and put them back on the free list.
1821 */ 1821 */
1822 kprobe_flush_task(prev); 1822 kprobe_flush_task(prev);
1823 put_task_struct(prev); 1823 put_task_struct(prev);
1824 } 1824 }
1825 } 1825 }
1826 1826
1827 /** 1827 /**
1828 * schedule_tail - first thing a freshly forked thread must call. 1828 * schedule_tail - first thing a freshly forked thread must call.
1829 * @prev: the thread we just switched away from. 1829 * @prev: the thread we just switched away from.
1830 */ 1830 */
1831 asmlinkage void schedule_tail(struct task_struct *prev) 1831 asmlinkage void schedule_tail(struct task_struct *prev)
1832 __releases(rq->lock) 1832 __releases(rq->lock)
1833 { 1833 {
1834 struct rq *rq = this_rq(); 1834 struct rq *rq = this_rq();
1835 1835
1836 finish_task_switch(rq, prev); 1836 finish_task_switch(rq, prev);
1837 #ifdef __ARCH_WANT_UNLOCKED_CTXSW 1837 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
1838 /* In this case, finish_task_switch does not reenable preemption */ 1838 /* In this case, finish_task_switch does not reenable preemption */
1839 preempt_enable(); 1839 preempt_enable();
1840 #endif 1840 #endif
1841 if (current->set_child_tid) 1841 if (current->set_child_tid)
1842 put_user(current->pid, current->set_child_tid); 1842 put_user(current->pid, current->set_child_tid);
1843 } 1843 }
1844 1844
1845 /* 1845 /*
1846 * context_switch - switch to the new MM and the new 1846 * context_switch - switch to the new MM and the new
1847 * thread's register state. 1847 * thread's register state.
1848 */ 1848 */
1849 static inline struct task_struct * 1849 static inline struct task_struct *
1850 context_switch(struct rq *rq, struct task_struct *prev, 1850 context_switch(struct rq *rq, struct task_struct *prev,
1851 struct task_struct *next) 1851 struct task_struct *next)
1852 { 1852 {
1853 struct mm_struct *mm = next->mm; 1853 struct mm_struct *mm = next->mm;
1854 struct mm_struct *oldmm = prev->active_mm; 1854 struct mm_struct *oldmm = prev->active_mm;
1855 1855
1856 /* 1856 /*
1857 * For paravirt, this is coupled with an exit in switch_to to 1857 * For paravirt, this is coupled with an exit in switch_to to
1858 * combine the page table reload and the switch backend into 1858 * combine the page table reload and the switch backend into
1859 * one hypercall. 1859 * one hypercall.
1860 */ 1860 */
1861 arch_enter_lazy_cpu_mode(); 1861 arch_enter_lazy_cpu_mode();
1862 1862
1863 if (!mm) { 1863 if (!mm) {
1864 next->active_mm = oldmm; 1864 next->active_mm = oldmm;
1865 atomic_inc(&oldmm->mm_count); 1865 atomic_inc(&oldmm->mm_count);
1866 enter_lazy_tlb(oldmm, next); 1866 enter_lazy_tlb(oldmm, next);
1867 } else 1867 } else
1868 switch_mm(oldmm, mm, next); 1868 switch_mm(oldmm, mm, next);
1869 1869
1870 if (!prev->mm) { 1870 if (!prev->mm) {
1871 prev->active_mm = NULL; 1871 prev->active_mm = NULL;
1872 WARN_ON(rq->prev_mm); 1872 WARN_ON(rq->prev_mm);
1873 rq->prev_mm = oldmm; 1873 rq->prev_mm = oldmm;
1874 } 1874 }
1875 /* 1875 /*
1876 * Since the runqueue lock will be released by the next 1876 * Since the runqueue lock will be released by the next
1877 * task (which is an invalid locking op but in the case 1877 * task (which is an invalid locking op but in the case
1878 * of the scheduler it's an obvious special-case), we 1878 * of the scheduler it's an obvious special-case), we
1879 * do an early lockdep release here: 1879 * do an early lockdep release here:
1880 */ 1880 */
1881 #ifndef __ARCH_WANT_UNLOCKED_CTXSW 1881 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
1882 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 1882 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1883 #endif 1883 #endif
1884 1884
1885 /* Here we just switch the register state and the stack. */ 1885 /* Here we just switch the register state and the stack. */
1886 switch_to(prev, next, prev); 1886 switch_to(prev, next, prev);
1887 1887
1888 return prev; 1888 return prev;
1889 } 1889 }
1890 1890
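/*
 * mm handling in context_switch(), traced: if the incoming task is a kernel
 * thread (next->mm == NULL) it borrows the previous task's active_mm and
 * takes an extra mm_count reference; if the outgoing task was a kernel
 * thread (prev->mm == NULL) its borrowed active_mm is parked in rq->prev_mm,
 * and that reference is dropped later via mmdrop() in finish_task_switch(),
 * outside the runqueue lock.
 */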
1891 /* 1891 /*
1892 * nr_running, nr_uninterruptible and nr_context_switches: 1892 * nr_running, nr_uninterruptible and nr_context_switches:
1893 * 1893 *
1894 * externally visible scheduler statistics: current number of runnable 1894 * externally visible scheduler statistics: current number of runnable
1895 * threads, current number of uninterruptible-sleeping threads, total 1895 * threads, current number of uninterruptible-sleeping threads, total
1896 * number of context switches performed since bootup. 1896 * number of context switches performed since bootup.
1897 */ 1897 */
1898 unsigned long nr_running(void) 1898 unsigned long nr_running(void)
1899 { 1899 {
1900 unsigned long i, sum = 0; 1900 unsigned long i, sum = 0;
1901 1901
1902 for_each_online_cpu(i) 1902 for_each_online_cpu(i)
1903 sum += cpu_rq(i)->nr_running; 1903 sum += cpu_rq(i)->nr_running;
1904 1904
1905 return sum; 1905 return sum;
1906 } 1906 }
1907 1907
1908 unsigned long nr_uninterruptible(void) 1908 unsigned long nr_uninterruptible(void)
1909 { 1909 {
1910 unsigned long i, sum = 0; 1910 unsigned long i, sum = 0;
1911 1911
1912 for_each_possible_cpu(i) 1912 for_each_possible_cpu(i)
1913 sum += cpu_rq(i)->nr_uninterruptible; 1913 sum += cpu_rq(i)->nr_uninterruptible;
1914 1914
1915 /* 1915 /*
1916 * Since we read the counters lockless, it might be slightly 1916 * Since we read the counters lockless, it might be slightly
1917 * inaccurate. Do not allow it to go below zero though: 1917 * inaccurate. Do not allow it to go below zero though:
1918 */ 1918 */
1919 if (unlikely((long)sum < 0)) 1919 if (unlikely((long)sum < 0))
1920 sum = 0; 1920 sum = 0;
1921 1921
1922 return sum; 1922 return sum;
1923 } 1923 }
1924 1924
1925 unsigned long long nr_context_switches(void) 1925 unsigned long long nr_context_switches(void)
1926 { 1926 {
1927 int i; 1927 int i;
1928 unsigned long long sum = 0; 1928 unsigned long long sum = 0;
1929 1929
1930 for_each_possible_cpu(i) 1930 for_each_possible_cpu(i)
1931 sum += cpu_rq(i)->nr_switches; 1931 sum += cpu_rq(i)->nr_switches;
1932 1932
1933 return sum; 1933 return sum;
1934 } 1934 }
1935 1935
1936 unsigned long nr_iowait(void) 1936 unsigned long nr_iowait(void)
1937 { 1937 {
1938 unsigned long i, sum = 0; 1938 unsigned long i, sum = 0;
1939 1939
1940 for_each_possible_cpu(i) 1940 for_each_possible_cpu(i)
1941 sum += atomic_read(&cpu_rq(i)->nr_iowait); 1941 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1942 1942
1943 return sum; 1943 return sum;
1944 } 1944 }
1945 1945
1946 unsigned long nr_active(void) 1946 unsigned long nr_active(void)
1947 { 1947 {
1948 unsigned long i, running = 0, uninterruptible = 0; 1948 unsigned long i, running = 0, uninterruptible = 0;
1949 1949
1950 for_each_online_cpu(i) { 1950 for_each_online_cpu(i) {
1951 running += cpu_rq(i)->nr_running; 1951 running += cpu_rq(i)->nr_running;
1952 uninterruptible += cpu_rq(i)->nr_uninterruptible; 1952 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1953 } 1953 }
1954 1954
1955 if (unlikely((long)uninterruptible < 0)) 1955 if (unlikely((long)uninterruptible < 0))
1956 uninterruptible = 0; 1956 uninterruptible = 0;
1957 1957
1958 return running + uninterruptible; 1958 return running + uninterruptible;
1959 } 1959 }
1960 1960
1961 #ifdef CONFIG_SMP 1961 #ifdef CONFIG_SMP
1962 1962
1963 /* 1963 /*
1964 * Is this task likely cache-hot: 1964 * Is this task likely cache-hot:
1965 */ 1965 */
1966 static inline int 1966 static inline int
1967 task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd) 1967 task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd)
1968 { 1968 {
1969 return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time; 1969 return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time;
1970 } 1970 }
1971 1971
1972 /* 1972 /*
1973 * double_rq_lock - safely lock two runqueues 1973 * double_rq_lock - safely lock two runqueues
1974 * 1974 *
1975 * Note this does not disable interrupts like task_rq_lock, 1975 * Note this does not disable interrupts like task_rq_lock,
1976 * you need to do so manually before calling. 1976 * you need to do so manually before calling.
1977 */ 1977 */
1978 static void double_rq_lock(struct rq *rq1, struct rq *rq2) 1978 static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1979 __acquires(rq1->lock) 1979 __acquires(rq1->lock)
1980 __acquires(rq2->lock) 1980 __acquires(rq2->lock)
1981 { 1981 {
1982 BUG_ON(!irqs_disabled()); 1982 BUG_ON(!irqs_disabled());
1983 if (rq1 == rq2) { 1983 if (rq1 == rq2) {
1984 spin_lock(&rq1->lock); 1984 spin_lock(&rq1->lock);
1985 __acquire(rq2->lock); /* Fake it out ;) */ 1985 __acquire(rq2->lock); /* Fake it out ;) */
1986 } else { 1986 } else {
1987 if (rq1 < rq2) { 1987 if (rq1 < rq2) {
1988 spin_lock(&rq1->lock); 1988 spin_lock(&rq1->lock);
1989 spin_lock(&rq2->lock); 1989 spin_lock(&rq2->lock);
1990 } else { 1990 } else {
1991 spin_lock(&rq2->lock); 1991 spin_lock(&rq2->lock);
1992 spin_lock(&rq1->lock); 1992 spin_lock(&rq1->lock);
1993 } 1993 }
1994 } 1994 }
1995 } 1995 }
1996 1996
1997 /* 1997 /*
1998 * double_rq_unlock - safely unlock two runqueues 1998 * double_rq_unlock - safely unlock two runqueues
1999 * 1999 *
2000 * Note this does not restore interrupts like task_rq_unlock, 2000 * Note this does not restore interrupts like task_rq_unlock,
2001 * you need to do so manually after calling. 2001 * you need to do so manually after calling.
2002 */ 2002 */
2003 static void double_rq_unlock(struct rq *rq1, struct rq *rq2) 2003 static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2004 __releases(rq1->lock) 2004 __releases(rq1->lock)
2005 __releases(rq2->lock) 2005 __releases(rq2->lock)
2006 { 2006 {
2007 spin_unlock(&rq1->lock); 2007 spin_unlock(&rq1->lock);
2008 if (rq1 != rq2) 2008 if (rq1 != rq2)
2009 spin_unlock(&rq2->lock); 2009 spin_unlock(&rq2->lock);
2010 else 2010 else
2011 __release(rq2->lock); 2011 __release(rq2->lock);
2012 } 2012 }
2013 2013
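/*
 * Illustrative caller pattern for double_rq_lock()/double_rq_unlock()
 * (a sketch, not taken from this file): interrupt disabling is the
 * caller's job, and the rq1 < rq2 address comparison gives every path the
 * same lock order, which is what prevents an ABBA deadlock between two
 * CPUs balancing against each other.
 *
 *	local_irq_save(flags);
 *	double_rq_lock(this_rq, busiest);
 *	... move tasks between the two runqueues ...
 *	double_rq_unlock(this_rq, busiest);
 *	local_irq_restore(flags);
 */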
2014 /* 2014 /*
2015 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 2015 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2016 */ 2016 */
2017 static void double_lock_balance(struct rq *this_rq, struct rq *busiest) 2017 static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
2018 __releases(this_rq->lock) 2018 __releases(this_rq->lock)
2019 __acquires(busiest->lock) 2019 __acquires(busiest->lock)
2020 __acquires(this_rq->lock) 2020 __acquires(this_rq->lock)
2021 { 2021 {
2022 if (unlikely(!irqs_disabled())) { 2022 if (unlikely(!irqs_disabled())) {
2023 /* printk() doesn't work well under rq->lock */ 2023 /* printk() doesn't work well under rq->lock */
2024 spin_unlock(&this_rq->lock); 2024 spin_unlock(&this_rq->lock);
2025 BUG_ON(1); 2025 BUG_ON(1);
2026 } 2026 }
2027 if (unlikely(!spin_trylock(&busiest->lock))) { 2027 if (unlikely(!spin_trylock(&busiest->lock))) {
2028 if (busiest < this_rq) { 2028 if (busiest < this_rq) {
2029 spin_unlock(&this_rq->lock); 2029 spin_unlock(&this_rq->lock);
2030 spin_lock(&busiest->lock); 2030 spin_lock(&busiest->lock);
2031 spin_lock(&this_rq->lock); 2031 spin_lock(&this_rq->lock);
2032 } else 2032 } else
2033 spin_lock(&busiest->lock); 2033 spin_lock(&busiest->lock);
2034 } 2034 }
2035 } 2035 }
2036 2036
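/*
 * Trace of the trylock-failure path in double_lock_balance(): if busiest
 * has the lower address, this_rq->lock is dropped and both locks are
 * re-taken in address order, matching the ordering used by
 * double_rq_lock().  Because this_rq->lock can be released and re-acquired
 * here, anything the caller read from this_rq before the call may be stale
 * afterwards.
 */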
2037 /* 2037 /*
2038 * If dest_cpu is allowed for this process, migrate the task to it. 2038 * If dest_cpu is allowed for this process, migrate the task to it.
2039 * This is accomplished by forcing the cpu_allowed mask to only 2039 * This is accomplished by forcing the cpu_allowed mask to only
2040 * allow dest_cpu, which will force the cpu onto dest_cpu. Then 2040 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
2041 * the cpu_allowed mask is restored. 2041 * the cpu_allowed mask is restored.
2042 */ 2042 */
2043 static void sched_migrate_task(struct task_struct *p, int dest_cpu) 2043 static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2044 { 2044 {
2045 struct migration_req req; 2045 struct migration_req req;
2046 unsigned long flags; 2046 unsigned long flags;
2047 struct rq *rq; 2047 struct rq *rq;
2048 2048
2049 rq = task_rq_lock(p, &flags); 2049 rq = task_rq_lock(p, &flags);
2050 if (!cpu_isset(dest_cpu, p->cpus_allowed) 2050 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2051 || unlikely(cpu_is_offline(dest_cpu))) 2051 || unlikely(cpu_is_offline(dest_cpu)))
2052 goto out; 2052 goto out;
2053 2053
2054 /* force the process onto the specified CPU */ 2054 /* force the process onto the specified CPU */
2055 if (migrate_task(p, dest_cpu, &req)) { 2055 if (migrate_task(p, dest_cpu, &req)) {
2056 /* Need to wait for migration thread (might exit: take ref). */ 2056 /* Need to wait for migration thread (might exit: take ref). */
2057 struct task_struct *mt = rq->migration_thread; 2057 struct task_struct *mt = rq->migration_thread;
2058 2058
2059 get_task_struct(mt); 2059 get_task_struct(mt);
2060 task_rq_unlock(rq, &flags); 2060 task_rq_unlock(rq, &flags);
2061 wake_up_process(mt); 2061 wake_up_process(mt);
2062 put_task_struct(mt); 2062 put_task_struct(mt);
2063 wait_for_completion(&req.done); 2063 wait_for_completion(&req.done);
2064 2064
2065 return; 2065 return;
2066 } 2066 }
2067 out: 2067 out:
2068 task_rq_unlock(rq, &flags); 2068 task_rq_unlock(rq, &flags);
2069 } 2069 }
2070 2070
2071 /* 2071 /*
2072 * sched_exec - execve() is a valuable balancing opportunity, because at 2072 * sched_exec - execve() is a valuable balancing opportunity, because at
2073 * this point the task has the smallest effective memory and cache footprint. 2073 * this point the task has the smallest effective memory and cache footprint.
2074 */ 2074 */
2075 void sched_exec(void) 2075 void sched_exec(void)
2076 { 2076 {
2077 int new_cpu, this_cpu = get_cpu(); 2077 int new_cpu, this_cpu = get_cpu();
2078 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); 2078 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
2079 put_cpu(); 2079 put_cpu();
2080 if (new_cpu != this_cpu) 2080 if (new_cpu != this_cpu)
2081 sched_migrate_task(current, new_cpu); 2081 sched_migrate_task(current, new_cpu);
2082 } 2082 }
2083 2083
2084 /* 2084 /*
2085 * pull_task - move a task from a remote runqueue to the local runqueue. 2085 * pull_task - move a task from a remote runqueue to the local runqueue.
2086 * Both runqueues must be locked. 2086 * Both runqueues must be locked.
2087 */ 2087 */
2088 static void pull_task(struct rq *src_rq, struct prio_array *src_array, 2088 static void pull_task(struct rq *src_rq, struct prio_array *src_array,
2089 struct task_struct *p, struct rq *this_rq, 2089 struct task_struct *p, struct rq *this_rq,
2090 struct prio_array *this_array, int this_cpu) 2090 struct prio_array *this_array, int this_cpu)
2091 { 2091 {
2092 dequeue_task(p, src_array); 2092 dequeue_task(p, src_array);
2093 dec_nr_running(p, src_rq); 2093 dec_nr_running(p, src_rq);
2094 set_task_cpu(p, this_cpu); 2094 set_task_cpu(p, this_cpu);
2095 inc_nr_running(p, this_rq); 2095 inc_nr_running(p, this_rq);
2096 enqueue_task(p, this_array); 2096 enqueue_task(p, this_array);
2097 p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) 2097 p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
2098 + this_rq->most_recent_timestamp; 2098 + this_rq->most_recent_timestamp;
2099 /* 2099 /*
2100 * Note that idle threads have a prio of MAX_PRIO, for this test 2100 * Note that idle threads have a prio of MAX_PRIO, for this test
2101 * to be always true for them. 2101 * to be always true for them.
2102 */ 2102 */
2103 if (TASK_PREEMPTS_CURR(p, this_rq)) 2103 if (TASK_PREEMPTS_CURR(p, this_rq))
2104 resched_task(this_rq->curr); 2104 resched_task(this_rq->curr);
2105 } 2105 }
2106 2106
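/*
 * Note on the timestamp adjustment in pull_task(): p->timestamp is rebased
 * from the source runqueue's clock onto this runqueue's clock (subtract
 * src_rq->most_recent_timestamp, add this_rq->most_recent_timestamp), the
 * same rebasing wake_up_new_task() does when parent and child sit on
 * different CPUs, so later sleep/run-time arithmetic compares values taken
 * from the same per-cpu clock.
 */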
2107 /* 2107 /*
2108 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 2108 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2109 */ 2109 */
2110 static 2110 static
2111 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 2111 int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2112 struct sched_domain *sd, enum idle_type idle, 2112 struct sched_domain *sd, enum idle_type idle,
2113 int *all_pinned) 2113 int *all_pinned)
2114 { 2114 {
2115 /* 2115 /*
2116 * We do not migrate tasks that are: 2116 * We do not migrate tasks that are:
2117 * 1) running (obviously), or 2117 * 1) running (obviously), or
2118 * 2) not allowed onto this CPU due to cpus_allowed, or 2118 * 2) not allowed onto this CPU due to cpus_allowed, or
2119 * 3) cache-hot on their current CPU. 2119 * 3) cache-hot on their current CPU.
2120 */ 2120 */
2121 if (!cpu_isset(this_cpu, p->cpus_allowed)) 2121 if (!cpu_isset(this_cpu, p->cpus_allowed))
2122 return 0; 2122 return 0;
2123 *all_pinned = 0; 2123 *all_pinned = 0;
2124 2124
2125 if (task_running(rq, p)) 2125 if (task_running(rq, p))
2126 return 0; 2126 return 0;
2127 2127
2128 /* 2128 /*
2129 * Aggressive migration if: 2129 * Aggressive migration if:
2130 * 1) task is cache cold, or 2130 * 1) task is cache cold, or
2131 * 2) too many balance attempts have failed. 2131 * 2) too many balance attempts have failed.
2132 */ 2132 */
2133 2133
2134 if (sd->nr_balance_failed > sd->cache_nice_tries) { 2134 if (sd->nr_balance_failed > sd->cache_nice_tries) {
2135 #ifdef CONFIG_SCHEDSTATS 2135 #ifdef CONFIG_SCHEDSTATS
2136 if (task_hot(p, rq->most_recent_timestamp, sd)) 2136 if (task_hot(p, rq->most_recent_timestamp, sd))
2137 schedstat_inc(sd, lb_hot_gained[idle]); 2137 schedstat_inc(sd, lb_hot_gained[idle]);
2138 #endif 2138 #endif
2139 return 1; 2139 return 1;
2140 } 2140 }
2141 2141
2142 if (task_hot(p, rq->most_recent_timestamp, sd)) 2142 if (task_hot(p, rq->most_recent_timestamp, sd))
2143 return 0; 2143 return 0;
2144 return 1; 2144 return 1;
2145 } 2145 }
2146 2146
2147 #define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) 2147 #define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
2148 2148
2149 /* 2149 /*
2150 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted 2150 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2151 * load from busiest to this_rq, as part of a balancing operation within 2151 * load from busiest to this_rq, as part of a balancing operation within
2152 * "domain". Returns the number of tasks moved. 2152 * "domain". Returns the number of tasks moved.
2153 * 2153 *
2154 * Called with both runqueues locked. 2154 * Called with both runqueues locked.
2155 */ 2155 */
2156 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 2156 static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2157 unsigned long max_nr_move, unsigned long max_load_move, 2157 unsigned long max_nr_move, unsigned long max_load_move,
2158 struct sched_domain *sd, enum idle_type idle, 2158 struct sched_domain *sd, enum idle_type idle,
2159 int *all_pinned) 2159 int *all_pinned)
2160 { 2160 {
2161 int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, 2161 int idx, pulled = 0, pinned = 0, this_best_prio, best_prio,
2162 best_prio_seen, skip_for_load; 2162 best_prio_seen, skip_for_load;
2163 struct prio_array *array, *dst_array; 2163 struct prio_array *array, *dst_array;
2164 struct list_head *head, *curr; 2164 struct list_head *head, *curr;
2165 struct task_struct *tmp; 2165 struct task_struct *tmp;
2166 long rem_load_move; 2166 long rem_load_move;
2167 2167
2168 if (max_nr_move == 0 || max_load_move == 0) 2168 if (max_nr_move == 0 || max_load_move == 0)
2169 goto out; 2169 goto out;
2170 2170
2171 rem_load_move = max_load_move; 2171 rem_load_move = max_load_move;
2172 pinned = 1; 2172 pinned = 1;
2173 this_best_prio = rq_best_prio(this_rq); 2173 this_best_prio = rq_best_prio(this_rq);
2174 best_prio = rq_best_prio(busiest); 2174 best_prio = rq_best_prio(busiest);
2175 /* 2175 /*
2176 * Enable handling of the case where there is more than one task 2176 * Enable handling of the case where there is more than one task
2177 * with the best priority. If the currently running task is one 2177 * with the best priority. If the currently running task is one
2178 * of those with prio==best_prio we know it won't be moved 2178 * of those with prio==best_prio we know it won't be moved
2179 * and therefore it's safe to override the skip (based on load) of 2179 * and therefore it's safe to override the skip (based on load) of
2180 * any task we find with that prio. 2180 * any task we find with that prio.
2181 */ 2181 */
2182 best_prio_seen = best_prio == busiest->curr->prio; 2182 best_prio_seen = best_prio == busiest->curr->prio;
2183 2183
2184 /* 2184 /*
2185 * We first consider expired tasks. Those will likely not be 2185 * We first consider expired tasks. Those will likely not be
2186 * executed in the near future, and they are most likely to 2186 * executed in the near future, and they are most likely to
2187 * be cache-cold, thus switching CPUs has the least effect 2187 * be cache-cold, thus switching CPUs has the least effect
2188 * on them. 2188 * on them.
2189 */ 2189 */
2190 if (busiest->expired->nr_active) { 2190 if (busiest->expired->nr_active) {
2191 array = busiest->expired; 2191 array = busiest->expired;
2192 dst_array = this_rq->expired; 2192 dst_array = this_rq->expired;
2193 } else { 2193 } else {
2194 array = busiest->active; 2194 array = busiest->active;
2195 dst_array = this_rq->active; 2195 dst_array = this_rq->active;
2196 } 2196 }
2197 2197
2198 new_array: 2198 new_array:
2199 /* Start searching at priority 0: */ 2199 /* Start searching at priority 0: */
2200 idx = 0; 2200 idx = 0;
2201 skip_bitmap: 2201 skip_bitmap:
2202 if (!idx) 2202 if (!idx)
2203 idx = sched_find_first_bit(array->bitmap); 2203 idx = sched_find_first_bit(array->bitmap);
2204 else 2204 else
2205 idx = find_next_bit(array->bitmap, MAX_PRIO, idx); 2205 idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
2206 if (idx >= MAX_PRIO) { 2206 if (idx >= MAX_PRIO) {
2207 if (array == busiest->expired && busiest->active->nr_active) { 2207 if (array == busiest->expired && busiest->active->nr_active) {
2208 array = busiest->active; 2208 array = busiest->active;
2209 dst_array = this_rq->active; 2209 dst_array = this_rq->active;
2210 goto new_array; 2210 goto new_array;
2211 } 2211 }
2212 goto out; 2212 goto out;
2213 } 2213 }
2214 2214
2215 head = array->queue + idx; 2215 head = array->queue + idx;
2216 curr = head->prev; 2216 curr = head->prev;
2217 skip_queue: 2217 skip_queue:
2218 tmp = list_entry(curr, struct task_struct, run_list); 2218 tmp = list_entry(curr, struct task_struct, run_list);
2219 2219
2220 curr = curr->prev; 2220 curr = curr->prev;
2221 2221
2222 /* 2222 /*
2223 * To help distribute high priority tasks across CPUs we don't 2223 * To help distribute high priority tasks across CPUs we don't
2224 * skip a task if it will be the highest priority task (i.e. smallest 2224 * skip a task if it will be the highest priority task (i.e. smallest
2225 * prio value) on its new queue regardless of its load weight 2225 * prio value) on its new queue regardless of its load weight
2226 */ 2226 */
2227 skip_for_load = tmp->load_weight > rem_load_move; 2227 skip_for_load = tmp->load_weight > rem_load_move;
2228 if (skip_for_load && idx < this_best_prio) 2228 if (skip_for_load && idx < this_best_prio)
2229 skip_for_load = !best_prio_seen && idx == best_prio; 2229 skip_for_load = !best_prio_seen && idx == best_prio;
2230 if (skip_for_load || 2230 if (skip_for_load ||
2231 !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { 2231 !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
2232 2232
2233 best_prio_seen |= idx == best_prio; 2233 best_prio_seen |= idx == best_prio;
2234 if (curr != head) 2234 if (curr != head)
2235 goto skip_queue; 2235 goto skip_queue;
2236 idx++; 2236 idx++;
2237 goto skip_bitmap; 2237 goto skip_bitmap;
2238 } 2238 }
2239 2239
2240 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 2240 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
2241 pulled++; 2241 pulled++;
2242 rem_load_move -= tmp->load_weight; 2242 rem_load_move -= tmp->load_weight;
2243 2243
2244 /* 2244 /*
2245 * We only want to steal up to the prescribed number of tasks 2245 * We only want to steal up to the prescribed number of tasks
2246 * and the prescribed amount of weighted load. 2246 * and the prescribed amount of weighted load.
2247 */ 2247 */
2248 if (pulled < max_nr_move && rem_load_move > 0) { 2248 if (pulled < max_nr_move && rem_load_move > 0) {
2249 if (idx < this_best_prio) 2249 if (idx < this_best_prio)
2250 this_best_prio = idx; 2250 this_best_prio = idx;
2251 if (curr != head) 2251 if (curr != head)
2252 goto skip_queue; 2252 goto skip_queue;
2253 idx++; 2253 idx++;
2254 goto skip_bitmap; 2254 goto skip_bitmap;
2255 } 2255 }
2256 out: 2256 out:
2257 /* 2257 /*
2258 * Right now, this is the only place pull_task() is called, 2258 * Right now, this is the only place pull_task() is called,
2259 * so we can safely collect pull_task() stats here rather than 2259 * so we can safely collect pull_task() stats here rather than
2260 * inside pull_task(). 2260 * inside pull_task().
2261 */ 2261 */
2262 schedstat_add(sd, lb_gained[idle], pulled); 2262 schedstat_add(sd, lb_gained[idle], pulled);
2263 2263
2264 if (all_pinned) 2264 if (all_pinned)
2265 *all_pinned = pinned; 2265 *all_pinned = pinned;
2266 return pulled; 2266 return pulled;
2267 } 2267 }
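move_tasks() locates candidate tasks by scanning a per-array priority bitmap with sched_find_first_bit()/find_next_bit(), one bit per priority level. A rough standalone equivalent of that scan is sketched below; MAX_PRIO_BITS and find_next_set_bit() are names made up for the sketch, not kernel interfaces:

#include <stdio.h>

#define MAX_PRIO_BITS	140
#define BITS_PER_WORD	64
#define NWORDS		((MAX_PRIO_BITS + BITS_PER_WORD - 1) / BITS_PER_WORD)

/* find the lowest set bit at or above 'start', or MAX_PRIO_BITS if none */
static int find_next_set_bit(const unsigned long long *bitmap, int start)
{
	for (int i = start; i < MAX_PRIO_BITS; i++)
		if (bitmap[i / BITS_PER_WORD] & (1ULL << (i % BITS_PER_WORD)))
			return i;
	return MAX_PRIO_BITS;
}

int main(void)
{
	unsigned long long bitmap[NWORDS] = { 0 };

	/* pretend priorities 110 and 120 have runnable tasks queued */
	bitmap[110 / BITS_PER_WORD] |= 1ULL << (110 % BITS_PER_WORD);
	bitmap[120 / BITS_PER_WORD] |= 1ULL << (120 % BITS_PER_WORD);

	printf("%d\n", find_next_set_bit(bitmap, 0));	/* 110 */
	printf("%d\n", find_next_set_bit(bitmap, 111));	/* 120 */
	return 0;
}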
2268 2268
2269 /* 2269 /*
2270 * find_busiest_group finds and returns the busiest CPU group within the 2270 * find_busiest_group finds and returns the busiest CPU group within the
2271 * domain. It calculates and returns the amount of weighted load which 2271 * domain. It calculates and returns the amount of weighted load which
2272 * should be moved to restore balance via the imbalance parameter. 2272 * should be moved to restore balance via the imbalance parameter.
2273 */ 2273 */
2274 static struct sched_group * 2274 static struct sched_group *
2275 find_busiest_group(struct sched_domain *sd, int this_cpu, 2275 find_busiest_group(struct sched_domain *sd, int this_cpu,
2276 unsigned long *imbalance, enum idle_type idle, int *sd_idle, 2276 unsigned long *imbalance, enum idle_type idle, int *sd_idle,
2277 cpumask_t *cpus, int *balance) 2277 cpumask_t *cpus, int *balance)
2278 { 2278 {
2279 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2279 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2280 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2280 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
2281 unsigned long max_pull; 2281 unsigned long max_pull;
2282 unsigned long busiest_load_per_task, busiest_nr_running; 2282 unsigned long busiest_load_per_task, busiest_nr_running;
2283 unsigned long this_load_per_task, this_nr_running; 2283 unsigned long this_load_per_task, this_nr_running;
2284 int load_idx; 2284 int load_idx;
2285 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2285 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2286 int power_savings_balance = 1; 2286 int power_savings_balance = 1;
2287 unsigned long leader_nr_running = 0, min_load_per_task = 0; 2287 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2288 unsigned long min_nr_running = ULONG_MAX; 2288 unsigned long min_nr_running = ULONG_MAX;
2289 struct sched_group *group_min = NULL, *group_leader = NULL; 2289 struct sched_group *group_min = NULL, *group_leader = NULL;
2290 #endif 2290 #endif
2291 2291
2292 max_load = this_load = total_load = total_pwr = 0; 2292 max_load = this_load = total_load = total_pwr = 0;
2293 busiest_load_per_task = busiest_nr_running = 0; 2293 busiest_load_per_task = busiest_nr_running = 0;
2294 this_load_per_task = this_nr_running = 0; 2294 this_load_per_task = this_nr_running = 0;
2295 if (idle == NOT_IDLE) 2295 if (idle == NOT_IDLE)
2296 load_idx = sd->busy_idx; 2296 load_idx = sd->busy_idx;
2297 else if (idle == NEWLY_IDLE) 2297 else if (idle == NEWLY_IDLE)
2298 load_idx = sd->newidle_idx; 2298 load_idx = sd->newidle_idx;
2299 else 2299 else
2300 load_idx = sd->idle_idx; 2300 load_idx = sd->idle_idx;
2301 2301
2302 do { 2302 do {
2303 unsigned long load, group_capacity; 2303 unsigned long load, group_capacity;
2304 int local_group; 2304 int local_group;
2305 int i; 2305 int i;
2306 unsigned int balance_cpu = -1, first_idle_cpu = 0; 2306 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2307 unsigned long sum_nr_running, sum_weighted_load; 2307 unsigned long sum_nr_running, sum_weighted_load;
2308 2308
2309 local_group = cpu_isset(this_cpu, group->cpumask); 2309 local_group = cpu_isset(this_cpu, group->cpumask);
2310 2310
2311 if (local_group) 2311 if (local_group)
2312 balance_cpu = first_cpu(group->cpumask); 2312 balance_cpu = first_cpu(group->cpumask);
2313 2313
2314 /* Tally up the load of all CPUs in the group */ 2314 /* Tally up the load of all CPUs in the group */
2315 sum_weighted_load = sum_nr_running = avg_load = 0; 2315 sum_weighted_load = sum_nr_running = avg_load = 0;
2316 2316
2317 for_each_cpu_mask(i, group->cpumask) { 2317 for_each_cpu_mask(i, group->cpumask) {
2318 struct rq *rq; 2318 struct rq *rq;
2319 2319
2320 if (!cpu_isset(i, *cpus)) 2320 if (!cpu_isset(i, *cpus))
2321 continue; 2321 continue;
2322 2322
2323 rq = cpu_rq(i); 2323 rq = cpu_rq(i);
2324 2324
2325 if (*sd_idle && !idle_cpu(i)) 2325 if (*sd_idle && !idle_cpu(i))
2326 *sd_idle = 0; 2326 *sd_idle = 0;
2327 2327
2328 /* Bias balancing toward cpus of our domain */ 2328 /* Bias balancing toward cpus of our domain */
2329 if (local_group) { 2329 if (local_group) {
2330 if (idle_cpu(i) && !first_idle_cpu) { 2330 if (idle_cpu(i) && !first_idle_cpu) {
2331 first_idle_cpu = 1; 2331 first_idle_cpu = 1;
2332 balance_cpu = i; 2332 balance_cpu = i;
2333 } 2333 }
2334 2334
2335 load = target_load(i, load_idx); 2335 load = target_load(i, load_idx);
2336 } else 2336 } else
2337 load = source_load(i, load_idx); 2337 load = source_load(i, load_idx);
2338 2338
2339 avg_load += load; 2339 avg_load += load;
2340 sum_nr_running += rq->nr_running; 2340 sum_nr_running += rq->nr_running;
2341 sum_weighted_load += rq->raw_weighted_load; 2341 sum_weighted_load += rq->raw_weighted_load;
2342 } 2342 }
2343 2343
2344 /* 2344 /*
2345 * The first idle cpu or the first cpu (busiest) in this sched group 2345 * The first idle cpu or the first cpu (busiest) in this sched group
2346 * is eligible for doing load balancing at this and higher-level 2346 * is eligible for doing load balancing at this and higher-level
2347 * domains. 2347 * domains.
2348 */ 2348 */
2349 if (local_group && balance_cpu != this_cpu && balance) { 2349 if (local_group && balance_cpu != this_cpu && balance) {
2350 *balance = 0; 2350 *balance = 0;
2351 goto ret; 2351 goto ret;
2352 } 2352 }
2353 2353
2354 total_load += avg_load; 2354 total_load += avg_load;
2355 total_pwr += group->cpu_power; 2355 total_pwr += group->cpu_power;
2356 2356
2357 /* Adjust by relative CPU power of the group */ 2357 /* Adjust by relative CPU power of the group */
2358 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 2358 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
2359 2359
2360 group_capacity = group->cpu_power / SCHED_LOAD_SCALE; 2360 group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
2361 2361
2362 if (local_group) { 2362 if (local_group) {
2363 this_load = avg_load; 2363 this_load = avg_load;
2364 this = group; 2364 this = group;
2365 this_nr_running = sum_nr_running; 2365 this_nr_running = sum_nr_running;
2366 this_load_per_task = sum_weighted_load; 2366 this_load_per_task = sum_weighted_load;
2367 } else if (avg_load > max_load && 2367 } else if (avg_load > max_load &&
2368 sum_nr_running > group_capacity) { 2368 sum_nr_running > group_capacity) {
2369 max_load = avg_load; 2369 max_load = avg_load;
2370 busiest = group; 2370 busiest = group;
2371 busiest_nr_running = sum_nr_running; 2371 busiest_nr_running = sum_nr_running;
2372 busiest_load_per_task = sum_weighted_load; 2372 busiest_load_per_task = sum_weighted_load;
2373 } 2373 }
2374 2374
2375 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2375 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2376 /* 2376 /*
2377 * Busy processors will not participate in power savings 2377 * Busy processors will not participate in power savings
2378 * balance. 2378 * balance.
2379 */ 2379 */
2380 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2380 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2381 goto group_next; 2381 goto group_next;
2382 2382
2383 /* 2383 /*
2384 * If the local group is idle or completely loaded 2384 * If the local group is idle or completely loaded
2385 * no need to do power savings balance at this domain 2385 * no need to do power savings balance at this domain
2386 */ 2386 */
2387 if (local_group && (this_nr_running >= group_capacity || 2387 if (local_group && (this_nr_running >= group_capacity ||
2388 !this_nr_running)) 2388 !this_nr_running))
2389 power_savings_balance = 0; 2389 power_savings_balance = 0;
2390 2390
2391 /* 2391 /*
2392 * If a group is already running at full capacity or idle, 2392 * If a group is already running at full capacity or idle,
2393 * don't include that group in power savings calculations 2393 * don't include that group in power savings calculations
2394 */ 2394 */
2395 if (!power_savings_balance || sum_nr_running >= group_capacity 2395 if (!power_savings_balance || sum_nr_running >= group_capacity
2396 || !sum_nr_running) 2396 || !sum_nr_running)
2397 goto group_next; 2397 goto group_next;
2398 2398
2399 /* 2399 /*
2400 * Calculate the group which has the least non-idle load. 2400 * Calculate the group which has the least non-idle load.
2401 * This is the group from which we need to pick up the load 2401 * This is the group from which we need to pick up the load
2402 * to save power. 2402 * to save power.
2403 */ 2403 */
2404 if ((sum_nr_running < min_nr_running) || 2404 if ((sum_nr_running < min_nr_running) ||
2405 (sum_nr_running == min_nr_running && 2405 (sum_nr_running == min_nr_running &&
2406 first_cpu(group->cpumask) < 2406 first_cpu(group->cpumask) <
2407 first_cpu(group_min->cpumask))) { 2407 first_cpu(group_min->cpumask))) {
2408 group_min = group; 2408 group_min = group;
2409 min_nr_running = sum_nr_running; 2409 min_nr_running = sum_nr_running;
2410 min_load_per_task = sum_weighted_load / 2410 min_load_per_task = sum_weighted_load /
2411 sum_nr_running; 2411 sum_nr_running;
2412 } 2412 }
2413 2413
2414 /* 2414 /*
2415 * Calculate the group which is nearly at its 2415 * Calculate the group which is nearly at its
2416 * capacity but still has some room to pick up load 2416 * capacity but still has some room to pick up load
2417 * from another group and save more power. 2417 * from another group and save more power.
2418 */ 2418 */
2419 if (sum_nr_running <= group_capacity - 1) { 2419 if (sum_nr_running <= group_capacity - 1) {
2420 if (sum_nr_running > leader_nr_running || 2420 if (sum_nr_running > leader_nr_running ||
2421 (sum_nr_running == leader_nr_running && 2421 (sum_nr_running == leader_nr_running &&
2422 first_cpu(group->cpumask) > 2422 first_cpu(group->cpumask) >
2423 first_cpu(group_leader->cpumask))) { 2423 first_cpu(group_leader->cpumask))) {
2424 group_leader = group; 2424 group_leader = group;
2425 leader_nr_running = sum_nr_running; 2425 leader_nr_running = sum_nr_running;
2426 } 2426 }
2427 } 2427 }
2428 group_next: 2428 group_next:
2429 #endif 2429 #endif
2430 group = group->next; 2430 group = group->next;
2431 } while (group != sd->groups); 2431 } while (group != sd->groups);
2432 2432
2433 if (!busiest || this_load >= max_load || busiest_nr_running == 0) 2433 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2434 goto out_balanced; 2434 goto out_balanced;
2435 2435
2436 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; 2436 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2437 2437
2438 if (this_load >= avg_load || 2438 if (this_load >= avg_load ||
2439 100*max_load <= sd->imbalance_pct*this_load) 2439 100*max_load <= sd->imbalance_pct*this_load)
2440 goto out_balanced; 2440 goto out_balanced;
2441 2441
2442 busiest_load_per_task /= busiest_nr_running; 2442 busiest_load_per_task /= busiest_nr_running;
2443 /* 2443 /*
2444 * We're trying to get all the cpus to the average_load, so we don't 2444 * We're trying to get all the cpus to the average_load, so we don't
2445 * want to push ourselves above the average load, nor do we wish to 2445 * want to push ourselves above the average load, nor do we wish to
2446 * reduce the max loaded cpu below the average load, as either of these 2446 * reduce the max loaded cpu below the average load, as either of these
2447 * actions would just result in more rebalancing later, and ping-pong 2447 * actions would just result in more rebalancing later, and ping-pong
2448 * tasks around. Thus we look for the minimum possible imbalance. 2448 * tasks around. Thus we look for the minimum possible imbalance.
2449 * Negative imbalances (*we* are more loaded than anyone else) will 2449 * Negative imbalances (*we* are more loaded than anyone else) will
2450 * be counted as no imbalance for these purposes -- we can't fix that 2450 * be counted as no imbalance for these purposes -- we can't fix that
2451 * by pulling tasks to us. Be careful of negative numbers as they'll 2451 * by pulling tasks to us. Be careful of negative numbers as they'll
2452 * appear as very large values with unsigned longs. 2452 * appear as very large values with unsigned longs.
2453 */ 2453 */
2454 if (max_load <= busiest_load_per_task) 2454 if (max_load <= busiest_load_per_task)
2455 goto out_balanced; 2455 goto out_balanced;
2456 2456
2457 /* 2457 /*
2458 * In the presence of smp nice balancing, certain scenarios can have 2458 * In the presence of smp nice balancing, certain scenarios can have
2459 * max load less than avg load (as we skip the groups at or below 2459 * max load less than avg load (as we skip the groups at or below
2460 * their cpu_power while calculating max_load). 2460 * their cpu_power while calculating max_load).
2461 */ 2461 */
2462 if (max_load < avg_load) { 2462 if (max_load < avg_load) {
2463 *imbalance = 0; 2463 *imbalance = 0;
2464 goto small_imbalance; 2464 goto small_imbalance;
2465 } 2465 }
2466 2466
2467 /* Don't want to pull so many tasks that a group would go idle */ 2467 /* Don't want to pull so many tasks that a group would go idle */
2468 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); 2468 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2469 2469
2470 /* How much load to actually move to equalise the imbalance */ 2470 /* How much load to actually move to equalise the imbalance */
2471 *imbalance = min(max_pull * busiest->cpu_power, 2471 *imbalance = min(max_pull * busiest->cpu_power,
2472 (avg_load - this_load) * this->cpu_power) 2472 (avg_load - this_load) * this->cpu_power)
2473 / SCHED_LOAD_SCALE; 2473 / SCHED_LOAD_SCALE;
2474 2474
2475 /* 2475 /*
2476 * if *imbalance is less than the average load per runnable task 2476 * if *imbalance is less than the average load per runnable task
2477 * there is no guarantee that any tasks will be moved, so we 2477 * there is no guarantee that any tasks will be moved, so we
2478 * consider bumping its value to force at least one task to be 2478 * consider bumping its value to force at least one task to be
2479 * moved. 2479 * moved.
2480 */ 2480 */
2481 if (*imbalance < busiest_load_per_task) { 2481 if (*imbalance < busiest_load_per_task) {
2482 unsigned long tmp, pwr_now, pwr_move; 2482 unsigned long tmp, pwr_now, pwr_move;
2483 unsigned int imbn; 2483 unsigned int imbn;
2484 2484
2485 small_imbalance: 2485 small_imbalance:
2486 pwr_move = pwr_now = 0; 2486 pwr_move = pwr_now = 0;
2487 imbn = 2; 2487 imbn = 2;
2488 if (this_nr_running) { 2488 if (this_nr_running) {
2489 this_load_per_task /= this_nr_running; 2489 this_load_per_task /= this_nr_running;
2490 if (busiest_load_per_task > this_load_per_task) 2490 if (busiest_load_per_task > this_load_per_task)
2491 imbn = 1; 2491 imbn = 1;
2492 } else 2492 } else
2493 this_load_per_task = SCHED_LOAD_SCALE; 2493 this_load_per_task = SCHED_LOAD_SCALE;
2494 2494
2495 if (max_load - this_load >= busiest_load_per_task * imbn) { 2495 if (max_load - this_load >= busiest_load_per_task * imbn) {
2496 *imbalance = busiest_load_per_task; 2496 *imbalance = busiest_load_per_task;
2497 return busiest; 2497 return busiest;
2498 } 2498 }
2499 2499
2500 /* 2500 /*
2501 * OK, we don't have enough imbalance to justify moving tasks; 2501 * OK, we don't have enough imbalance to justify moving tasks;
2502 * however, we may be able to increase total CPU power used by 2502 * however, we may be able to increase total CPU power used by
2503 * moving them. 2503 * moving them.
2504 */ 2504 */
2505 2505
2506 pwr_now += busiest->cpu_power * 2506 pwr_now += busiest->cpu_power *
2507 min(busiest_load_per_task, max_load); 2507 min(busiest_load_per_task, max_load);
2508 pwr_now += this->cpu_power * 2508 pwr_now += this->cpu_power *
2509 min(this_load_per_task, this_load); 2509 min(this_load_per_task, this_load);
2510 pwr_now /= SCHED_LOAD_SCALE; 2510 pwr_now /= SCHED_LOAD_SCALE;
2511 2511
2512 /* Amount of load we'd subtract */ 2512 /* Amount of load we'd subtract */
2513 tmp = busiest_load_per_task * SCHED_LOAD_SCALE / 2513 tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
2514 busiest->cpu_power; 2514 busiest->cpu_power;
2515 if (max_load > tmp) 2515 if (max_load > tmp)
2516 pwr_move += busiest->cpu_power * 2516 pwr_move += busiest->cpu_power *
2517 min(busiest_load_per_task, max_load - tmp); 2517 min(busiest_load_per_task, max_load - tmp);
2518 2518
2519 /* Amount of load we'd add */ 2519 /* Amount of load we'd add */
2520 if (max_load * busiest->cpu_power < 2520 if (max_load * busiest->cpu_power <
2521 busiest_load_per_task * SCHED_LOAD_SCALE) 2521 busiest_load_per_task * SCHED_LOAD_SCALE)
2522 tmp = max_load * busiest->cpu_power / this->cpu_power; 2522 tmp = max_load * busiest->cpu_power / this->cpu_power;
2523 else 2523 else
2524 tmp = busiest_load_per_task * SCHED_LOAD_SCALE / 2524 tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
2525 this->cpu_power; 2525 this->cpu_power;
2526 pwr_move += this->cpu_power * 2526 pwr_move += this->cpu_power *
2527 min(this_load_per_task, this_load + tmp); 2527 min(this_load_per_task, this_load + tmp);
2528 pwr_move /= SCHED_LOAD_SCALE; 2528 pwr_move /= SCHED_LOAD_SCALE;
2529 2529
2530 /* Move if we gain throughput */ 2530 /* Move if we gain throughput */
2531 if (pwr_move <= pwr_now) 2531 if (pwr_move <= pwr_now)
2532 goto out_balanced; 2532 goto out_balanced;
2533 2533
2534 *imbalance = busiest_load_per_task; 2534 *imbalance = busiest_load_per_task;
2535 } 2535 }
2536 2536
2537 return busiest; 2537 return busiest;
2538 2538
2539 out_balanced: 2539 out_balanced:
2540 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2540 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2541 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) 2541 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2542 goto ret; 2542 goto ret;
2543 2543
2544 if (this == group_leader && group_leader != group_min) { 2544 if (this == group_leader && group_leader != group_min) {
2545 *imbalance = min_load_per_task; 2545 *imbalance = min_load_per_task;
2546 return group_min; 2546 return group_min;
2547 } 2547 }
2548 #endif 2548 #endif
2549 ret: 2549 ret:
2550 *imbalance = 0; 2550 *imbalance = 0;
2551 return NULL; 2551 return NULL;
2552 } 2552 }
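For a concrete feel for the imbalance computed above, here is a small userspace sketch of the max_pull/imbalance arithmetic with made-up load figures; SCHED_LOAD_SCALE is taken as 128 purely for illustration:

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long max_load = 900, avg_load = 600, this_load = 300;
	unsigned long busiest_load_per_task = 200;
	unsigned long busiest_power = 128, this_power = 128;

	/* don't pull so much that the busiest group would go idle */
	unsigned long max_pull = min_ul(max_load - avg_load,
					max_load - busiest_load_per_task);

	/* how much weighted load to move to equalise the imbalance */
	unsigned long imbalance = min_ul(max_pull * busiest_power,
					 (avg_load - this_load) * this_power)
					/ SCHED_LOAD_SCALE;

	printf("max_pull=%lu imbalance=%lu\n", max_pull, imbalance);
	/* max_pull = min(300, 700) = 300; imbalance = min(300, 300) = 300 */
	return 0;
}

With these numbers both terms of the min() agree, so 300 units of weighted load would be requested from the busiest group.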
2553 2553
2554 /* 2554 /*
2555 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2555 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2556 */ 2556 */
2557 static struct rq * 2557 static struct rq *
2558 find_busiest_queue(struct sched_group *group, enum idle_type idle, 2558 find_busiest_queue(struct sched_group *group, enum idle_type idle,
2559 unsigned long imbalance, cpumask_t *cpus) 2559 unsigned long imbalance, cpumask_t *cpus)
2560 { 2560 {
2561 struct rq *busiest = NULL, *rq; 2561 struct rq *busiest = NULL, *rq;
2562 unsigned long max_load = 0; 2562 unsigned long max_load = 0;
2563 int i; 2563 int i;
2564 2564
2565 for_each_cpu_mask(i, group->cpumask) { 2565 for_each_cpu_mask(i, group->cpumask) {
2566 2566
2567 if (!cpu_isset(i, *cpus)) 2567 if (!cpu_isset(i, *cpus))
2568 continue; 2568 continue;
2569 2569
2570 rq = cpu_rq(i); 2570 rq = cpu_rq(i);
2571 2571
2572 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance) 2572 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance)
2573 continue; 2573 continue;
2574 2574
2575 if (rq->raw_weighted_load > max_load) { 2575 if (rq->raw_weighted_load > max_load) {
2576 max_load = rq->raw_weighted_load; 2576 max_load = rq->raw_weighted_load;
2577 busiest = rq; 2577 busiest = rq;
2578 } 2578 }
2579 } 2579 }
2580 2580
2581 return busiest; 2581 return busiest;
2582 } 2582 }
2583 2583
2584 /* 2584 /*
2585 * Max backoff if we encounter pinned tasks. Pretty arbitrary value; it 2585 * Max backoff if we encounter pinned tasks. Pretty arbitrary value; it
2586 * only needs to be large enough. 2586 * only needs to be large enough.
2587 */ 2587 */
2588 #define MAX_PINNED_INTERVAL 512 2588 #define MAX_PINNED_INTERVAL 512
2589 2589
2590 static inline unsigned long minus_1_or_zero(unsigned long n) 2590 static inline unsigned long minus_1_or_zero(unsigned long n)
2591 { 2591 {
2592 return n > 0 ? n - 1 : 0; 2592 return n > 0 ? n - 1 : 0;
2593 } 2593 }
2594 2594
2595 /* 2595 /*
2596 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2596 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2597 * tasks if there is an imbalance. 2597 * tasks if there is an imbalance.
2598 */ 2598 */
2599 static int load_balance(int this_cpu, struct rq *this_rq, 2599 static int load_balance(int this_cpu, struct rq *this_rq,
2600 struct sched_domain *sd, enum idle_type idle, 2600 struct sched_domain *sd, enum idle_type idle,
2601 int *balance) 2601 int *balance)
2602 { 2602 {
2603 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 2603 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2604 struct sched_group *group; 2604 struct sched_group *group;
2605 unsigned long imbalance; 2605 unsigned long imbalance;
2606 struct rq *busiest; 2606 struct rq *busiest;
2607 cpumask_t cpus = CPU_MASK_ALL; 2607 cpumask_t cpus = CPU_MASK_ALL;
2608 unsigned long flags; 2608 unsigned long flags;
2609 2609
2610 /* 2610 /*
2611 * When power savings policy is enabled for the parent domain, idle 2611 * When power savings policy is enabled for the parent domain, idle
2612 * sibling can pick up load irrespective of busy siblings. In this case, 2612 * sibling can pick up load irrespective of busy siblings. In this case,
2613 * let the state of idle sibling percolate up as IDLE, instead of 2613 * let the state of idle sibling percolate up as IDLE, instead of
2614 * portraying it as NOT_IDLE. 2614 * portraying it as NOT_IDLE.
2615 */ 2615 */
2616 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && 2616 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2617 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2617 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2618 sd_idle = 1; 2618 sd_idle = 1;
2619 2619
2620 schedstat_inc(sd, lb_cnt[idle]); 2620 schedstat_inc(sd, lb_cnt[idle]);
2621 2621
2622 redo: 2622 redo:
2623 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 2623 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2624 &cpus, balance); 2624 &cpus, balance);
2625 2625
2626 if (*balance == 0) 2626 if (*balance == 0)
2627 goto out_balanced; 2627 goto out_balanced;
2628 2628
2629 if (!group) { 2629 if (!group) {
2630 schedstat_inc(sd, lb_nobusyg[idle]); 2630 schedstat_inc(sd, lb_nobusyg[idle]);
2631 goto out_balanced; 2631 goto out_balanced;
2632 } 2632 }
2633 2633
2634 busiest = find_busiest_queue(group, idle, imbalance, &cpus); 2634 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
2635 if (!busiest) { 2635 if (!busiest) {
2636 schedstat_inc(sd, lb_nobusyq[idle]); 2636 schedstat_inc(sd, lb_nobusyq[idle]);
2637 goto out_balanced; 2637 goto out_balanced;
2638 } 2638 }
2639 2639
2640 BUG_ON(busiest == this_rq); 2640 BUG_ON(busiest == this_rq);
2641 2641
2642 schedstat_add(sd, lb_imbalance[idle], imbalance); 2642 schedstat_add(sd, lb_imbalance[idle], imbalance);
2643 2643
2644 nr_moved = 0; 2644 nr_moved = 0;
2645 if (busiest->nr_running > 1) { 2645 if (busiest->nr_running > 1) {
2646 /* 2646 /*
2647 * Attempt to move tasks. If find_busiest_group has found 2647 * Attempt to move tasks. If find_busiest_group has found
2648 * an imbalance but busiest->nr_running <= 1, the group is 2648 * an imbalance but busiest->nr_running <= 1, the group is
2649 * still unbalanced. nr_moved simply stays zero, so it is 2649 * still unbalanced. nr_moved simply stays zero, so it is
2650 * correctly treated as an imbalance. 2650 * correctly treated as an imbalance.
2651 */ 2651 */
2652 local_irq_save(flags); 2652 local_irq_save(flags);
2653 double_rq_lock(this_rq, busiest); 2653 double_rq_lock(this_rq, busiest);
2654 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2654 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2655 minus_1_or_zero(busiest->nr_running), 2655 minus_1_or_zero(busiest->nr_running),
2656 imbalance, sd, idle, &all_pinned); 2656 imbalance, sd, idle, &all_pinned);
2657 double_rq_unlock(this_rq, busiest); 2657 double_rq_unlock(this_rq, busiest);
2658 local_irq_restore(flags); 2658 local_irq_restore(flags);
2659 2659
2660 /* All tasks on this runqueue were pinned by CPU affinity */ 2660 /* All tasks on this runqueue were pinned by CPU affinity */
2661 if (unlikely(all_pinned)) { 2661 if (unlikely(all_pinned)) {
2662 cpu_clear(cpu_of(busiest), cpus); 2662 cpu_clear(cpu_of(busiest), cpus);
2663 if (!cpus_empty(cpus)) 2663 if (!cpus_empty(cpus))
2664 goto redo; 2664 goto redo;
2665 goto out_balanced; 2665 goto out_balanced;
2666 } 2666 }
2667 } 2667 }
2668 2668
2669 if (!nr_moved) { 2669 if (!nr_moved) {
2670 schedstat_inc(sd, lb_failed[idle]); 2670 schedstat_inc(sd, lb_failed[idle]);
2671 sd->nr_balance_failed++; 2671 sd->nr_balance_failed++;
2672 2672
2673 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 2673 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2674 2674
2675 spin_lock_irqsave(&busiest->lock, flags); 2675 spin_lock_irqsave(&busiest->lock, flags);
2676 2676
2677 /* don't kick the migration_thread if the curr 2677 /* don't kick the migration_thread if the curr
2678 * task on busiest cpu can't be moved to this_cpu 2678 * task on busiest cpu can't be moved to this_cpu
2679 */ 2679 */
2680 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { 2680 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2681 spin_unlock_irqrestore(&busiest->lock, flags); 2681 spin_unlock_irqrestore(&busiest->lock, flags);
2682 all_pinned = 1; 2682 all_pinned = 1;
2683 goto out_one_pinned; 2683 goto out_one_pinned;
2684 } 2684 }
2685 2685
2686 if (!busiest->active_balance) { 2686 if (!busiest->active_balance) {
2687 busiest->active_balance = 1; 2687 busiest->active_balance = 1;
2688 busiest->push_cpu = this_cpu; 2688 busiest->push_cpu = this_cpu;
2689 active_balance = 1; 2689 active_balance = 1;
2690 } 2690 }
2691 spin_unlock_irqrestore(&busiest->lock, flags); 2691 spin_unlock_irqrestore(&busiest->lock, flags);
2692 if (active_balance) 2692 if (active_balance)
2693 wake_up_process(busiest->migration_thread); 2693 wake_up_process(busiest->migration_thread);
2694 2694
2695 /* 2695 /*
2696 * We've kicked active balancing, reset the failure 2696 * We've kicked active balancing, reset the failure
2697 * counter. 2697 * counter.
2698 */ 2698 */
2699 sd->nr_balance_failed = sd->cache_nice_tries+1; 2699 sd->nr_balance_failed = sd->cache_nice_tries+1;
2700 } 2700 }
2701 } else 2701 } else
2702 sd->nr_balance_failed = 0; 2702 sd->nr_balance_failed = 0;
2703 2703
2704 if (likely(!active_balance)) { 2704 if (likely(!active_balance)) {
2705 /* We were unbalanced, so reset the balancing interval */ 2705 /* We were unbalanced, so reset the balancing interval */
2706 sd->balance_interval = sd->min_interval; 2706 sd->balance_interval = sd->min_interval;
2707 } else { 2707 } else {
2708 /* 2708 /*
2709 * If we've begun active balancing, start to back off. This 2709 * If we've begun active balancing, start to back off. This
2710 * case may not be covered by the all_pinned logic if there 2710 * case may not be covered by the all_pinned logic if there
2711 * is only 1 task on the busy runqueue (because we don't call 2711 * is only 1 task on the busy runqueue (because we don't call
2712 * move_tasks). 2712 * move_tasks).
2713 */ 2713 */
2714 if (sd->balance_interval < sd->max_interval) 2714 if (sd->balance_interval < sd->max_interval)
2715 sd->balance_interval *= 2; 2715 sd->balance_interval *= 2;
2716 } 2716 }
2717 2717
2718 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2718 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2719 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2719 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2720 return -1; 2720 return -1;
2721 return nr_moved; 2721 return nr_moved;
2722 2722
2723 out_balanced: 2723 out_balanced:
2724 schedstat_inc(sd, lb_balanced[idle]); 2724 schedstat_inc(sd, lb_balanced[idle]);
2725 2725
2726 sd->nr_balance_failed = 0; 2726 sd->nr_balance_failed = 0;
2727 2727
2728 out_one_pinned: 2728 out_one_pinned:
2729 /* tune up the balancing interval */ 2729 /* tune up the balancing interval */
2730 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || 2730 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2731 (sd->balance_interval < sd->max_interval)) 2731 (sd->balance_interval < sd->max_interval))
2732 sd->balance_interval *= 2; 2732 sd->balance_interval *= 2;
2733 2733
2734 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2734 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2735 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2735 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2736 return -1; 2736 return -1;
2737 return 0; 2737 return 0;
2738 } 2738 }
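The tail of load_balance() adjusts sd->balance_interval: it is reset to the minimum once tasks were actually moved, and doubled up to a cap while balancing keeps failing or tasks stay pinned. A toy standalone model of that backoff, not kernel code:

#include <stdio.h>

struct toy_domain {
	unsigned long balance_interval;
	unsigned long min_interval;
	unsigned long max_interval;
};

static void backoff(struct toy_domain *sd)
{
	/* double the interval, but only while still below the cap */
	if (sd->balance_interval < sd->max_interval)
		sd->balance_interval *= 2;
}

static void reset(struct toy_domain *sd)
{
	/* an imbalance was fixed, balance eagerly again */
	sd->balance_interval = sd->min_interval;
}

int main(void)
{
	struct toy_domain sd = { .balance_interval = 8,
				 .min_interval = 8, .max_interval = 64 };

	for (int i = 0; i < 5; i++) {
		backoff(&sd);
		printf("interval=%lu\n", sd.balance_interval);
	}
	/* prints 16, 32, 64, 64, 64: doubling stops at the cap */

	reset(&sd);
	printf("after reset: %lu\n", sd.balance_interval);	/* 8 */
	return 0;
}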
2739 2739
2740 /* 2740 /*
2741 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2741 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2742 * tasks if there is an imbalance. 2742 * tasks if there is an imbalance.
2743 * 2743 *
2744 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). 2744 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
2745 * this_rq is locked. 2745 * this_rq is locked.
2746 */ 2746 */
2747 static int 2747 static int
2748 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) 2748 load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2749 { 2749 {
2750 struct sched_group *group; 2750 struct sched_group *group;
2751 struct rq *busiest = NULL; 2751 struct rq *busiest = NULL;
2752 unsigned long imbalance; 2752 unsigned long imbalance;
2753 int nr_moved = 0; 2753 int nr_moved = 0;
2754 int sd_idle = 0; 2754 int sd_idle = 0;
2755 cpumask_t cpus = CPU_MASK_ALL; 2755 cpumask_t cpus = CPU_MASK_ALL;
2756 2756
2757 /* 2757 /*
2758 * When power savings policy is enabled for the parent domain, idle 2758 * When power savings policy is enabled for the parent domain, idle
2759 * sibling can pick up load irrespective of busy siblings. In this case, 2759 * sibling can pick up load irrespective of busy siblings. In this case,
2760 * let the state of idle sibling percolate up as IDLE, instead of 2760 * let the state of idle sibling percolate up as IDLE, instead of
2761 * portraying it as NOT_IDLE. 2761 * portraying it as NOT_IDLE.
2762 */ 2762 */
2763 if (sd->flags & SD_SHARE_CPUPOWER && 2763 if (sd->flags & SD_SHARE_CPUPOWER &&
2764 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2764 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2765 sd_idle = 1; 2765 sd_idle = 1;
2766 2766
2767 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2767 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2768 redo: 2768 redo:
2769 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, 2769 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
2770 &sd_idle, &cpus, NULL); 2770 &sd_idle, &cpus, NULL);
2771 if (!group) { 2771 if (!group) {
2772 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2772 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2773 goto out_balanced; 2773 goto out_balanced;
2774 } 2774 }
2775 2775
2776 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance, 2776 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance,
2777 &cpus); 2777 &cpus);
2778 if (!busiest) { 2778 if (!busiest) {
2779 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2779 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2780 goto out_balanced; 2780 goto out_balanced;
2781 } 2781 }
2782 2782
2783 BUG_ON(busiest == this_rq); 2783 BUG_ON(busiest == this_rq);
2784 2784
2785 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); 2785 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2786 2786
2787 nr_moved = 0; 2787 nr_moved = 0;
2788 if (busiest->nr_running > 1) { 2788 if (busiest->nr_running > 1) {
2789 /* Attempt to move tasks */ 2789 /* Attempt to move tasks */
2790 double_lock_balance(this_rq, busiest); 2790 double_lock_balance(this_rq, busiest);
2791 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2791 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2792 minus_1_or_zero(busiest->nr_running), 2792 minus_1_or_zero(busiest->nr_running),
2793 imbalance, sd, NEWLY_IDLE, NULL); 2793 imbalance, sd, NEWLY_IDLE, NULL);
2794 spin_unlock(&busiest->lock); 2794 spin_unlock(&busiest->lock);
2795 2795
2796 if (!nr_moved) { 2796 if (!nr_moved) {
2797 cpu_clear(cpu_of(busiest), cpus); 2797 cpu_clear(cpu_of(busiest), cpus);
2798 if (!cpus_empty(cpus)) 2798 if (!cpus_empty(cpus))
2799 goto redo; 2799 goto redo;
2800 } 2800 }
2801 } 2801 }
2802 2802
2803 if (!nr_moved) { 2803 if (!nr_moved) {
2804 schedstat_inc(sd, lb_failed[NEWLY_IDLE]); 2804 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2805 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2805 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2806 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2806 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2807 return -1; 2807 return -1;
2808 } else 2808 } else
2809 sd->nr_balance_failed = 0; 2809 sd->nr_balance_failed = 0;
2810 2810
2811 return nr_moved; 2811 return nr_moved;
2812 2812
2813 out_balanced: 2813 out_balanced:
2814 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2814 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2815 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 2815 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2816 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 2816 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2817 return -1; 2817 return -1;
2818 sd->nr_balance_failed = 0; 2818 sd->nr_balance_failed = 0;
2819 2819
2820 return 0; 2820 return 0;
2821 } 2821 }
2822 2822
2823 /* 2823 /*
2824 * idle_balance is called by schedule() if this_cpu is about to become 2824 * idle_balance is called by schedule() if this_cpu is about to become
2825 * idle. Attempts to pull tasks from other CPUs. 2825 * idle. Attempts to pull tasks from other CPUs.
2826 */ 2826 */
2827 static void idle_balance(int this_cpu, struct rq *this_rq) 2827 static void idle_balance(int this_cpu, struct rq *this_rq)
2828 { 2828 {
2829 struct sched_domain *sd; 2829 struct sched_domain *sd;
2830 int pulled_task = 0; 2830 int pulled_task = 0;
2831 unsigned long next_balance = jiffies + 60 * HZ; 2831 unsigned long next_balance = jiffies + 60 * HZ;
2832 2832
2833 for_each_domain(this_cpu, sd) { 2833 for_each_domain(this_cpu, sd) {
2834 if (sd->flags & SD_BALANCE_NEWIDLE) { 2834 if (sd->flags & SD_BALANCE_NEWIDLE) {
2835 /* If we've pulled tasks over, stop searching: */ 2835 /* If we've pulled tasks over, stop searching: */
2836 pulled_task = load_balance_newidle(this_cpu, 2836 pulled_task = load_balance_newidle(this_cpu,
2837 this_rq, sd); 2837 this_rq, sd);
2838 if (time_after(next_balance, 2838 if (time_after(next_balance,
2839 sd->last_balance + sd->balance_interval)) 2839 sd->last_balance + sd->balance_interval))
2840 next_balance = sd->last_balance 2840 next_balance = sd->last_balance
2841 + sd->balance_interval; 2841 + sd->balance_interval;
2842 if (pulled_task) 2842 if (pulled_task)
2843 break; 2843 break;
2844 } 2844 }
2845 } 2845 }
2846 if (!pulled_task) 2846 if (!pulled_task)
2847 /* 2847 /*
2848 * We are going idle. next_balance may be set based on 2848 * We are going idle. next_balance may be set based on
2849 * a busy processor. So reset next_balance. 2849 * a busy processor. So reset next_balance.
2850 */ 2850 */
2851 this_rq->next_balance = next_balance; 2851 this_rq->next_balance = next_balance;
2852 } 2852 }
2853 2853
2854 /* 2854 /*
2855 * active_load_balance is run by migration threads. It pushes running tasks 2855 * active_load_balance is run by migration threads. It pushes running tasks
2856 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be 2856 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2857 * running on each physical CPU where possible, and avoids physical / 2857 * running on each physical CPU where possible, and avoids physical /
2858 * logical imbalances. 2858 * logical imbalances.
2859 * 2859 *
2860 * Called with busiest_rq locked. 2860 * Called with busiest_rq locked.
2861 */ 2861 */
2862 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) 2862 static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2863 { 2863 {
2864 int target_cpu = busiest_rq->push_cpu; 2864 int target_cpu = busiest_rq->push_cpu;
2865 struct sched_domain *sd; 2865 struct sched_domain *sd;
2866 struct rq *target_rq; 2866 struct rq *target_rq;
2867 2867
2868 /* Is there any task to move? */ 2868 /* Is there any task to move? */
2869 if (busiest_rq->nr_running <= 1) 2869 if (busiest_rq->nr_running <= 1)
2870 return; 2870 return;
2871 2871
2872 target_rq = cpu_rq(target_cpu); 2872 target_rq = cpu_rq(target_cpu);
2873 2873
2874 /* 2874 /*
2875 * This condition is "impossible"; if it occurs 2875 * This condition is "impossible"; if it occurs
2876 * we need to fix it. Originally reported by 2876 * we need to fix it. Originally reported by
2877 * Bjorn Helgaas on a 128-cpu setup. 2877 * Bjorn Helgaas on a 128-cpu setup.
2878 */ 2878 */
2879 BUG_ON(busiest_rq == target_rq); 2879 BUG_ON(busiest_rq == target_rq);
2880 2880
2881 /* move a task from busiest_rq to target_rq */ 2881 /* move a task from busiest_rq to target_rq */
2882 double_lock_balance(busiest_rq, target_rq); 2882 double_lock_balance(busiest_rq, target_rq);
2883 2883
2884 /* Search for an sd spanning us and the target CPU. */ 2884 /* Search for an sd spanning us and the target CPU. */
2885 for_each_domain(target_cpu, sd) { 2885 for_each_domain(target_cpu, sd) {
2886 if ((sd->flags & SD_LOAD_BALANCE) && 2886 if ((sd->flags & SD_LOAD_BALANCE) &&
2887 cpu_isset(busiest_cpu, sd->span)) 2887 cpu_isset(busiest_cpu, sd->span))
2888 break; 2888 break;
2889 } 2889 }
2890 2890
2891 if (likely(sd)) { 2891 if (likely(sd)) {
2892 schedstat_inc(sd, alb_cnt); 2892 schedstat_inc(sd, alb_cnt);
2893 2893
2894 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, 2894 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
2895 RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, 2895 RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE,
2896 NULL)) 2896 NULL))
2897 schedstat_inc(sd, alb_pushed); 2897 schedstat_inc(sd, alb_pushed);
2898 else 2898 else
2899 schedstat_inc(sd, alb_failed); 2899 schedstat_inc(sd, alb_failed);
2900 } 2900 }
2901 spin_unlock(&target_rq->lock); 2901 spin_unlock(&target_rq->lock);
2902 } 2902 }
2903 2903
2904 static void update_load(struct rq *this_rq) 2904 static void update_load(struct rq *this_rq)
2905 { 2905 {
2906 unsigned long this_load; 2906 unsigned long this_load;
2907 unsigned int i, scale; 2907 unsigned int i, scale;
2908 2908
2909 this_load = this_rq->raw_weighted_load; 2909 this_load = this_rq->raw_weighted_load;
2910 2910
2911 /* Update our load: */ 2911 /* Update our load: */
2912 for (i = 0, scale = 1; i < 3; i++, scale += scale) { 2912 for (i = 0, scale = 1; i < 3; i++, scale += scale) {
2913 unsigned long old_load, new_load; 2913 unsigned long old_load, new_load;
2914 2914
2915 /* scale is effectively 1 << i now, and >> i divides by scale */ 2915 /* scale is effectively 1 << i now, and >> i divides by scale */
2916 2916
2917 old_load = this_rq->cpu_load[i]; 2917 old_load = this_rq->cpu_load[i];
2918 new_load = this_load; 2918 new_load = this_load;
2919 /* 2919 /*
2920 * Round up the averaging division if load is increasing. This 2920 * Round up the averaging division if load is increasing. This
2921 * prevents us from getting stuck on 9 if the load is 10, for 2921 * prevents us from getting stuck on 9 if the load is 10, for
2922 * example. 2922 * example.
2923 */ 2923 */
2924 if (new_load > old_load) 2924 if (new_load > old_load)
2925 new_load += scale-1; 2925 new_load += scale-1;
2926 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 2926 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
2927 } 2927 }
2928 } 2928 }
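update_load() above keeps several exponentially decaying averages of the runqueue load, with index i giving the old value a weight of (2^i - 1)/2^i. A small userspace sketch, under illustrative inputs, showing why the round-up matters when the load is rising:

#include <stdio.h>

static unsigned long decay(unsigned long old_load, unsigned long new_load,
			   unsigned int i)
{
	unsigned long scale = 1UL << i;		/* 1, 2, 4, ... */

	if (new_load > old_load)
		new_load += scale - 1;		/* round the division up */

	return (old_load * (scale - 1) + new_load) >> i;
}

int main(void)
{
	unsigned long load = 0;

	/* feed a constant load of 10 into the i == 1 average */
	for (int tick = 0; tick < 6; tick++) {
		load = decay(load, 10, 1);
		printf("tick %d: %lu\n", tick, load);	/* 5 8 9 10 10 10 */
	}
	/* without the round-up the sequence would stall at 9 */
	return 0;
}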
2929 2929
2930 /* 2930 /*
2931 * run_rebalance_domains is triggered when needed from the scheduler tick. 2931 * run_rebalance_domains is triggered when needed from the scheduler tick.
2932 * 2932 *
2933 * It checks each scheduling domain to see if it is due to be balanced, 2933 * It checks each scheduling domain to see if it is due to be balanced,
2934 * and initiates a balancing operation if so. 2934 * and initiates a balancing operation if so.
2935 * 2935 *
2936 * Balancing parameters are set up in arch_init_sched_domains. 2936 * Balancing parameters are set up in arch_init_sched_domains.
2937 */ 2937 */
2938 static DEFINE_SPINLOCK(balancing); 2938 static DEFINE_SPINLOCK(balancing);
2939 2939
2940 static void run_rebalance_domains(struct softirq_action *h) 2940 static void run_rebalance_domains(struct softirq_action *h)
2941 { 2941 {
2942 int this_cpu = smp_processor_id(), balance = 1; 2942 int this_cpu = smp_processor_id(), balance = 1;
2943 struct rq *this_rq = cpu_rq(this_cpu); 2943 struct rq *this_rq = cpu_rq(this_cpu);
2944 unsigned long interval; 2944 unsigned long interval;
2945 struct sched_domain *sd; 2945 struct sched_domain *sd;
2946 /* 2946 /*
2947 * We are idle if there are no processes running. This 2947 * We are idle if there are no processes running. This
2948 * is valid even if we are the idle process (SMT). 2948 * is valid even if we are the idle process (SMT).
2949 */ 2949 */
2950 enum idle_type idle = !this_rq->nr_running ? 2950 enum idle_type idle = !this_rq->nr_running ?
2951 SCHED_IDLE : NOT_IDLE; 2951 SCHED_IDLE : NOT_IDLE;
2952 /* Earliest time when we have to call run_rebalance_domains again */ 2952 /* Earliest time when we have to call run_rebalance_domains again */
2953 unsigned long next_balance = jiffies + 60*HZ; 2953 unsigned long next_balance = jiffies + 60*HZ;
2954 2954
2955 for_each_domain(this_cpu, sd) { 2955 for_each_domain(this_cpu, sd) {
2956 if (!(sd->flags & SD_LOAD_BALANCE)) 2956 if (!(sd->flags & SD_LOAD_BALANCE))
2957 continue; 2957 continue;
2958 2958
2959 interval = sd->balance_interval; 2959 interval = sd->balance_interval;
2960 if (idle != SCHED_IDLE) 2960 if (idle != SCHED_IDLE)
2961 interval *= sd->busy_factor; 2961 interval *= sd->busy_factor;
2962 2962
2963 /* scale ms to jiffies */ 2963 /* scale ms to jiffies */
2964 interval = msecs_to_jiffies(interval); 2964 interval = msecs_to_jiffies(interval);
2965 if (unlikely(!interval)) 2965 if (unlikely(!interval))
2966 interval = 1; 2966 interval = 1;
2967 2967
2968 if (sd->flags & SD_SERIALIZE) { 2968 if (sd->flags & SD_SERIALIZE) {
2969 if (!spin_trylock(&balancing)) 2969 if (!spin_trylock(&balancing))
2970 goto out; 2970 goto out;
2971 } 2971 }
2972 2972
2973 if (time_after_eq(jiffies, sd->last_balance + interval)) { 2973 if (time_after_eq(jiffies, sd->last_balance + interval)) {
2974 if (load_balance(this_cpu, this_rq, sd, idle, &balance)) { 2974 if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
2975 /* 2975 /*
2976 * We've pulled tasks over so either we're no 2976 * We've pulled tasks over so either we're no
2977 * longer idle, or one of our SMT siblings is 2977 * longer idle, or one of our SMT siblings is
2978 * not idle. 2978 * not idle.
2979 */ 2979 */
2980 idle = NOT_IDLE; 2980 idle = NOT_IDLE;
2981 } 2981 }
2982 sd->last_balance = jiffies; 2982 sd->last_balance = jiffies;
2983 } 2983 }
2984 if (sd->flags & SD_SERIALIZE) 2984 if (sd->flags & SD_SERIALIZE)
2985 spin_unlock(&balancing); 2985 spin_unlock(&balancing);
2986 out: 2986 out:
2987 if (time_after(next_balance, sd->last_balance + interval)) 2987 if (time_after(next_balance, sd->last_balance + interval))
2988 next_balance = sd->last_balance + interval; 2988 next_balance = sd->last_balance + interval;
2989 2989
2990 /* 2990 /*
2991 * Stop the load balance at this level. There is another 2991 * Stop the load balance at this level. There is another
2992 * CPU in our sched group which is doing load balancing more 2992 * CPU in our sched group which is doing load balancing more
2993 * actively. 2993 * actively.
2994 */ 2994 */
2995 if (!balance) 2995 if (!balance)
2996 break; 2996 break;
2997 } 2997 }
2998 this_rq->next_balance = next_balance; 2998 this_rq->next_balance = next_balance;
2999 } 2999 }
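The per-domain interval used above is the base balance_interval in milliseconds, stretched by busy_factor when the CPU is not idle, converted to jiffies, and floored at one jiffy. A userspace sketch of that computation; TOY_HZ and toy_msecs_to_jiffies() are stand-ins, not the kernel's helpers:

#include <stdio.h>

#define TOY_HZ 250

static unsigned long toy_msecs_to_jiffies(unsigned long ms)
{
	return (ms * TOY_HZ + 999) / 1000;	/* round up */
}

static unsigned long rebalance_interval(unsigned long interval_ms,
					unsigned int busy_factor, int idle)
{
	unsigned long interval = interval_ms;

	if (!idle)
		interval *= busy_factor;	/* balance less often when busy */

	interval = toy_msecs_to_jiffies(interval);
	if (!interval)
		interval = 1;			/* never drop below one jiffy */
	return interval;
}

int main(void)
{
	printf("idle: %lu jiffies\n", rebalance_interval(8, 32, 1));	/* 2 */
	printf("busy: %lu jiffies\n", rebalance_interval(8, 32, 0));	/* 64 */
	return 0;
}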
3000 #else 3000 #else
3001 /* 3001 /*
3002 * on UP we do not need to balance between CPUs: 3002 * on UP we do not need to balance between CPUs:
3003 */ 3003 */
3004 static inline void idle_balance(int cpu, struct rq *rq) 3004 static inline void idle_balance(int cpu, struct rq *rq)
3005 { 3005 {
3006 } 3006 }
3007 #endif 3007 #endif
3008 3008
3009 DEFINE_PER_CPU(struct kernel_stat, kstat); 3009 DEFINE_PER_CPU(struct kernel_stat, kstat);
3010 3010
3011 EXPORT_PER_CPU_SYMBOL(kstat); 3011 EXPORT_PER_CPU_SYMBOL(kstat);
3012 3012
3013 /* 3013 /*
3014 * This is called on clock ticks and on context switches. 3014 * This is called on clock ticks and on context switches.
3015 * Bank in p->sched_time the ns elapsed since the last tick or switch. 3015 * Bank in p->sched_time the ns elapsed since the last tick or switch.
3016 */ 3016 */
3017 static inline void 3017 static inline void
3018 update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) 3018 update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
3019 { 3019 {
3020 p->sched_time += now - p->last_ran; 3020 p->sched_time += now - p->last_ran;
3021 p->last_ran = rq->most_recent_timestamp = now; 3021 p->last_ran = rq->most_recent_timestamp = now;
3022 } 3022 }
3023 3023
3024 /* 3024 /*
3025 * Return current->sched_time plus any more ns on the sched_clock 3025 * Return current->sched_time plus any more ns on the sched_clock
3026 * that have not yet been banked. 3026 * that have not yet been banked.
3027 */ 3027 */
3028 unsigned long long current_sched_time(const struct task_struct *p) 3028 unsigned long long current_sched_time(const struct task_struct *p)
3029 { 3029 {
3030 unsigned long long ns; 3030 unsigned long long ns;
3031 unsigned long flags; 3031 unsigned long flags;
3032 3032
3033 local_irq_save(flags); 3033 local_irq_save(flags);
3034 ns = p->sched_time + sched_clock() - p->last_ran; 3034 ns = p->sched_time + sched_clock() - p->last_ran;
3035 local_irq_restore(flags); 3035 local_irq_restore(flags);
3036 3036
3037 return ns; 3037 return ns;
3038 } 3038 }
3039 3039
3040 /* 3040 /*
3041 * We place interactive tasks back into the active array, if possible. 3041 * We place interactive tasks back into the active array, if possible.
3042 * 3042 *
3043 * To guarantee that this does not starve expired tasks we ignore the 3043 * To guarantee that this does not starve expired tasks we ignore the
3044 * interactivity of a task if the first expired task had to wait more 3044 * interactivity of a task if the first expired task had to wait more
3045 * than a 'reasonable' amount of time. This deadline timeout is 3045 * than a 'reasonable' amount of time. This deadline timeout is
3046 * load-dependent, as the frequency of array switches decreases with 3046 * load-dependent, as the frequency of array switches decreases with
3047 * an increasing number of running tasks. We also ignore the interactivity 3047 * an increasing number of running tasks. We also ignore the interactivity
3048 * if a better static_prio task has expired: 3048 * if a better static_prio task has expired:
3049 */ 3049 */
3050 static inline int expired_starving(struct rq *rq) 3050 static inline int expired_starving(struct rq *rq)
3051 { 3051 {
3052 if (rq->curr->static_prio > rq->best_expired_prio) 3052 if (rq->curr->static_prio > rq->best_expired_prio)
3053 return 1; 3053 return 1;
3054 if (!STARVATION_LIMIT || !rq->expired_timestamp) 3054 if (!STARVATION_LIMIT || !rq->expired_timestamp)
3055 return 0; 3055 return 0;
3056 if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running) 3056 if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
3057 return 1; 3057 return 1;
3058 return 0; 3058 return 0;
3059 } 3059 }
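A quick worked example of the load-dependent deadline described above (the numbers are purely illustrative): if STARVATION_LIMIT were 100 jiffies and the runqueue had 4 running tasks, expired_starving() would start returning 1 once the first expired task had waited more than 100 * 4 = 400 jiffies; from that point task_running_tick() stops reinserting interactive tasks into the active array and queues them to the expired array, so the starved waiters eventually get the CPU.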
3060 3060
3061 /* 3061 /*
3062 * Account user cpu time to a process. 3062 * Account user cpu time to a process.
3063 * @p: the process that the cpu time gets accounted to 3063 * @p: the process that the cpu time gets accounted to
3065 * @cputime: the cpu time spent in user space since the last update 3065 * @cputime: the cpu time spent in user space since the last update
3066 */ 3066 */
3067 void account_user_time(struct task_struct *p, cputime_t cputime) 3067 void account_user_time(struct task_struct *p, cputime_t cputime)
3068 { 3068 {
3069 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3069 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3070 cputime64_t tmp; 3070 cputime64_t tmp;
3071 3071
3072 p->utime = cputime_add(p->utime, cputime); 3072 p->utime = cputime_add(p->utime, cputime);
3073 3073
3074 /* Add user time to cpustat. */ 3074 /* Add user time to cpustat. */
3075 tmp = cputime_to_cputime64(cputime); 3075 tmp = cputime_to_cputime64(cputime);
3076 if (TASK_NICE(p) > 0) 3076 if (TASK_NICE(p) > 0)
3077 cpustat->nice = cputime64_add(cpustat->nice, tmp); 3077 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3078 else 3078 else
3079 cpustat->user = cputime64_add(cpustat->user, tmp); 3079 cpustat->user = cputime64_add(cpustat->user, tmp);
3080 } 3080 }
3081 3081
3082 /* 3082 /*
3083 * Account system cpu time to a process. 3083 * Account system cpu time to a process.
3084 * @p: the process that the cpu time gets accounted to 3084 * @p: the process that the cpu time gets accounted to
3085 * @hardirq_offset: the offset to subtract from hardirq_count() 3085 * @hardirq_offset: the offset to subtract from hardirq_count()
3086 * @cputime: the cpu time spent in kernel space since the last update 3086 * @cputime: the cpu time spent in kernel space since the last update
3087 */ 3087 */
3088 void account_system_time(struct task_struct *p, int hardirq_offset, 3088 void account_system_time(struct task_struct *p, int hardirq_offset,
3089 cputime_t cputime) 3089 cputime_t cputime)
3090 { 3090 {
3091 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3091 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3092 struct rq *rq = this_rq(); 3092 struct rq *rq = this_rq();
3093 cputime64_t tmp; 3093 cputime64_t tmp;
3094 3094
3095 p->stime = cputime_add(p->stime, cputime); 3095 p->stime = cputime_add(p->stime, cputime);
3096 3096
3097 /* Add system time to cpustat. */ 3097 /* Add system time to cpustat. */
3098 tmp = cputime_to_cputime64(cputime); 3098 tmp = cputime_to_cputime64(cputime);
3099 if (hardirq_count() - hardirq_offset) 3099 if (hardirq_count() - hardirq_offset)
3100 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3100 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3101 else if (softirq_count()) 3101 else if (softirq_count())
3102 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3102 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3103 else if (p != rq->idle) 3103 else if (p != rq->idle)
3104 cpustat->system = cputime64_add(cpustat->system, tmp); 3104 cpustat->system = cputime64_add(cpustat->system, tmp);
3105 else if (atomic_read(&rq->nr_iowait) > 0) 3105 else if (atomic_read(&rq->nr_iowait) > 0)
3106 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 3106 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3107 else 3107 else
3108 cpustat->idle = cputime64_add(cpustat->idle, tmp); 3108 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3109 /* Account for system time used */ 3109 /* Account for system time used */
3110 acct_update_integrals(p); 3110 acct_update_integrals(p);
3111 } 3111 }
3112 3112
3113 /* 3113 /*
3114 * Account for involuntary wait time. 3114 * Account for involuntary wait time.
3115 * @p: the process from which the cpu time has been stolen 3115 * @p: the process from which the cpu time has been stolen
3116 * @steal: the cpu time spent in involuntary wait 3116 * @steal: the cpu time spent in involuntary wait
3117 */ 3117 */
3118 void account_steal_time(struct task_struct *p, cputime_t steal) 3118 void account_steal_time(struct task_struct *p, cputime_t steal)
3119 { 3119 {
3120 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3120 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3121 cputime64_t tmp = cputime_to_cputime64(steal); 3121 cputime64_t tmp = cputime_to_cputime64(steal);
3122 struct rq *rq = this_rq(); 3122 struct rq *rq = this_rq();
3123 3123
3124 if (p == rq->idle) { 3124 if (p == rq->idle) {
3125 p->stime = cputime_add(p->stime, steal); 3125 p->stime = cputime_add(p->stime, steal);
3126 if (atomic_read(&rq->nr_iowait) > 0) 3126 if (atomic_read(&rq->nr_iowait) > 0)
3127 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 3127 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3128 else 3128 else
3129 cpustat->idle = cputime64_add(cpustat->idle, tmp); 3129 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3130 } else 3130 } else
3131 cpustat->steal = cputime64_add(cpustat->steal, tmp); 3131 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3132 } 3132 }
3133 3133
3134 static void task_running_tick(struct rq *rq, struct task_struct *p) 3134 static void task_running_tick(struct rq *rq, struct task_struct *p)
3135 { 3135 {
3136 if (p->array != rq->active) { 3136 if (p->array != rq->active) {
3137 /* Task has expired but was not scheduled yet */ 3137 /* Task has expired but was not scheduled yet */
3138 set_tsk_need_resched(p); 3138 set_tsk_need_resched(p);
3139 return; 3139 return;
3140 } 3140 }
3141 spin_lock(&rq->lock); 3141 spin_lock(&rq->lock);
3142 /* 3142 /*
3143 * The task was running during this tick - update the 3143 * The task was running during this tick - update the
3144 * time slice counter. Note: we do not update a thread's 3144 * time slice counter. Note: we do not update a thread's
3145 * priority until it either goes to sleep or uses up its 3145 * priority until it either goes to sleep or uses up its
3146 * timeslice. This makes it possible for interactive tasks 3146 * timeslice. This makes it possible for interactive tasks
3147 * to use up their timeslices at their highest priority levels. 3147 * to use up their timeslices at their highest priority levels.
3148 */ 3148 */
3149 if (rt_task(p)) { 3149 if (rt_task(p)) {
3150 /* 3150 /*
3151 * RR tasks need a special form of timeslice management. 3151 * RR tasks need a special form of timeslice management.
3152 * FIFO tasks have no timeslices. 3152 * FIFO tasks have no timeslices.
3153 */ 3153 */
3154 if ((p->policy == SCHED_RR) && !--p->time_slice) { 3154 if ((p->policy == SCHED_RR) && !--p->time_slice) {
3155 p->time_slice = task_timeslice(p); 3155 p->time_slice = task_timeslice(p);
3156 p->first_time_slice = 0; 3156 p->first_time_slice = 0;
3157 set_tsk_need_resched(p); 3157 set_tsk_need_resched(p);
3158 3158
3159 /* put it at the end of the queue: */ 3159 /* put it at the end of the queue: */
3160 requeue_task(p, rq->active); 3160 requeue_task(p, rq->active);
3161 } 3161 }
3162 goto out_unlock; 3162 goto out_unlock;
3163 } 3163 }
3164 if (!--p->time_slice) { 3164 if (!--p->time_slice) {
3165 dequeue_task(p, rq->active); 3165 dequeue_task(p, rq->active);
3166 set_tsk_need_resched(p); 3166 set_tsk_need_resched(p);
3167 p->prio = effective_prio(p); 3167 p->prio = effective_prio(p);
3168 p->time_slice = task_timeslice(p); 3168 p->time_slice = task_timeslice(p);
3169 p->first_time_slice = 0; 3169 p->first_time_slice = 0;
3170 3170
3171 if (!rq->expired_timestamp) 3171 if (!rq->expired_timestamp)
3172 rq->expired_timestamp = jiffies; 3172 rq->expired_timestamp = jiffies;
3173 if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { 3173 if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
3174 enqueue_task(p, rq->expired); 3174 enqueue_task(p, rq->expired);
3175 if (p->static_prio < rq->best_expired_prio) 3175 if (p->static_prio < rq->best_expired_prio)
3176 rq->best_expired_prio = p->static_prio; 3176 rq->best_expired_prio = p->static_prio;
3177 } else 3177 } else
3178 enqueue_task(p, rq->active); 3178 enqueue_task(p, rq->active);
3179 } else { 3179 } else {
3180 /* 3180 /*
3181 * Prevent a too long timeslice allowing a task to monopolize 3181 * Prevent a too long timeslice allowing a task to monopolize
3182 * the CPU. We do this by splitting up the timeslice into 3182 * the CPU. We do this by splitting up the timeslice into
3183 * smaller pieces. 3183 * smaller pieces.
3184 * 3184 *
3185 * Note: this does not mean the task's timeslices expire or 3185 * Note: this does not mean the task's timeslices expire or
3186 * get lost in any way, they just might be preempted by 3186 * get lost in any way, they just might be preempted by
3187 * another task of equal priority. (one with higher 3187 * another task of equal priority. (one with higher
3188 * priority would have preempted this task already.) We 3188 * priority would have preempted this task already.) We
3189 * requeue this task to the end of the list on this priority 3189 * requeue this task to the end of the list on this priority
3190 * level, which is in essence a round-robin of tasks with 3190 * level, which is in essence a round-robin of tasks with
3191 * equal priority. 3191 * equal priority.
3192 * 3192 *
3193 * This only applies to tasks in the interactive 3193 * This only applies to tasks in the interactive
3194 * delta range with at least TIMESLICE_GRANULARITY to requeue. 3194 * delta range with at least TIMESLICE_GRANULARITY to requeue.
3195 */ 3195 */
3196 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - 3196 if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
3197 p->time_slice) % TIMESLICE_GRANULARITY(p)) && 3197 p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
3198 (p->time_slice >= TIMESLICE_GRANULARITY(p)) && 3198 (p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
3199 (p->array == rq->active)) { 3199 (p->array == rq->active)) {
3200 3200
3201 requeue_task(p, rq->active); 3201 requeue_task(p, rq->active);
3202 set_tsk_need_resched(p); 3202 set_tsk_need_resched(p);
3203 } 3203 }
3204 } 3204 }
3205 out_unlock: 3205 out_unlock:
3206 spin_unlock(&rq->lock); 3206 spin_unlock(&rq->lock);
3207 } 3207 }
3208 3208
3209 /* 3209 /*
3210 * This function gets called by the timer code, with HZ frequency. 3210 * This function gets called by the timer code, with HZ frequency.
3211 * We call it with interrupts disabled. 3211 * We call it with interrupts disabled.
3212 * 3212 *
3213 * It also gets called by the fork code, when changing the parent's 3213 * It also gets called by the fork code, when changing the parent's
3214 * timeslices. 3214 * timeslices.
3215 */ 3215 */
3216 void scheduler_tick(void) 3216 void scheduler_tick(void)
3217 { 3217 {
3218 unsigned long long now = sched_clock(); 3218 unsigned long long now = sched_clock();
3219 struct task_struct *p = current; 3219 struct task_struct *p = current;
3220 int cpu = smp_processor_id(); 3220 int cpu = smp_processor_id();
3221 struct rq *rq = cpu_rq(cpu); 3221 struct rq *rq = cpu_rq(cpu);
3222 3222
3223 update_cpu_clock(p, rq, now); 3223 update_cpu_clock(p, rq, now);
3224 3224
3225 if (p != rq->idle) 3225 if (p != rq->idle)
3226 task_running_tick(rq, p); 3226 task_running_tick(rq, p);
3227 #ifdef CONFIG_SMP 3227 #ifdef CONFIG_SMP
3228 update_load(rq); 3228 update_load(rq);
3229 if (time_after_eq(jiffies, rq->next_balance)) 3229 if (time_after_eq(jiffies, rq->next_balance))
3230 raise_softirq(SCHED_SOFTIRQ); 3230 raise_softirq(SCHED_SOFTIRQ);
3231 #endif 3231 #endif
3232 } 3232 }
3233 3233
3234 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) 3234 #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3235 3235
3236 void fastcall add_preempt_count(int val) 3236 void fastcall add_preempt_count(int val)
3237 { 3237 {
3238 /* 3238 /*
3239 * Underflow? 3239 * Underflow?
3240 */ 3240 */
3241 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 3241 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3242 return; 3242 return;
3243 preempt_count() += val; 3243 preempt_count() += val;
3244 /* 3244 /*
3245 * Spinlock count overflowing soon? 3245 * Spinlock count overflowing soon?
3246 */ 3246 */
3247 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 3247 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3248 PREEMPT_MASK - 10); 3248 PREEMPT_MASK - 10);
3249 } 3249 }
3250 EXPORT_SYMBOL(add_preempt_count); 3250 EXPORT_SYMBOL(add_preempt_count);
3251 3251
3252 void fastcall sub_preempt_count(int val) 3252 void fastcall sub_preempt_count(int val)
3253 { 3253 {
3254 /* 3254 /*
3255 * Underflow? 3255 * Underflow?
3256 */ 3256 */
3257 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) 3257 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3258 return; 3258 return;
3259 /* 3259 /*
3260 * Is the spinlock portion underflowing? 3260 * Is the spinlock portion underflowing?
3261 */ 3261 */
3262 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 3262 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3263 !(preempt_count() & PREEMPT_MASK))) 3263 !(preempt_count() & PREEMPT_MASK)))
3264 return; 3264 return;
3265 3265
3266 preempt_count() -= val; 3266 preempt_count() -= val;
3267 } 3267 }
3268 EXPORT_SYMBOL(sub_preempt_count); 3268 EXPORT_SYMBOL(sub_preempt_count);
3269 3269
3270 #endif 3270 #endif
3271 3271
3272 static inline int interactive_sleep(enum sleep_type sleep_type) 3272 static inline int interactive_sleep(enum sleep_type sleep_type)
3273 { 3273 {
3274 return (sleep_type == SLEEP_INTERACTIVE || 3274 return (sleep_type == SLEEP_INTERACTIVE ||
3275 sleep_type == SLEEP_INTERRUPTED); 3275 sleep_type == SLEEP_INTERRUPTED);
3276 } 3276 }
3277 3277
3278 /* 3278 /*
3279 * schedule() is the main scheduler function. 3279 * schedule() is the main scheduler function.
3280 */ 3280 */
3281 asmlinkage void __sched schedule(void) 3281 asmlinkage void __sched schedule(void)
3282 { 3282 {
3283 struct task_struct *prev, *next; 3283 struct task_struct *prev, *next;
3284 struct prio_array *array; 3284 struct prio_array *array;
3285 struct list_head *queue; 3285 struct list_head *queue;
3286 unsigned long long now; 3286 unsigned long long now;
3287 unsigned long run_time; 3287 unsigned long run_time;
3288 int cpu, idx, new_prio; 3288 int cpu, idx, new_prio;
3289 long *switch_count; 3289 long *switch_count;
3290 struct rq *rq; 3290 struct rq *rq;
3291 3291
3292 /* 3292 /*
3293 * Test if we are atomic. Since do_exit() needs to call into 3293 * Test if we are atomic. Since do_exit() needs to call into
3294 * schedule() atomically, we ignore that path for now. 3294 * schedule() atomically, we ignore that path for now.
3295 * Otherwise, whine if we are scheduling when we should not be. 3295 * Otherwise, whine if we are scheduling when we should not be.
3296 */ 3296 */
3297 if (unlikely(in_atomic() && !current->exit_state)) { 3297 if (unlikely(in_atomic() && !current->exit_state)) {
3298 printk(KERN_ERR "BUG: scheduling while atomic: " 3298 printk(KERN_ERR "BUG: scheduling while atomic: "
3299 "%s/0x%08x/%d\n", 3299 "%s/0x%08x/%d\n",
3300 current->comm, preempt_count(), current->pid); 3300 current->comm, preempt_count(), current->pid);
3301 debug_show_held_locks(current); 3301 debug_show_held_locks(current);
3302 if (irqs_disabled()) 3302 if (irqs_disabled())
3303 print_irqtrace_events(current); 3303 print_irqtrace_events(current);
3304 dump_stack(); 3304 dump_stack();
3305 } 3305 }
3306 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3306 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3307 3307
3308 need_resched: 3308 need_resched:
3309 preempt_disable(); 3309 preempt_disable();
3310 prev = current; 3310 prev = current;
3311 release_kernel_lock(prev); 3311 release_kernel_lock(prev);
3312 need_resched_nonpreemptible: 3312 need_resched_nonpreemptible:
3313 rq = this_rq(); 3313 rq = this_rq();
3314 3314
3315 /* 3315 /*
3316 * The idle thread is not allowed to schedule! 3316 * The idle thread is not allowed to schedule!
3317 * Remove this check after it has been exercised a bit. 3317 * Remove this check after it has been exercised a bit.
3318 */ 3318 */
3319 if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) { 3319 if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
3320 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 3320 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
3321 dump_stack(); 3321 dump_stack();
3322 } 3322 }
3323 3323
3324 schedstat_inc(rq, sched_cnt); 3324 schedstat_inc(rq, sched_cnt);
3325 now = sched_clock(); 3325 now = sched_clock();
3326 if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) { 3326 if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
3327 run_time = now - prev->timestamp; 3327 run_time = now - prev->timestamp;
3328 if (unlikely((long long)(now - prev->timestamp) < 0)) 3328 if (unlikely((long long)(now - prev->timestamp) < 0))
3329 run_time = 0; 3329 run_time = 0;
3330 } else 3330 } else
3331 run_time = NS_MAX_SLEEP_AVG; 3331 run_time = NS_MAX_SLEEP_AVG;
3332 3332
3333 /* 3333 /*
3334 * Tasks are charged proportionately less run_time at high sleep_avg to 3334 * Tasks are charged proportionately less run_time at high sleep_avg to
3335 * delay them losing their interactive status 3335 * delay them losing their interactive status
3336 */ 3336 */
3337 run_time /= (CURRENT_BONUS(prev) ? : 1); 3337 run_time /= (CURRENT_BONUS(prev) ? : 1);
3338 3338
3339 spin_lock_irq(&rq->lock); 3339 spin_lock_irq(&rq->lock);
3340 3340
3341 switch_count = &prev->nivcsw; 3341 switch_count = &prev->nivcsw;
3342 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3342 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3343 switch_count = &prev->nvcsw; 3343 switch_count = &prev->nvcsw;
3344 if (unlikely((prev->state & TASK_INTERRUPTIBLE) && 3344 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3345 unlikely(signal_pending(prev)))) 3345 unlikely(signal_pending(prev))))
3346 prev->state = TASK_RUNNING; 3346 prev->state = TASK_RUNNING;
3347 else { 3347 else {
3348 if (prev->state == TASK_UNINTERRUPTIBLE) 3348 if (prev->state == TASK_UNINTERRUPTIBLE)
3349 rq->nr_uninterruptible++; 3349 rq->nr_uninterruptible++;
3350 deactivate_task(prev, rq); 3350 deactivate_task(prev, rq);
3351 } 3351 }
3352 } 3352 }
3353 3353
3354 cpu = smp_processor_id(); 3354 cpu = smp_processor_id();
3355 if (unlikely(!rq->nr_running)) { 3355 if (unlikely(!rq->nr_running)) {
3356 idle_balance(cpu, rq); 3356 idle_balance(cpu, rq);
3357 if (!rq->nr_running) { 3357 if (!rq->nr_running) {
3358 next = rq->idle; 3358 next = rq->idle;
3359 rq->expired_timestamp = 0; 3359 rq->expired_timestamp = 0;
3360 goto switch_tasks; 3360 goto switch_tasks;
3361 } 3361 }
3362 } 3362 }
3363 3363
3364 array = rq->active; 3364 array = rq->active;
3365 if (unlikely(!array->nr_active)) { 3365 if (unlikely(!array->nr_active)) {
3366 /* 3366 /*
3367 * Switch the active and expired arrays. 3367 * Switch the active and expired arrays.
3368 */ 3368 */
3369 schedstat_inc(rq, sched_switch); 3369 schedstat_inc(rq, sched_switch);
3370 rq->active = rq->expired; 3370 rq->active = rq->expired;
3371 rq->expired = array; 3371 rq->expired = array;
3372 array = rq->active; 3372 array = rq->active;
3373 rq->expired_timestamp = 0; 3373 rq->expired_timestamp = 0;
3374 rq->best_expired_prio = MAX_PRIO; 3374 rq->best_expired_prio = MAX_PRIO;
3375 } 3375 }
3376 3376
3377 idx = sched_find_first_bit(array->bitmap); 3377 idx = sched_find_first_bit(array->bitmap);
3378 queue = array->queue + idx; 3378 queue = array->queue + idx;
3379 next = list_entry(queue->next, struct task_struct, run_list); 3379 next = list_entry(queue->next, struct task_struct, run_list);
3380 3380
3381 if (!rt_task(next) && interactive_sleep(next->sleep_type)) { 3381 if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
3382 unsigned long long delta = now - next->timestamp; 3382 unsigned long long delta = now - next->timestamp;
3383 if (unlikely((long long)(now - next->timestamp) < 0)) 3383 if (unlikely((long long)(now - next->timestamp) < 0))
3384 delta = 0; 3384 delta = 0;
3385 3385
3386 if (next->sleep_type == SLEEP_INTERACTIVE) 3386 if (next->sleep_type == SLEEP_INTERACTIVE)
3387 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; 3387 delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
3388 3388
3389 array = next->array; 3389 array = next->array;
3390 new_prio = recalc_task_prio(next, next->timestamp + delta); 3390 new_prio = recalc_task_prio(next, next->timestamp + delta);
3391 3391
3392 if (unlikely(next->prio != new_prio)) { 3392 if (unlikely(next->prio != new_prio)) {
3393 dequeue_task(next, array); 3393 dequeue_task(next, array);
3394 next->prio = new_prio; 3394 next->prio = new_prio;
3395 enqueue_task(next, array); 3395 enqueue_task(next, array);
3396 } 3396 }
3397 } 3397 }
3398 next->sleep_type = SLEEP_NORMAL; 3398 next->sleep_type = SLEEP_NORMAL;
3399 switch_tasks: 3399 switch_tasks:
3400 if (next == rq->idle) 3400 if (next == rq->idle)
3401 schedstat_inc(rq, sched_goidle); 3401 schedstat_inc(rq, sched_goidle);
3402 prefetch(next); 3402 prefetch(next);
3403 prefetch_stack(next); 3403 prefetch_stack(next);
3404 clear_tsk_need_resched(prev); 3404 clear_tsk_need_resched(prev);
3405 rcu_qsctr_inc(task_cpu(prev)); 3405 rcu_qsctr_inc(task_cpu(prev));
3406 3406
3407 update_cpu_clock(prev, rq, now); 3407 update_cpu_clock(prev, rq, now);
3408 3408
3409 prev->sleep_avg -= run_time; 3409 prev->sleep_avg -= run_time;
3410 if ((long)prev->sleep_avg <= 0) 3410 if ((long)prev->sleep_avg <= 0)
3411 prev->sleep_avg = 0; 3411 prev->sleep_avg = 0;
3412 prev->timestamp = prev->last_ran = now; 3412 prev->timestamp = prev->last_ran = now;
3413 3413
3414 sched_info_switch(prev, next); 3414 sched_info_switch(prev, next);
3415 if (likely(prev != next)) { 3415 if (likely(prev != next)) {
3416 next->timestamp = next->last_ran = now; 3416 next->timestamp = next->last_ran = now;
3417 rq->nr_switches++; 3417 rq->nr_switches++;
3418 rq->curr = next; 3418 rq->curr = next;
3419 ++*switch_count; 3419 ++*switch_count;
3420 3420
3421 prepare_task_switch(rq, next); 3421 prepare_task_switch(rq, next);
3422 prev = context_switch(rq, prev, next); 3422 prev = context_switch(rq, prev, next);
3423 barrier(); 3423 barrier();
3424 /* 3424 /*
3425 * this_rq must be evaluated again because prev may have moved 3425 * this_rq must be evaluated again because prev may have moved
3426 * CPUs since it called schedule(), thus the 'rq' on its stack 3426 * CPUs since it called schedule(), thus the 'rq' on its stack
3427 * frame will be invalid. 3427 * frame will be invalid.
3428 */ 3428 */
3429 finish_task_switch(this_rq(), prev); 3429 finish_task_switch(this_rq(), prev);
3430 } else 3430 } else
3431 spin_unlock_irq(&rq->lock); 3431 spin_unlock_irq(&rq->lock);
3432 3432
3433 prev = current; 3433 prev = current;
3434 if (unlikely(reacquire_kernel_lock(prev) < 0)) 3434 if (unlikely(reacquire_kernel_lock(prev) < 0))
3435 goto need_resched_nonpreemptible; 3435 goto need_resched_nonpreemptible;
3436 preempt_enable_no_resched(); 3436 preempt_enable_no_resched();
3437 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3437 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3438 goto need_resched; 3438 goto need_resched;
3439 } 3439 }
3440 EXPORT_SYMBOL(schedule); 3440 EXPORT_SYMBOL(schedule);
3441 3441
3442 #ifdef CONFIG_PREEMPT 3442 #ifdef CONFIG_PREEMPT
3443 /* 3443 /*
3444 * this is the entry point to schedule() from in-kernel preemption 3444 * this is the entry point to schedule() from in-kernel preemption
3445 * off of preempt_enable. Kernel preemptions off the return-from-interrupt 3445 * off of preempt_enable. Kernel preemptions off the return-from-interrupt
3446 * path occur in preempt_schedule_irq() below, which calls schedule() directly. 3446 * path occur in preempt_schedule_irq() below, which calls schedule() directly.
3447 */ 3447 */
3448 asmlinkage void __sched preempt_schedule(void) 3448 asmlinkage void __sched preempt_schedule(void)
3449 { 3449 {
3450 struct thread_info *ti = current_thread_info(); 3450 struct thread_info *ti = current_thread_info();
3451 #ifdef CONFIG_PREEMPT_BKL 3451 #ifdef CONFIG_PREEMPT_BKL
3452 struct task_struct *task = current; 3452 struct task_struct *task = current;
3453 int saved_lock_depth; 3453 int saved_lock_depth;
3454 #endif 3454 #endif
3455 /* 3455 /*
3456 * If there is a non-zero preempt_count or interrupts are disabled, 3456 * If there is a non-zero preempt_count or interrupts are disabled,
3457 * we do not want to preempt the current task. Just return.. 3457 * we do not want to preempt the current task. Just return..
3458 */ 3458 */
3459 if (likely(ti->preempt_count || irqs_disabled())) 3459 if (likely(ti->preempt_count || irqs_disabled()))
3460 return; 3460 return;
3461 3461
3462 need_resched: 3462 need_resched:
3463 add_preempt_count(PREEMPT_ACTIVE); 3463 add_preempt_count(PREEMPT_ACTIVE);
3464 /* 3464 /*
3465 * We keep the big kernel semaphore locked, but we 3465 * We keep the big kernel semaphore locked, but we
3466 * clear ->lock_depth so that schedule() doesn't 3466 * clear ->lock_depth so that schedule() doesn't
3467 * auto-release the semaphore: 3467 * auto-release the semaphore:
3468 */ 3468 */
3469 #ifdef CONFIG_PREEMPT_BKL 3469 #ifdef CONFIG_PREEMPT_BKL
3470 saved_lock_depth = task->lock_depth; 3470 saved_lock_depth = task->lock_depth;
3471 task->lock_depth = -1; 3471 task->lock_depth = -1;
3472 #endif 3472 #endif
3473 schedule(); 3473 schedule();
3474 #ifdef CONFIG_PREEMPT_BKL 3474 #ifdef CONFIG_PREEMPT_BKL
3475 task->lock_depth = saved_lock_depth; 3475 task->lock_depth = saved_lock_depth;
3476 #endif 3476 #endif
3477 sub_preempt_count(PREEMPT_ACTIVE); 3477 sub_preempt_count(PREEMPT_ACTIVE);
3478 3478
3479 /* we could miss a preemption opportunity between schedule and now */ 3479 /* we could miss a preemption opportunity between schedule and now */
3480 barrier(); 3480 barrier();
3481 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3481 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3482 goto need_resched; 3482 goto need_resched;
3483 } 3483 }
3484 EXPORT_SYMBOL(preempt_schedule); 3484 EXPORT_SYMBOL(preempt_schedule);
3485 3485
3486 /* 3486 /*
3487 * this is the entry point to schedule() from kernel preemption 3487 * this is the entry point to schedule() from kernel preemption
3488 * off of irq context. 3488 * off of irq context.
3489 * Note that this is called and returns with irqs disabled. This will 3489 * Note that this is called and returns with irqs disabled. This will
3490 * protect us against recursive calling from irq. 3490 * protect us against recursive calling from irq.
3491 */ 3491 */
3492 asmlinkage void __sched preempt_schedule_irq(void) 3492 asmlinkage void __sched preempt_schedule_irq(void)
3493 { 3493 {
3494 struct thread_info *ti = current_thread_info(); 3494 struct thread_info *ti = current_thread_info();
3495 #ifdef CONFIG_PREEMPT_BKL 3495 #ifdef CONFIG_PREEMPT_BKL
3496 struct task_struct *task = current; 3496 struct task_struct *task = current;
3497 int saved_lock_depth; 3497 int saved_lock_depth;
3498 #endif 3498 #endif
3499 /* Catch callers which need to be fixed */ 3499 /* Catch callers which need to be fixed */
3500 BUG_ON(ti->preempt_count || !irqs_disabled()); 3500 BUG_ON(ti->preempt_count || !irqs_disabled());
3501 3501
3502 need_resched: 3502 need_resched:
3503 add_preempt_count(PREEMPT_ACTIVE); 3503 add_preempt_count(PREEMPT_ACTIVE);
3504 /* 3504 /*
3505 * We keep the big kernel semaphore locked, but we 3505 * We keep the big kernel semaphore locked, but we
3506 * clear ->lock_depth so that schedule() doesn't 3506 * clear ->lock_depth so that schedule() doesn't
3507 * auto-release the semaphore: 3507 * auto-release the semaphore:
3508 */ 3508 */
3509 #ifdef CONFIG_PREEMPT_BKL 3509 #ifdef CONFIG_PREEMPT_BKL
3510 saved_lock_depth = task->lock_depth; 3510 saved_lock_depth = task->lock_depth;
3511 task->lock_depth = -1; 3511 task->lock_depth = -1;
3512 #endif 3512 #endif
3513 local_irq_enable(); 3513 local_irq_enable();
3514 schedule(); 3514 schedule();
3515 local_irq_disable(); 3515 local_irq_disable();
3516 #ifdef CONFIG_PREEMPT_BKL 3516 #ifdef CONFIG_PREEMPT_BKL
3517 task->lock_depth = saved_lock_depth; 3517 task->lock_depth = saved_lock_depth;
3518 #endif 3518 #endif
3519 sub_preempt_count(PREEMPT_ACTIVE); 3519 sub_preempt_count(PREEMPT_ACTIVE);
3520 3520
3521 /* we could miss a preemption opportunity between schedule and now */ 3521 /* we could miss a preemption opportunity between schedule and now */
3522 barrier(); 3522 barrier();
3523 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3523 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3524 goto need_resched; 3524 goto need_resched;
3525 } 3525 }
3526 3526
3527 #endif /* CONFIG_PREEMPT */ 3527 #endif /* CONFIG_PREEMPT */
3528 3528
3529 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, 3529 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3530 void *key) 3530 void *key)
3531 { 3531 {
3532 return try_to_wake_up(curr->private, mode, sync); 3532 return try_to_wake_up(curr->private, mode, sync);
3533 } 3533 }
3534 EXPORT_SYMBOL(default_wake_function); 3534 EXPORT_SYMBOL(default_wake_function);
3535 3535
3536 /* 3536 /*
3537 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 3537 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3538 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 3538 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3539 * number) then we wake all the non-exclusive tasks and at most nr_exclusive exclusive tasks. 3539 * number) then we wake all the non-exclusive tasks and at most nr_exclusive exclusive tasks.
3540 * 3540 *
3541 * There are circumstances in which we can try to wake a task which has already 3541 * There are circumstances in which we can try to wake a task which has already
3542 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 3542 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3543 * zero in this (rare) case, and we handle it by continuing to scan the queue. 3543 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3544 */ 3544 */
3545 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 3545 static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3546 int nr_exclusive, int sync, void *key) 3546 int nr_exclusive, int sync, void *key)
3547 { 3547 {
3548 struct list_head *tmp, *next; 3548 struct list_head *tmp, *next;
3549 3549
3550 list_for_each_safe(tmp, next, &q->task_list) { 3550 list_for_each_safe(tmp, next, &q->task_list) {
3551 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); 3551 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3552 unsigned flags = curr->flags; 3552 unsigned flags = curr->flags;
3553 3553
3554 if (curr->func(curr, mode, sync, key) && 3554 if (curr->func(curr, mode, sync, key) &&
3555 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 3555 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3556 break; 3556 break;
3557 } 3557 }
3558 } 3558 }
3559 3559
3560 /** 3560 /**
3561 * __wake_up - wake up threads blocked on a waitqueue. 3561 * __wake_up - wake up threads blocked on a waitqueue.
3562 * @q: the waitqueue 3562 * @q: the waitqueue
3563 * @mode: which threads 3563 * @mode: which threads
3564 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3564 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3565 * @key: is directly passed to the wakeup function 3565 * @key: is directly passed to the wakeup function
3566 */ 3566 */
3567 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, 3567 void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3568 int nr_exclusive, void *key) 3568 int nr_exclusive, void *key)
3569 { 3569 {
3570 unsigned long flags; 3570 unsigned long flags;
3571 3571
3572 spin_lock_irqsave(&q->lock, flags); 3572 spin_lock_irqsave(&q->lock, flags);
3573 __wake_up_common(q, mode, nr_exclusive, 0, key); 3573 __wake_up_common(q, mode, nr_exclusive, 0, key);
3574 spin_unlock_irqrestore(&q->lock, flags); 3574 spin_unlock_irqrestore(&q->lock, flags);
3575 } 3575 }
3576 EXPORT_SYMBOL(__wake_up); 3576 EXPORT_SYMBOL(__wake_up);
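As a usage-level sketch of the wakeup machinery above (hypothetical driver-style code, made-up demo_* names; the wait-queue calls themselves are the standard ones): a waiter blocks on a condition with wait_event_interruptible(), and the producer sets the condition and calls the wake_up() wrapper, which ends up in __wake_up()/__wake_up_common().

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);	/* hypothetical wait queue */
static int demo_ready;				/* hypothetical condition */

static int demo_wait(void)
{
	/* sleeps until demo_ready != 0, rechecking the condition on every wakeup */
	return wait_event_interruptible(demo_wq, demo_ready != 0);
}

static void demo_post(void)
{
	demo_ready = 1;
	wake_up(&demo_wq);	/* wakes all non-exclusive waiters on demo_wq */
}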
3577 3577
3578 /* 3578 /*
3579 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 3579 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3580 */ 3580 */
3581 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 3581 void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3582 { 3582 {
3583 __wake_up_common(q, mode, 1, 0, NULL); 3583 __wake_up_common(q, mode, 1, 0, NULL);
3584 } 3584 }
3585 3585
3586 /** 3586 /**
3587 * __wake_up_sync - wake up threads blocked on a waitqueue. 3587 * __wake_up_sync - wake up threads blocked on a waitqueue.
3588 * @q: the waitqueue 3588 * @q: the waitqueue
3589 * @mode: which threads 3589 * @mode: which threads
3590 * @nr_exclusive: how many wake-one or wake-many threads to wake up 3590 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3591 * 3591 *
3592 * The sync wakeup differs in that the waker knows that it will schedule 3592 * The sync wakeup differs in that the waker knows that it will schedule
3593 * away soon, so while the target thread will be woken up, it will not 3593 * away soon, so while the target thread will be woken up, it will not
3594 * be migrated to another CPU - ie. the two threads are 'synchronized' 3594 * be migrated to another CPU - ie. the two threads are 'synchronized'
3595 * with each other. This can prevent needless bouncing between CPUs. 3595 * with each other. This can prevent needless bouncing between CPUs.
3596 * 3596 *
3597 * On UP it can prevent extra preemption. 3597 * On UP it can prevent extra preemption.
3598 */ 3598 */
3599 void fastcall 3599 void fastcall
3600 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 3600 __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3601 { 3601 {
3602 unsigned long flags; 3602 unsigned long flags;
3603 int sync = 1; 3603 int sync = 1;
3604 3604
3605 if (unlikely(!q)) 3605 if (unlikely(!q))
3606 return; 3606 return;
3607 3607
3608 if (unlikely(!nr_exclusive)) 3608 if (unlikely(!nr_exclusive))
3609 sync = 0; 3609 sync = 0;
3610 3610
3611 spin_lock_irqsave(&q->lock, flags); 3611 spin_lock_irqsave(&q->lock, flags);
3612 __wake_up_common(q, mode, nr_exclusive, sync, NULL); 3612 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3613 spin_unlock_irqrestore(&q->lock, flags); 3613 spin_unlock_irqrestore(&q->lock, flags);
3614 } 3614 }
3615 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 3615 EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
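A minimal sketch of when the sync variant is worth using, assuming the wake_up_interruptible_sync() wrapper around __wake_up_sync() and the hypothetical demo_wq/demo_ready names from the sketch above: a producer that is about to sleep anyway wakes its consumer "synchronously", hinting that there is no point migrating the woken task to another CPU.

static void demo_handoff_and_sleep(void)
{
	demo_ready = 1;
	/* we are going to schedule away immediately, so ask for a sync wakeup */
	wake_up_interruptible_sync(&demo_wq);
	schedule_timeout_interruptible(HZ);
}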
3616 3616
3617 void fastcall complete(struct completion *x) 3617 void fastcall complete(struct completion *x)
3618 { 3618 {
3619 unsigned long flags; 3619 unsigned long flags;
3620 3620
3621 spin_lock_irqsave(&x->wait.lock, flags); 3621 spin_lock_irqsave(&x->wait.lock, flags);
3622 x->done++; 3622 x->done++;
3623 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 3623 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3624 1, 0, NULL); 3624 1, 0, NULL);
3625 spin_unlock_irqrestore(&x->wait.lock, flags); 3625 spin_unlock_irqrestore(&x->wait.lock, flags);
3626 } 3626 }
3627 EXPORT_SYMBOL(complete); 3627 EXPORT_SYMBOL(complete);
3628 3628
3629 void fastcall complete_all(struct completion *x) 3629 void fastcall complete_all(struct completion *x)
3630 { 3630 {
3631 unsigned long flags; 3631 unsigned long flags;
3632 3632
3633 spin_lock_irqsave(&x->wait.lock, flags); 3633 spin_lock_irqsave(&x->wait.lock, flags);
3634 x->done += UINT_MAX/2; 3634 x->done += UINT_MAX/2;
3635 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 3635 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3636 0, 0, NULL); 3636 0, 0, NULL);
3637 spin_unlock_irqrestore(&x->wait.lock, flags); 3637 spin_unlock_irqrestore(&x->wait.lock, flags);
3638 } 3638 }
3639 EXPORT_SYMBOL(complete_all); 3639 EXPORT_SYMBOL(complete_all);
3640 3640
3641 void fastcall __sched wait_for_completion(struct completion *x) 3641 void fastcall __sched wait_for_completion(struct completion *x)
3642 { 3642 {
3643 might_sleep(); 3643 might_sleep();
3644 3644
3645 spin_lock_irq(&x->wait.lock); 3645 spin_lock_irq(&x->wait.lock);
3646 if (!x->done) { 3646 if (!x->done) {
3647 DECLARE_WAITQUEUE(wait, current); 3647 DECLARE_WAITQUEUE(wait, current);
3648 3648
3649 wait.flags |= WQ_FLAG_EXCLUSIVE; 3649 wait.flags |= WQ_FLAG_EXCLUSIVE;
3650 __add_wait_queue_tail(&x->wait, &wait); 3650 __add_wait_queue_tail(&x->wait, &wait);
3651 do { 3651 do {
3652 __set_current_state(TASK_UNINTERRUPTIBLE); 3652 __set_current_state(TASK_UNINTERRUPTIBLE);
3653 spin_unlock_irq(&x->wait.lock); 3653 spin_unlock_irq(&x->wait.lock);
3654 schedule(); 3654 schedule();
3655 spin_lock_irq(&x->wait.lock); 3655 spin_lock_irq(&x->wait.lock);
3656 } while (!x->done); 3656 } while (!x->done);
3657 __remove_wait_queue(&x->wait, &wait); 3657 __remove_wait_queue(&x->wait, &wait);
3658 } 3658 }
3659 x->done--; 3659 x->done--;
3660 spin_unlock_irq(&x->wait.lock); 3660 spin_unlock_irq(&x->wait.lock);
3661 } 3661 }
3662 EXPORT_SYMBOL(wait_for_completion); 3662 EXPORT_SYMBOL(wait_for_completion);
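The completion helpers above are the usual tool for one-shot "wait until that other context is finished" synchronization. A minimal sketch with made-up demo_* names (the completion and kthread calls themselves are real):

#include <linux/completion.h>
#include <linux/kthread.h>

static DECLARE_COMPLETION(demo_done);		/* hypothetical */

static int demo_worker(void *unused)
{
	/* ... do the actual work ... */
	complete(&demo_done);			/* wakes one exclusive waiter, see complete() above */
	return 0;
}

static void demo_start_and_wait(void)
{
	kthread_run(demo_worker, NULL, "demo_worker");
	wait_for_completion(&demo_done);	/* uninterruptible wait until complete() runs */
}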
3663 3663
3664 unsigned long fastcall __sched 3664 unsigned long fastcall __sched
3665 wait_for_completion_timeout(struct completion *x, unsigned long timeout) 3665 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3666 { 3666 {
3667 might_sleep(); 3667 might_sleep();
3668 3668
3669 spin_lock_irq(&x->wait.lock); 3669 spin_lock_irq(&x->wait.lock);
3670 if (!x->done) { 3670 if (!x->done) {
3671 DECLARE_WAITQUEUE(wait, current); 3671 DECLARE_WAITQUEUE(wait, current);
3672 3672
3673 wait.flags |= WQ_FLAG_EXCLUSIVE; 3673 wait.flags |= WQ_FLAG_EXCLUSIVE;
3674 __add_wait_queue_tail(&x->wait, &wait); 3674 __add_wait_queue_tail(&x->wait, &wait);
3675 do { 3675 do {
3676 __set_current_state(TASK_UNINTERRUPTIBLE); 3676 __set_current_state(TASK_UNINTERRUPTIBLE);
3677 spin_unlock_irq(&x->wait.lock); 3677 spin_unlock_irq(&x->wait.lock);
3678 timeout = schedule_timeout(timeout); 3678 timeout = schedule_timeout(timeout);
3679 spin_lock_irq(&x->wait.lock); 3679 spin_lock_irq(&x->wait.lock);
3680 if (!timeout) { 3680 if (!timeout) {
3681 __remove_wait_queue(&x->wait, &wait); 3681 __remove_wait_queue(&x->wait, &wait);
3682 goto out; 3682 goto out;
3683 } 3683 }
3684 } while (!x->done); 3684 } while (!x->done);
3685 __remove_wait_queue(&x->wait, &wait); 3685 __remove_wait_queue(&x->wait, &wait);
3686 } 3686 }
3687 x->done--; 3687 x->done--;
3688 out: 3688 out:
3689 spin_unlock_irq(&x->wait.lock); 3689 spin_unlock_irq(&x->wait.lock);
3690 return timeout; 3690 return timeout;
3691 } 3691 }
3692 EXPORT_SYMBOL(wait_for_completion_timeout); 3692 EXPORT_SYMBOL(wait_for_completion_timeout);
3693 3693
3694 int fastcall __sched wait_for_completion_interruptible(struct completion *x) 3694 int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3695 { 3695 {
3696 int ret = 0; 3696 int ret = 0;
3697 3697
3698 might_sleep(); 3698 might_sleep();
3699 3699
3700 spin_lock_irq(&x->wait.lock); 3700 spin_lock_irq(&x->wait.lock);
3701 if (!x->done) { 3701 if (!x->done) {
3702 DECLARE_WAITQUEUE(wait, current); 3702 DECLARE_WAITQUEUE(wait, current);
3703 3703
3704 wait.flags |= WQ_FLAG_EXCLUSIVE; 3704 wait.flags |= WQ_FLAG_EXCLUSIVE;
3705 __add_wait_queue_tail(&x->wait, &wait); 3705 __add_wait_queue_tail(&x->wait, &wait);
3706 do { 3706 do {
3707 if (signal_pending(current)) { 3707 if (signal_pending(current)) {
3708 ret = -ERESTARTSYS; 3708 ret = -ERESTARTSYS;
3709 __remove_wait_queue(&x->wait, &wait); 3709 __remove_wait_queue(&x->wait, &wait);
3710 goto out; 3710 goto out;
3711 } 3711 }
3712 __set_current_state(TASK_INTERRUPTIBLE); 3712 __set_current_state(TASK_INTERRUPTIBLE);
3713 spin_unlock_irq(&x->wait.lock); 3713 spin_unlock_irq(&x->wait.lock);
3714 schedule(); 3714 schedule();
3715 spin_lock_irq(&x->wait.lock); 3715 spin_lock_irq(&x->wait.lock);
3716 } while (!x->done); 3716 } while (!x->done);
3717 __remove_wait_queue(&x->wait, &wait); 3717 __remove_wait_queue(&x->wait, &wait);
3718 } 3718 }
3719 x->done--; 3719 x->done--;
3720 out: 3720 out:
3721 spin_unlock_irq(&x->wait.lock); 3721 spin_unlock_irq(&x->wait.lock);
3722 3722
3723 return ret; 3723 return ret;
3724 } 3724 }
3725 EXPORT_SYMBOL(wait_for_completion_interruptible); 3725 EXPORT_SYMBOL(wait_for_completion_interruptible);
3726 3726
3727 unsigned long fastcall __sched 3727 unsigned long fastcall __sched
3728 wait_for_completion_interruptible_timeout(struct completion *x, 3728 wait_for_completion_interruptible_timeout(struct completion *x,
3729 unsigned long timeout) 3729 unsigned long timeout)
3730 { 3730 {
3731 might_sleep(); 3731 might_sleep();
3732 3732
3733 spin_lock_irq(&x->wait.lock); 3733 spin_lock_irq(&x->wait.lock);
3734 if (!x->done) { 3734 if (!x->done) {
3735 DECLARE_WAITQUEUE(wait, current); 3735 DECLARE_WAITQUEUE(wait, current);
3736 3736
3737 wait.flags |= WQ_FLAG_EXCLUSIVE; 3737 wait.flags |= WQ_FLAG_EXCLUSIVE;
3738 __add_wait_queue_tail(&x->wait, &wait); 3738 __add_wait_queue_tail(&x->wait, &wait);
3739 do { 3739 do {
3740 if (signal_pending(current)) { 3740 if (signal_pending(current)) {
3741 timeout = -ERESTARTSYS; 3741 timeout = -ERESTARTSYS;
3742 __remove_wait_queue(&x->wait, &wait); 3742 __remove_wait_queue(&x->wait, &wait);
3743 goto out; 3743 goto out;
3744 } 3744 }
3745 __set_current_state(TASK_INTERRUPTIBLE); 3745 __set_current_state(TASK_INTERRUPTIBLE);
3746 spin_unlock_irq(&x->wait.lock); 3746 spin_unlock_irq(&x->wait.lock);
3747 timeout = schedule_timeout(timeout); 3747 timeout = schedule_timeout(timeout);
3748 spin_lock_irq(&x->wait.lock); 3748 spin_lock_irq(&x->wait.lock);
3749 if (!timeout) { 3749 if (!timeout) {
3750 __remove_wait_queue(&x->wait, &wait); 3750 __remove_wait_queue(&x->wait, &wait);
3751 goto out; 3751 goto out;
3752 } 3752 }
3753 } while (!x->done); 3753 } while (!x->done);
3754 __remove_wait_queue(&x->wait, &wait); 3754 __remove_wait_queue(&x->wait, &wait);
3755 } 3755 }
3756 x->done--; 3756 x->done--;
3757 out: 3757 out:
3758 spin_unlock_irq(&x->wait.lock); 3758 spin_unlock_irq(&x->wait.lock);
3759 return timeout; 3759 return timeout;
3760 } 3760 }
3761 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 3761 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
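Interpreting the return value of the interruptible/timeout variants is a common source of bugs, so here is a hedged sketch of the convention visible in the code above (hypothetical caller, reusing the demo_done completion from the previous sketch): 0 means the timeout expired, a negative value is -ERESTARTSYS from a pending signal, and anything else is the number of jiffies that were left.

static void demo_wait_with_timeout(void)
{
	long ret = wait_for_completion_interruptible_timeout(&demo_done, HZ);

	if (ret == 0) {
		/* timed out: complete() was not called within roughly one second */
	} else if (ret < 0) {
		/* -ERESTARTSYS: a signal interrupted the wait */
	} else {
		/* completed with 'ret' jiffies of the timeout still remaining */
	}
}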
3762 3762
3763 3763
3764 #define SLEEP_ON_VAR \ 3764 #define SLEEP_ON_VAR \
3765 unsigned long flags; \ 3765 unsigned long flags; \
3766 wait_queue_t wait; \ 3766 wait_queue_t wait; \
3767 init_waitqueue_entry(&wait, current); 3767 init_waitqueue_entry(&wait, current);
3768 3768
3769 #define SLEEP_ON_HEAD \ 3769 #define SLEEP_ON_HEAD \
3770 spin_lock_irqsave(&q->lock,flags); \ 3770 spin_lock_irqsave(&q->lock,flags); \
3771 __add_wait_queue(q, &wait); \ 3771 __add_wait_queue(q, &wait); \
3772 spin_unlock(&q->lock); 3772 spin_unlock(&q->lock);
3773 3773
3774 #define SLEEP_ON_TAIL \ 3774 #define SLEEP_ON_TAIL \
3775 spin_lock_irq(&q->lock); \ 3775 spin_lock_irq(&q->lock); \
3776 __remove_wait_queue(q, &wait); \ 3776 __remove_wait_queue(q, &wait); \
3777 spin_unlock_irqrestore(&q->lock, flags); 3777 spin_unlock_irqrestore(&q->lock, flags);
3778 3778
3779 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) 3779 void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3780 { 3780 {
3781 SLEEP_ON_VAR 3781 SLEEP_ON_VAR
3782 3782
3783 current->state = TASK_INTERRUPTIBLE; 3783 current->state = TASK_INTERRUPTIBLE;
3784 3784
3785 SLEEP_ON_HEAD 3785 SLEEP_ON_HEAD
3786 schedule(); 3786 schedule();
3787 SLEEP_ON_TAIL 3787 SLEEP_ON_TAIL
3788 } 3788 }
3789 EXPORT_SYMBOL(interruptible_sleep_on); 3789 EXPORT_SYMBOL(interruptible_sleep_on);
3790 3790
3791 long fastcall __sched 3791 long fastcall __sched
3792 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3792 interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3793 { 3793 {
3794 SLEEP_ON_VAR 3794 SLEEP_ON_VAR
3795 3795
3796 current->state = TASK_INTERRUPTIBLE; 3796 current->state = TASK_INTERRUPTIBLE;
3797 3797
3798 SLEEP_ON_HEAD 3798 SLEEP_ON_HEAD
3799 timeout = schedule_timeout(timeout); 3799 timeout = schedule_timeout(timeout);
3800 SLEEP_ON_TAIL 3800 SLEEP_ON_TAIL
3801 3801
3802 return timeout; 3802 return timeout;
3803 } 3803 }
3804 EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3804 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3805 3805
3806 void fastcall __sched sleep_on(wait_queue_head_t *q) 3806 void fastcall __sched sleep_on(wait_queue_head_t *q)
3807 { 3807 {
3808 SLEEP_ON_VAR 3808 SLEEP_ON_VAR
3809 3809
3810 current->state = TASK_UNINTERRUPTIBLE; 3810 current->state = TASK_UNINTERRUPTIBLE;
3811 3811
3812 SLEEP_ON_HEAD 3812 SLEEP_ON_HEAD
3813 schedule(); 3813 schedule();
3814 SLEEP_ON_TAIL 3814 SLEEP_ON_TAIL
3815 } 3815 }
3816 EXPORT_SYMBOL(sleep_on); 3816 EXPORT_SYMBOL(sleep_on);
3817 3817
3818 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3818 long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3819 { 3819 {
3820 SLEEP_ON_VAR 3820 SLEEP_ON_VAR
3821 3821
3822 current->state = TASK_UNINTERRUPTIBLE; 3822 current->state = TASK_UNINTERRUPTIBLE;
3823 3823
3824 SLEEP_ON_HEAD 3824 SLEEP_ON_HEAD
3825 timeout = schedule_timeout(timeout); 3825 timeout = schedule_timeout(timeout);
3826 SLEEP_ON_TAIL 3826 SLEEP_ON_TAIL
3827 3827
3828 return timeout; 3828 return timeout;
3829 } 3829 }
3830 3830
3831 EXPORT_SYMBOL(sleep_on_timeout); 3831 EXPORT_SYMBOL(sleep_on_timeout);
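The sleep_on() family above takes no condition, so a wakeup that arrives between checking a flag and calling sleep_on() is simply lost; that race is why these helpers are generally avoided in favor of the wait_event*() macros. A hedged sketch of the two patterns, reusing the hypothetical demo_wq/demo_ready names:

static void demo_wait_racy(void)
{
	if (!demo_ready)		/* racy: a wakeup can slip in right here */
		sleep_on(&demo_wq);
}

static void demo_wait_safe(void)
{
	/* preferred: the condition is re-tested under the waitqueue machinery */
	wait_event(demo_wq, demo_ready != 0);
}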
3832 3832
3833 #ifdef CONFIG_RT_MUTEXES 3833 #ifdef CONFIG_RT_MUTEXES
3834 3834
3835 /* 3835 /*
3836 * rt_mutex_setprio - set the current priority of a task 3836 * rt_mutex_setprio - set the current priority of a task
3837 * @p: task 3837 * @p: task
3838 * @prio: prio value (kernel-internal form) 3838 * @prio: prio value (kernel-internal form)
3839 * 3839 *
3840 * This function changes the 'effective' priority of a task. It does 3840 * This function changes the 'effective' priority of a task. It does
3841 * not touch ->normal_prio like __setscheduler(). 3841 * not touch ->normal_prio like __setscheduler().
3842 * 3842 *
3843 * Used by the rt_mutex code to implement priority inheritance logic. 3843 * Used by the rt_mutex code to implement priority inheritance logic.
3844 */ 3844 */
3845 void rt_mutex_setprio(struct task_struct *p, int prio) 3845 void rt_mutex_setprio(struct task_struct *p, int prio)
3846 { 3846 {
3847 struct prio_array *array; 3847 struct prio_array *array;
3848 unsigned long flags; 3848 unsigned long flags;
3849 struct rq *rq; 3849 struct rq *rq;
3850 int oldprio; 3850 int oldprio;
3851 3851
3852 BUG_ON(prio < 0 || prio > MAX_PRIO); 3852 BUG_ON(prio < 0 || prio > MAX_PRIO);
3853 3853
3854 rq = task_rq_lock(p, &flags); 3854 rq = task_rq_lock(p, &flags);
3855 3855
3856 oldprio = p->prio; 3856 oldprio = p->prio;
3857 array = p->array; 3857 array = p->array;
3858 if (array) 3858 if (array)
3859 dequeue_task(p, array); 3859 dequeue_task(p, array);
3860 p->prio = prio; 3860 p->prio = prio;
3861 3861
3862 if (array) { 3862 if (array) {
3863 /* 3863 /*
3864 * If changing to an RT priority then queue it 3864 * If changing to an RT priority then queue it
3865 * in the active array! 3865 * in the active array!
3866 */ 3866 */
3867 if (rt_task(p)) 3867 if (rt_task(p))
3868 array = rq->active; 3868 array = rq->active;
3869 enqueue_task(p, array); 3869 enqueue_task(p, array);
3870 /* 3870 /*
3871 * Reschedule if we are currently running on this runqueue and 3871 * Reschedule if we are currently running on this runqueue and
3872 * our priority decreased, or if we are not currently running on 3872 * our priority decreased, or if we are not currently running on
3873 * this runqueue and our priority is higher than the current's 3873 * this runqueue and our priority is higher than the current's
3874 */ 3874 */
3875 if (task_running(rq, p)) { 3875 if (task_running(rq, p)) {
3876 if (p->prio > oldprio) 3876 if (p->prio > oldprio)
3877 resched_task(rq->curr); 3877 resched_task(rq->curr);
3878 } else if (TASK_PREEMPTS_CURR(p, rq)) 3878 } else if (TASK_PREEMPTS_CURR(p, rq))
3879 resched_task(rq->curr); 3879 resched_task(rq->curr);
3880 } 3880 }
3881 task_rq_unlock(rq, &flags); 3881 task_rq_unlock(rq, &flags);
3882 } 3882 }
3883 3883
3884 #endif 3884 #endif
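A short sketch of where the hook above is exercised, assuming CONFIG_RT_MUTEXES and a hypothetical DEFINE_RT_MUTEX(demo_lock): if a higher-priority task blocks on the mutex, the rt_mutex code uses rt_mutex_setprio() to boost the owner for the duration of the critical section and drops the boost again on unlock.

#include <linux/rtmutex.h>

static DEFINE_RT_MUTEX(demo_lock);	/* hypothetical */

static void demo_pi_section(void)
{
	rt_mutex_lock(&demo_lock);
	/*
	 * If a higher-priority task blocks on demo_lock here, the rt_mutex
	 * code boosts us via rt_mutex_setprio(); the boost is undone when
	 * rt_mutex_unlock() hands the lock over.
	 */
	rt_mutex_unlock(&demo_lock);
}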
3885 3885
3886 void set_user_nice(struct task_struct *p, long nice) 3886 void set_user_nice(struct task_struct *p, long nice)
3887 { 3887 {
3888 struct prio_array *array; 3888 struct prio_array *array;
3889 int old_prio, delta; 3889 int old_prio, delta;
3890 unsigned long flags; 3890 unsigned long flags;
3891 struct rq *rq; 3891 struct rq *rq;
3892 3892
3893 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3893 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3894 return; 3894 return;
3895 /* 3895 /*
3896 * We have to be careful, if called from sys_setpriority(), 3896 * We have to be careful, if called from sys_setpriority(),
3897 * the task might be in the middle of scheduling on another CPU. 3897 * the task might be in the middle of scheduling on another CPU.
3898 */ 3898 */
3899 rq = task_rq_lock(p, &flags); 3899 rq = task_rq_lock(p, &flags);
3900 /* 3900 /*
3901 * The RT priorities are set via sched_setscheduler(), but we still 3901 * The RT priorities are set via sched_setscheduler(), but we still
3902 * allow the 'normal' nice value to be set - but as expected 3902 * allow the 'normal' nice value to be set - but as expected
3903 * it won't have any effect on scheduling until the task goes 3903 * it won't have any effect on scheduling until the task goes
3904 * back to SCHED_NORMAL/SCHED_BATCH: 3904 * back to SCHED_NORMAL/SCHED_BATCH:
3905 */ 3905 */
3906 if (has_rt_policy(p)) { 3906 if (has_rt_policy(p)) {
3907 p->static_prio = NICE_TO_PRIO(nice); 3907 p->static_prio = NICE_TO_PRIO(nice);
3908 goto out_unlock; 3908 goto out_unlock;
3909 } 3909 }
3910 array = p->array; 3910 array = p->array;
3911 if (array) { 3911 if (array) {
3912 dequeue_task(p, array); 3912 dequeue_task(p, array);
3913 dec_raw_weighted_load(rq, p); 3913 dec_raw_weighted_load(rq, p);
3914 } 3914 }
3915 3915
3916 p->static_prio = NICE_TO_PRIO(nice); 3916 p->static_prio = NICE_TO_PRIO(nice);
3917 set_load_weight(p); 3917 set_load_weight(p);
3918 old_prio = p->prio; 3918 old_prio = p->prio;
3919 p->prio = effective_prio(p); 3919 p->prio = effective_prio(p);
3920 delta = p->prio - old_prio; 3920 delta = p->prio - old_prio;
3921 3921
3922 if (array) { 3922 if (array) {
3923 enqueue_task(p, array); 3923 enqueue_task(p, array);
3924 inc_raw_weighted_load(rq, p); 3924 inc_raw_weighted_load(rq, p);
3925 /* 3925 /*
3926 * If the task increased its priority or is running and 3926 * If the task increased its priority or is running and
3927 * lowered its priority, then reschedule its CPU: 3927 * lowered its priority, then reschedule its CPU:
3928 */ 3928 */
3929 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3929 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3930 resched_task(rq->curr); 3930 resched_task(rq->curr);
3931 } 3931 }
3932 out_unlock: 3932 out_unlock:
3933 task_rq_unlock(rq, &flags); 3933 task_rq_unlock(rq, &flags);
3934 } 3934 }
3935 EXPORT_SYMBOL(set_user_nice); 3935 EXPORT_SYMBOL(set_user_nice);
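set_user_nice() is the in-kernel way to change a task's nice value without going through the syscall path; background kernel threads commonly use it to drop themselves to nice 19. A minimal sketch (hypothetical kthread with made-up names):

#include <linux/kthread.h>
#include <linux/sched.h>

static int demo_background_thread(void *unused)
{
	set_user_nice(current, 19);	/* lowest SCHED_NORMAL priority for housekeeping work */
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}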
3936 3936
3937 /* 3937 /*
3938 * can_nice - check if a task can reduce its nice value 3938 * can_nice - check if a task can reduce its nice value
3939 * @p: task 3939 * @p: task
3940 * @nice: nice value 3940 * @nice: nice value
3941 */ 3941 */
3942 int can_nice(const struct task_struct *p, const int nice) 3942 int can_nice(const struct task_struct *p, const int nice)
3943 { 3943 {
3944 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3944 /* convert nice value [19,-20] to rlimit style value [1,40] */
3945 int nice_rlim = 20 - nice; 3945 int nice_rlim = 20 - nice;
3946 3946
3947 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 3947 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3948 capable(CAP_SYS_NICE)); 3948 capable(CAP_SYS_NICE));
3949 } 3949 }
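To make the conversion above concrete: nice 19 maps to the rlimit-style value 20 - 19 = 1 and nice -20 maps to 20 - (-20) = 40, so an RLIMIT_NICE soft limit of 30 lets an unprivileged task lower its nice value to 20 - 30 = -10 at best; going any lower additionally requires CAP_SYS_NICE.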
3950 3950
3951 #ifdef __ARCH_WANT_SYS_NICE 3951 #ifdef __ARCH_WANT_SYS_NICE
3952 3952
3953 /* 3953 /*
3954 * sys_nice - change the priority of the current process. 3954 * sys_nice - change the priority of the current process.
3955 * @increment: priority increment 3955 * @increment: priority increment
3956 * 3956 *
3957 * sys_setpriority is a more generic, but much slower function that 3957 * sys_setpriority is a more generic, but much slower function that
3958 * does similar things. 3958 * does similar things.
3959 */ 3959 */
3960 asmlinkage long sys_nice(int increment) 3960 asmlinkage long sys_nice(int increment)
3961 { 3961 {
3962 long nice, retval; 3962 long nice, retval;
3963 3963
3964 /* 3964 /*
3965 * Setpriority might change our priority at the same moment. 3965 * Setpriority might change our priority at the same moment.
3966 * We don't have to worry. Conceptually one call occurs first 3966 * We don't have to worry. Conceptually one call occurs first
3967 * and we have a single winner. 3967 * and we have a single winner.
3968 */ 3968 */
3969 if (increment < -40) 3969 if (increment < -40)
3970 increment = -40; 3970 increment = -40;
3971 if (increment > 40) 3971 if (increment > 40)
3972 increment = 40; 3972 increment = 40;
3973 3973
3974 nice = PRIO_TO_NICE(current->static_prio) + increment; 3974 nice = PRIO_TO_NICE(current->static_prio) + increment;
3975 if (nice < -20) 3975 if (nice < -20)
3976 nice = -20; 3976 nice = -20;
3977 if (nice > 19) 3977 if (nice > 19)
3978 nice = 19; 3978 nice = 19;
3979 3979
3980 if (increment < 0 && !can_nice(current, nice)) 3980 if (increment < 0 && !can_nice(current, nice))
3981 return -EPERM; 3981 return -EPERM;
3982 3982
3983 retval = security_task_setnice(current, nice); 3983 retval = security_task_setnice(current, nice);
3984 if (retval) 3984 if (retval)
3985 return retval; 3985 return retval;
3986 3986
3987 set_user_nice(current, nice); 3987 set_user_nice(current, nice);
3988 return 0; 3988 return 0;
3989 } 3989 }
3990 3990
3991 #endif 3991 #endif
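The double clamping in sys_nice() keeps extreme arguments harmless: for a task currently at nice 0, sys_nice(100) first clamps the increment to 40, the resulting nice value 0 + 40 = 40 is then clamped to 19, and because the increment is positive the can_nice()/CAP_SYS_NICE check is skipped; only negative increments (raising priority) need that permission check.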
3992 3992
3993 /** 3993 /**
3994 * task_prio - return the priority value of a given task. 3994 * task_prio - return the priority value of a given task.
3995 * @p: the task in question. 3995 * @p: the task in question.
3996 * 3996 *
3997 * This is the priority value as seen by users in /proc. 3997 * This is the priority value as seen by users in /proc.
3998 * RT tasks map to negative values (p->prio - MAX_RT_PRIO), normal 3998 * RT tasks map to negative values (p->prio - MAX_RT_PRIO), normal
3999 * tasks to non-negative values starting at 0. 3999 * tasks to non-negative values starting at 0.
4000 */ 4000 */
4001 int task_prio(const struct task_struct *p) 4001 int task_prio(const struct task_struct *p)
4002 { 4002 {
4003 return p->prio - MAX_RT_PRIO; 4003 return p->prio - MAX_RT_PRIO;
4004 } 4004 }
4005 4005
4006 /** 4006 /**
4007 * task_nice - return the nice value of a given task. 4007 * task_nice - return the nice value of a given task.
4008 * @p: the task in question. 4008 * @p: the task in question.
4009 */ 4009 */
4010 int task_nice(const struct task_struct *p) 4010 int task_nice(const struct task_struct *p)
4011 { 4011 {
4012 return TASK_NICE(p); 4012 return TASK_NICE(p);
4013 } 4013 }
4014 EXPORT_SYMBOL_GPL(task_nice); 4014 EXPORT_SYMBOL_GPL(task_nice);
4015 4015
4016 /** 4016 /**
4017 * idle_cpu - is a given cpu idle currently? 4017 * idle_cpu - is a given cpu idle currently?
4018 * @cpu: the processor in question. 4018 * @cpu: the processor in question.
4019 */ 4019 */
4020 int idle_cpu(int cpu) 4020 int idle_cpu(int cpu)
4021 { 4021 {
4022 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 4022 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
4023 } 4023 }
4024 4024
4025 /** 4025 /**
4026 * idle_task - return the idle task for a given cpu. 4026 * idle_task - return the idle task for a given cpu.
4027 * @cpu: the processor in question. 4027 * @cpu: the processor in question.
4028 */ 4028 */
4029 struct task_struct *idle_task(int cpu) 4029 struct task_struct *idle_task(int cpu)
4030 { 4030 {
4031 return cpu_rq(cpu)->idle; 4031 return cpu_rq(cpu)->idle;
4032 } 4032 }
4033 4033
4034 /** 4034 /**
4035 * find_process_by_pid - find a process with a matching PID value. 4035 * find_process_by_pid - find a process with a matching PID value.
4036 * @pid: the pid in question. 4036 * @pid: the pid in question.
4037 */ 4037 */
4038 static inline struct task_struct *find_process_by_pid(pid_t pid) 4038 static inline struct task_struct *find_process_by_pid(pid_t pid)
4039 { 4039 {
4040 return pid ? find_task_by_pid(pid) : current; 4040 return pid ? find_task_by_pid(pid) : current;
4041 } 4041 }
4042 4042
4043 /* Actually do priority change: must hold rq lock. */ 4043 /* Actually do priority change: must hold rq lock. */
4044 static void __setscheduler(struct task_struct *p, int policy, int prio) 4044 static void __setscheduler(struct task_struct *p, int policy, int prio)
4045 { 4045 {
4046 BUG_ON(p->array); 4046 BUG_ON(p->array);
4047 4047
4048 p->policy = policy; 4048 p->policy = policy;
4049 p->rt_priority = prio; 4049 p->rt_priority = prio;
4050 p->normal_prio = normal_prio(p); 4050 p->normal_prio = normal_prio(p);
4051 /* we are holding p->pi_lock already */ 4051 /* we are holding p->pi_lock already */
4052 p->prio = rt_mutex_getprio(p); 4052 p->prio = rt_mutex_getprio(p);
4053 /* 4053 /*
4054 * SCHED_BATCH tasks are treated as perpetual CPU hogs: 4054 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
4055 */ 4055 */
4056 if (policy == SCHED_BATCH) 4056 if (policy == SCHED_BATCH)
4057 p->sleep_avg = 0; 4057 p->sleep_avg = 0;
4058 set_load_weight(p); 4058 set_load_weight(p);
4059 } 4059 }
4060 4060
4061 /** 4061 /**
4062 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 4062 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4063 * @p: the task in question. 4063 * @p: the task in question.
4064 * @policy: new policy. 4064 * @policy: new policy.
4065 * @param: structure containing the new RT priority. 4065 * @param: structure containing the new RT priority.
4066 * 4066 *
 4067 * NOTE that the task may already be dead. 4067 * NOTE that the task may already be dead.
4068 */ 4068 */
4069 int sched_setscheduler(struct task_struct *p, int policy, 4069 int sched_setscheduler(struct task_struct *p, int policy,
4070 struct sched_param *param) 4070 struct sched_param *param)
4071 { 4071 {
4072 int retval, oldprio, oldpolicy = -1; 4072 int retval, oldprio, oldpolicy = -1;
4073 struct prio_array *array; 4073 struct prio_array *array;
4074 unsigned long flags; 4074 unsigned long flags;
4075 struct rq *rq; 4075 struct rq *rq;
4076 4076
4077 /* may grab non-irq protected spin_locks */ 4077 /* may grab non-irq protected spin_locks */
4078 BUG_ON(in_interrupt()); 4078 BUG_ON(in_interrupt());
4079 recheck: 4079 recheck:
4080 /* double check policy once rq lock held */ 4080 /* double check policy once rq lock held */
4081 if (policy < 0) 4081 if (policy < 0)
4082 policy = oldpolicy = p->policy; 4082 policy = oldpolicy = p->policy;
4083 else if (policy != SCHED_FIFO && policy != SCHED_RR && 4083 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
4084 policy != SCHED_NORMAL && policy != SCHED_BATCH) 4084 policy != SCHED_NORMAL && policy != SCHED_BATCH)
4085 return -EINVAL; 4085 return -EINVAL;
4086 /* 4086 /*
4087 * Valid priorities for SCHED_FIFO and SCHED_RR are 4087 * Valid priorities for SCHED_FIFO and SCHED_RR are
4088 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and 4088 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
4089 * SCHED_BATCH is 0. 4089 * SCHED_BATCH is 0.
4090 */ 4090 */
4091 if (param->sched_priority < 0 || 4091 if (param->sched_priority < 0 ||
4092 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 4092 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
4093 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 4093 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
4094 return -EINVAL; 4094 return -EINVAL;
4095 if (is_rt_policy(policy) != (param->sched_priority != 0)) 4095 if (is_rt_policy(policy) != (param->sched_priority != 0))
4096 return -EINVAL; 4096 return -EINVAL;
4097 4097
4098 /* 4098 /*
4099 * Allow unprivileged RT tasks to decrease priority: 4099 * Allow unprivileged RT tasks to decrease priority:
4100 */ 4100 */
4101 if (!capable(CAP_SYS_NICE)) { 4101 if (!capable(CAP_SYS_NICE)) {
4102 if (is_rt_policy(policy)) { 4102 if (is_rt_policy(policy)) {
4103 unsigned long rlim_rtprio; 4103 unsigned long rlim_rtprio;
4104 unsigned long flags; 4104 unsigned long flags;
4105 4105
4106 if (!lock_task_sighand(p, &flags)) 4106 if (!lock_task_sighand(p, &flags))
4107 return -ESRCH; 4107 return -ESRCH;
4108 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; 4108 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4109 unlock_task_sighand(p, &flags); 4109 unlock_task_sighand(p, &flags);
4110 4110
4111 /* can't set/change the rt policy */ 4111 /* can't set/change the rt policy */
4112 if (policy != p->policy && !rlim_rtprio) 4112 if (policy != p->policy && !rlim_rtprio)
4113 return -EPERM; 4113 return -EPERM;
4114 4114
4115 /* can't increase priority */ 4115 /* can't increase priority */
4116 if (param->sched_priority > p->rt_priority && 4116 if (param->sched_priority > p->rt_priority &&
4117 param->sched_priority > rlim_rtprio) 4117 param->sched_priority > rlim_rtprio)
4118 return -EPERM; 4118 return -EPERM;
4119 } 4119 }
4120 4120
4121 /* can't change other user's priorities */ 4121 /* can't change other user's priorities */
4122 if ((current->euid != p->euid) && 4122 if ((current->euid != p->euid) &&
4123 (current->euid != p->uid)) 4123 (current->euid != p->uid))
4124 return -EPERM; 4124 return -EPERM;
4125 } 4125 }
4126 4126
4127 retval = security_task_setscheduler(p, policy, param); 4127 retval = security_task_setscheduler(p, policy, param);
4128 if (retval) 4128 if (retval)
4129 return retval; 4129 return retval;
4130 /* 4130 /*
4131 * make sure no PI-waiters arrive (or leave) while we are 4131 * make sure no PI-waiters arrive (or leave) while we are
4132 * changing the priority of the task: 4132 * changing the priority of the task:
4133 */ 4133 */
4134 spin_lock_irqsave(&p->pi_lock, flags); 4134 spin_lock_irqsave(&p->pi_lock, flags);
4135 /* 4135 /*
 4136 * To be able to change p->policy safely, the appropriate 4136 * To be able to change p->policy safely, the appropriate
4137 * runqueue lock must be held. 4137 * runqueue lock must be held.
4138 */ 4138 */
4139 rq = __task_rq_lock(p); 4139 rq = __task_rq_lock(p);
4140 /* recheck policy now with rq lock held */ 4140 /* recheck policy now with rq lock held */
4141 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4141 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4142 policy = oldpolicy = -1; 4142 policy = oldpolicy = -1;
4143 __task_rq_unlock(rq); 4143 __task_rq_unlock(rq);
4144 spin_unlock_irqrestore(&p->pi_lock, flags); 4144 spin_unlock_irqrestore(&p->pi_lock, flags);
4145 goto recheck; 4145 goto recheck;
4146 } 4146 }
4147 array = p->array; 4147 array = p->array;
4148 if (array) 4148 if (array)
4149 deactivate_task(p, rq); 4149 deactivate_task(p, rq);
4150 oldprio = p->prio; 4150 oldprio = p->prio;
4151 __setscheduler(p, policy, param->sched_priority); 4151 __setscheduler(p, policy, param->sched_priority);
4152 if (array) { 4152 if (array) {
4153 __activate_task(p, rq); 4153 __activate_task(p, rq);
4154 /* 4154 /*
4155 * Reschedule if we are currently running on this runqueue and 4155 * Reschedule if we are currently running on this runqueue and
4156 * our priority decreased, or if we are not currently running on 4156 * our priority decreased, or if we are not currently running on
4157 * this runqueue and our priority is higher than the current's 4157 * this runqueue and our priority is higher than the current's
4158 */ 4158 */
4159 if (task_running(rq, p)) { 4159 if (task_running(rq, p)) {
4160 if (p->prio > oldprio) 4160 if (p->prio > oldprio)
4161 resched_task(rq->curr); 4161 resched_task(rq->curr);
4162 } else if (TASK_PREEMPTS_CURR(p, rq)) 4162 } else if (TASK_PREEMPTS_CURR(p, rq))
4163 resched_task(rq->curr); 4163 resched_task(rq->curr);
4164 } 4164 }
4165 __task_rq_unlock(rq); 4165 __task_rq_unlock(rq);
4166 spin_unlock_irqrestore(&p->pi_lock, flags); 4166 spin_unlock_irqrestore(&p->pi_lock, flags);
4167 4167
4168 rt_mutex_adjust_pi(p); 4168 rt_mutex_adjust_pi(p);
4169 4169
4170 return 0; 4170 return 0;
4171 } 4171 }
4172 EXPORT_SYMBOL_GPL(sched_setscheduler); 4172 EXPORT_SYMBOL_GPL(sched_setscheduler);
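/*
 * Illustrative in-kernel sketch, not part of this diff: a typical caller of
 * the exported sched_setscheduler() above makes a kernel thread real-time by
 * filling in a struct sched_param. The helper name and priority choice are
 * made up for the example; the declarations used are the ones already
 * available in this file.
 */
static void example_make_kthread_rt(struct task_struct *tsk)
{
	struct sched_param sp = { .sched_priority = MAX_RT_PRIO - 1 };

	sched_setscheduler(tsk, SCHED_FIFO, &sp);
}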
4173 4173
4174 static int 4174 static int
4175 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 4175 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4176 { 4176 {
4177 struct sched_param lparam; 4177 struct sched_param lparam;
4178 struct task_struct *p; 4178 struct task_struct *p;
4179 int retval; 4179 int retval;
4180 4180
4181 if (!param || pid < 0) 4181 if (!param || pid < 0)
4182 return -EINVAL; 4182 return -EINVAL;
4183 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 4183 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4184 return -EFAULT; 4184 return -EFAULT;
4185 4185
4186 rcu_read_lock(); 4186 rcu_read_lock();
4187 retval = -ESRCH; 4187 retval = -ESRCH;
4188 p = find_process_by_pid(pid); 4188 p = find_process_by_pid(pid);
4189 if (p != NULL) 4189 if (p != NULL)
4190 retval = sched_setscheduler(p, policy, &lparam); 4190 retval = sched_setscheduler(p, policy, &lparam);
4191 rcu_read_unlock(); 4191 rcu_read_unlock();
4192 4192
4193 return retval; 4193 return retval;
4194 } 4194 }
4195 4195
4196 /** 4196 /**
4197 * sys_sched_setscheduler - set/change the scheduler policy and RT priority 4197 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4198 * @pid: the pid in question. 4198 * @pid: the pid in question.
4199 * @policy: new policy. 4199 * @policy: new policy.
4200 * @param: structure containing the new RT priority. 4200 * @param: structure containing the new RT priority.
4201 */ 4201 */
4202 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, 4202 asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4203 struct sched_param __user *param) 4203 struct sched_param __user *param)
4204 { 4204 {
4205 /* negative values for policy are not valid */ 4205 /* negative values for policy are not valid */
4206 if (policy < 0) 4206 if (policy < 0)
4207 return -EINVAL; 4207 return -EINVAL;
4208 4208
4209 return do_sched_setscheduler(pid, policy, param); 4209 return do_sched_setscheduler(pid, policy, param);
4210 } 4210 }
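/*
 * Illustrative user-space sketch, not part of this diff: the syscall above is
 * what the glibc sched_setscheduler() wrapper in <sched.h> invokes. The
 * function name and priority value are made up for the example.
 */
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>

static int example_set_fifo(pid_t pid)
{
	struct sched_param sp = { .sched_priority = 10 };

	if (sched_setscheduler(pid, SCHED_FIFO, &sp) == -1) {
		perror("sched_setscheduler");
		return -1;
	}
	return 0;
}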
4211 4211
4212 /** 4212 /**
4213 * sys_sched_setparam - set/change the RT priority of a thread 4213 * sys_sched_setparam - set/change the RT priority of a thread
4214 * @pid: the pid in question. 4214 * @pid: the pid in question.
4215 * @param: structure containing the new RT priority. 4215 * @param: structure containing the new RT priority.
4216 */ 4216 */
4217 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) 4217 asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4218 { 4218 {
4219 return do_sched_setscheduler(pid, -1, param); 4219 return do_sched_setscheduler(pid, -1, param);
4220 } 4220 }
4221 4221
4222 /** 4222 /**
4223 * sys_sched_getscheduler - get the policy (scheduling class) of a thread 4223 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4224 * @pid: the pid in question. 4224 * @pid: the pid in question.
4225 */ 4225 */
4226 asmlinkage long sys_sched_getscheduler(pid_t pid) 4226 asmlinkage long sys_sched_getscheduler(pid_t pid)
4227 { 4227 {
4228 struct task_struct *p; 4228 struct task_struct *p;
4229 int retval = -EINVAL; 4229 int retval = -EINVAL;
4230 4230
4231 if (pid < 0) 4231 if (pid < 0)
4232 goto out_nounlock; 4232 goto out_nounlock;
4233 4233
4234 retval = -ESRCH; 4234 retval = -ESRCH;
4235 read_lock(&tasklist_lock); 4235 read_lock(&tasklist_lock);
4236 p = find_process_by_pid(pid); 4236 p = find_process_by_pid(pid);
4237 if (p) { 4237 if (p) {
4238 retval = security_task_getscheduler(p); 4238 retval = security_task_getscheduler(p);
4239 if (!retval) 4239 if (!retval)
4240 retval = p->policy; 4240 retval = p->policy;
4241 } 4241 }
4242 read_unlock(&tasklist_lock); 4242 read_unlock(&tasklist_lock);
4243 4243
4244 out_nounlock: 4244 out_nounlock:
4245 return retval; 4245 return retval;
4246 } 4246 }
4247 4247
4248 /** 4248 /**
 4249 * sys_sched_getparam - get the RT priority of a thread 4249 * sys_sched_getparam - get the RT priority of a thread
4250 * @pid: the pid in question. 4250 * @pid: the pid in question.
4251 * @param: structure containing the RT priority. 4251 * @param: structure containing the RT priority.
4252 */ 4252 */
4253 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) 4253 asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4254 { 4254 {
4255 struct sched_param lp; 4255 struct sched_param lp;
4256 struct task_struct *p; 4256 struct task_struct *p;
4257 int retval = -EINVAL; 4257 int retval = -EINVAL;
4258 4258
4259 if (!param || pid < 0) 4259 if (!param || pid < 0)
4260 goto out_nounlock; 4260 goto out_nounlock;
4261 4261
4262 read_lock(&tasklist_lock); 4262 read_lock(&tasklist_lock);
4263 p = find_process_by_pid(pid); 4263 p = find_process_by_pid(pid);
4264 retval = -ESRCH; 4264 retval = -ESRCH;
4265 if (!p) 4265 if (!p)
4266 goto out_unlock; 4266 goto out_unlock;
4267 4267
4268 retval = security_task_getscheduler(p); 4268 retval = security_task_getscheduler(p);
4269 if (retval) 4269 if (retval)
4270 goto out_unlock; 4270 goto out_unlock;
4271 4271
4272 lp.sched_priority = p->rt_priority; 4272 lp.sched_priority = p->rt_priority;
4273 read_unlock(&tasklist_lock); 4273 read_unlock(&tasklist_lock);
4274 4274
4275 /* 4275 /*
 4276 * This one might sleep; we cannot do it with a spinlock held ... 4276 * This one might sleep; we cannot do it with a spinlock held ...
4277 */ 4277 */
4278 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; 4278 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4279 4279
4280 out_nounlock: 4280 out_nounlock:
4281 return retval; 4281 return retval;
4282 4282
4283 out_unlock: 4283 out_unlock:
4284 read_unlock(&tasklist_lock); 4284 read_unlock(&tasklist_lock);
4285 return retval; 4285 return retval;
4286 } 4286 }
4287 4287
4288 long sched_setaffinity(pid_t pid, cpumask_t new_mask) 4288 long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4289 { 4289 {
4290 cpumask_t cpus_allowed; 4290 cpumask_t cpus_allowed;
4291 struct task_struct *p; 4291 struct task_struct *p;
4292 int retval; 4292 int retval;
4293 4293
4294 lock_cpu_hotplug(); 4294 lock_cpu_hotplug();
4295 read_lock(&tasklist_lock); 4295 read_lock(&tasklist_lock);
4296 4296
4297 p = find_process_by_pid(pid); 4297 p = find_process_by_pid(pid);
4298 if (!p) { 4298 if (!p) {
4299 read_unlock(&tasklist_lock); 4299 read_unlock(&tasklist_lock);
4300 unlock_cpu_hotplug(); 4300 unlock_cpu_hotplug();
4301 return -ESRCH; 4301 return -ESRCH;
4302 } 4302 }
4303 4303
4304 /* 4304 /*
4305 * It is not safe to call set_cpus_allowed with the 4305 * It is not safe to call set_cpus_allowed with the
4306 * tasklist_lock held. We will bump the task_struct's 4306 * tasklist_lock held. We will bump the task_struct's
4307 * usage count and then drop tasklist_lock. 4307 * usage count and then drop tasklist_lock.
4308 */ 4308 */
4309 get_task_struct(p); 4309 get_task_struct(p);
4310 read_unlock(&tasklist_lock); 4310 read_unlock(&tasklist_lock);
4311 4311
4312 retval = -EPERM; 4312 retval = -EPERM;
4313 if ((current->euid != p->euid) && (current->euid != p->uid) && 4313 if ((current->euid != p->euid) && (current->euid != p->uid) &&
4314 !capable(CAP_SYS_NICE)) 4314 !capable(CAP_SYS_NICE))
4315 goto out_unlock; 4315 goto out_unlock;
4316 4316
4317 retval = security_task_setscheduler(p, 0, NULL); 4317 retval = security_task_setscheduler(p, 0, NULL);
4318 if (retval) 4318 if (retval)
4319 goto out_unlock; 4319 goto out_unlock;
4320 4320
4321 cpus_allowed = cpuset_cpus_allowed(p); 4321 cpus_allowed = cpuset_cpus_allowed(p);
4322 cpus_and(new_mask, new_mask, cpus_allowed); 4322 cpus_and(new_mask, new_mask, cpus_allowed);
4323 retval = set_cpus_allowed(p, new_mask); 4323 retval = set_cpus_allowed(p, new_mask);
4324 4324
4325 out_unlock: 4325 out_unlock:
4326 put_task_struct(p); 4326 put_task_struct(p);
4327 unlock_cpu_hotplug(); 4327 unlock_cpu_hotplug();
4328 return retval; 4328 return retval;
4329 } 4329 }
4330 4330
4331 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, 4331 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4332 cpumask_t *new_mask) 4332 cpumask_t *new_mask)
4333 { 4333 {
4334 if (len < sizeof(cpumask_t)) { 4334 if (len < sizeof(cpumask_t)) {
4335 memset(new_mask, 0, sizeof(cpumask_t)); 4335 memset(new_mask, 0, sizeof(cpumask_t));
4336 } else if (len > sizeof(cpumask_t)) { 4336 } else if (len > sizeof(cpumask_t)) {
4337 len = sizeof(cpumask_t); 4337 len = sizeof(cpumask_t);
4338 } 4338 }
4339 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; 4339 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4340 } 4340 }
4341 4341
4342 /** 4342 /**
4343 * sys_sched_setaffinity - set the cpu affinity of a process 4343 * sys_sched_setaffinity - set the cpu affinity of a process
4344 * @pid: pid of the process 4344 * @pid: pid of the process
4345 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4345 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4346 * @user_mask_ptr: user-space pointer to the new cpu mask 4346 * @user_mask_ptr: user-space pointer to the new cpu mask
4347 */ 4347 */
4348 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, 4348 asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4349 unsigned long __user *user_mask_ptr) 4349 unsigned long __user *user_mask_ptr)
4350 { 4350 {
4351 cpumask_t new_mask; 4351 cpumask_t new_mask;
4352 int retval; 4352 int retval;
4353 4353
4354 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask); 4354 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4355 if (retval) 4355 if (retval)
4356 return retval; 4356 return retval;
4357 4357
4358 return sched_setaffinity(pid, new_mask); 4358 return sched_setaffinity(pid, new_mask);
4359 } 4359 }
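/*
 * Illustrative user-space sketch, not part of this diff: pinning the calling
 * process to CPU 0 through the glibc wrapper for the syscall above. The
 * function name is made up for the example.
 */
#define _GNU_SOURCE
#include <sched.h>

static int example_pin_to_cpu0(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);
	return sched_setaffinity(0, sizeof(set), &set);	/* pid 0 == caller */
}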
4360 4360
4361 /* 4361 /*
 4362 * Represents all CPUs present in the system 4362 * Represents all CPUs present in the system
4363 * In systems capable of hotplug, this map could dynamically grow 4363 * In systems capable of hotplug, this map could dynamically grow
 4364 * as new CPUs are detected in the system via any platform-specific 4364 * as new CPUs are detected in the system via any platform-specific
 4365 * method, such as ACPI. 4365 * method, such as ACPI.
4366 */ 4366 */
4367 4367
4368 cpumask_t cpu_present_map __read_mostly; 4368 cpumask_t cpu_present_map __read_mostly;
4369 EXPORT_SYMBOL(cpu_present_map); 4369 EXPORT_SYMBOL(cpu_present_map);
4370 4370
4371 #ifndef CONFIG_SMP 4371 #ifndef CONFIG_SMP
4372 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; 4372 cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
4373 EXPORT_SYMBOL(cpu_online_map); 4373 EXPORT_SYMBOL(cpu_online_map);
4374 4374
4375 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; 4375 cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
4376 EXPORT_SYMBOL(cpu_possible_map); 4376 EXPORT_SYMBOL(cpu_possible_map);
4377 #endif 4377 #endif
4378 4378
4379 long sched_getaffinity(pid_t pid, cpumask_t *mask) 4379 long sched_getaffinity(pid_t pid, cpumask_t *mask)
4380 { 4380 {
4381 struct task_struct *p; 4381 struct task_struct *p;
4382 int retval; 4382 int retval;
4383 4383
4384 lock_cpu_hotplug(); 4384 lock_cpu_hotplug();
4385 read_lock(&tasklist_lock); 4385 read_lock(&tasklist_lock);
4386 4386
4387 retval = -ESRCH; 4387 retval = -ESRCH;
4388 p = find_process_by_pid(pid); 4388 p = find_process_by_pid(pid);
4389 if (!p) 4389 if (!p)
4390 goto out_unlock; 4390 goto out_unlock;
4391 4391
4392 retval = security_task_getscheduler(p); 4392 retval = security_task_getscheduler(p);
4393 if (retval) 4393 if (retval)
4394 goto out_unlock; 4394 goto out_unlock;
4395 4395
4396 cpus_and(*mask, p->cpus_allowed, cpu_online_map); 4396 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
4397 4397
4398 out_unlock: 4398 out_unlock:
4399 read_unlock(&tasklist_lock); 4399 read_unlock(&tasklist_lock);
4400 unlock_cpu_hotplug(); 4400 unlock_cpu_hotplug();
4401 if (retval) 4401 if (retval)
4402 return retval; 4402 return retval;
4403 4403
4404 return 0; 4404 return 0;
4405 } 4405 }
4406 4406
4407 /** 4407 /**
4408 * sys_sched_getaffinity - get the cpu affinity of a process 4408 * sys_sched_getaffinity - get the cpu affinity of a process
4409 * @pid: pid of the process 4409 * @pid: pid of the process
4410 * @len: length in bytes of the bitmask pointed to by user_mask_ptr 4410 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4411 * @user_mask_ptr: user-space pointer to hold the current cpu mask 4411 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4412 */ 4412 */
4413 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, 4413 asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4414 unsigned long __user *user_mask_ptr) 4414 unsigned long __user *user_mask_ptr)
4415 { 4415 {
4416 int ret; 4416 int ret;
4417 cpumask_t mask; 4417 cpumask_t mask;
4418 4418
4419 if (len < sizeof(cpumask_t)) 4419 if (len < sizeof(cpumask_t))
4420 return -EINVAL; 4420 return -EINVAL;
4421 4421
4422 ret = sched_getaffinity(pid, &mask); 4422 ret = sched_getaffinity(pid, &mask);
4423 if (ret < 0) 4423 if (ret < 0)
4424 return ret; 4424 return ret;
4425 4425
4426 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t))) 4426 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4427 return -EFAULT; 4427 return -EFAULT;
4428 4428
4429 return sizeof(cpumask_t); 4429 return sizeof(cpumask_t);
4430 } 4430 }
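/*
 * Illustrative user-space sketch, not part of this diff: reading an affinity
 * mask back through the glibc wrapper for the syscall above and counting the
 * CPUs it allows. The function name is made up for the example.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>

static void example_show_affinity(pid_t pid)
{
	cpu_set_t set;
	int cpu, allowed = 0;

	if (sched_getaffinity(pid, sizeof(set), &set) == 0) {
		for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
			if (CPU_ISSET(cpu, &set))
				allowed++;
		printf("allowed CPUs: %d\n", allowed);
	}
}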
4431 4431
4432 /** 4432 /**
4433 * sys_sched_yield - yield the current processor to other threads. 4433 * sys_sched_yield - yield the current processor to other threads.
4434 * 4434 *
4435 * This function yields the current CPU by moving the calling thread 4435 * This function yields the current CPU by moving the calling thread
4436 * to the expired array. If there are no other threads running on this 4436 * to the expired array. If there are no other threads running on this
4437 * CPU then this function will return. 4437 * CPU then this function will return.
4438 */ 4438 */
4439 asmlinkage long sys_sched_yield(void) 4439 asmlinkage long sys_sched_yield(void)
4440 { 4440 {
4441 struct rq *rq = this_rq_lock(); 4441 struct rq *rq = this_rq_lock();
4442 struct prio_array *array = current->array, *target = rq->expired; 4442 struct prio_array *array = current->array, *target = rq->expired;
4443 4443
4444 schedstat_inc(rq, yld_cnt); 4444 schedstat_inc(rq, yld_cnt);
4445 /* 4445 /*
4446 * We implement yielding by moving the task into the expired 4446 * We implement yielding by moving the task into the expired
4447 * queue. 4447 * queue.
4448 * 4448 *
4449 * (special rule: RT tasks will just roundrobin in the active 4449 * (special rule: RT tasks will just roundrobin in the active
4450 * array.) 4450 * array.)
4451 */ 4451 */
4452 if (rt_task(current)) 4452 if (rt_task(current))
4453 target = rq->active; 4453 target = rq->active;
4454 4454
4455 if (array->nr_active == 1) { 4455 if (array->nr_active == 1) {
4456 schedstat_inc(rq, yld_act_empty); 4456 schedstat_inc(rq, yld_act_empty);
4457 if (!rq->expired->nr_active) 4457 if (!rq->expired->nr_active)
4458 schedstat_inc(rq, yld_both_empty); 4458 schedstat_inc(rq, yld_both_empty);
4459 } else if (!rq->expired->nr_active) 4459 } else if (!rq->expired->nr_active)
4460 schedstat_inc(rq, yld_exp_empty); 4460 schedstat_inc(rq, yld_exp_empty);
4461 4461
4462 if (array != target) { 4462 if (array != target) {
4463 dequeue_task(current, array); 4463 dequeue_task(current, array);
4464 enqueue_task(current, target); 4464 enqueue_task(current, target);
4465 } else 4465 } else
4466 /* 4466 /*
4467 * requeue_task is cheaper so perform that if possible. 4467 * requeue_task is cheaper so perform that if possible.
4468 */ 4468 */
4469 requeue_task(current, array); 4469 requeue_task(current, array);
4470 4470
4471 /* 4471 /*
4472 * Since we are going to call schedule() anyway, there's 4472 * Since we are going to call schedule() anyway, there's
4473 * no need to preempt or enable interrupts: 4473 * no need to preempt or enable interrupts:
4474 */ 4474 */
4475 __release(rq->lock); 4475 __release(rq->lock);
4476 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 4476 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4477 _raw_spin_unlock(&rq->lock); 4477 _raw_spin_unlock(&rq->lock);
4478 preempt_enable_no_resched(); 4478 preempt_enable_no_resched();
4479 4479
4480 schedule(); 4480 schedule();
4481 4481
4482 return 0; 4482 return 0;
4483 } 4483 }
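/*
 * Illustrative user-space sketch, not part of this diff: a polite busy-wait
 * that uses the sched_yield() wrapper for the syscall above to give other
 * runnable threads a turn between polls. The function name is made up.
 */
#include <sched.h>

static void example_wait_for_flag(volatile int *flag)
{
	while (!*flag)
		sched_yield();
}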
4484 4484
4485 static void __cond_resched(void) 4485 static void __cond_resched(void)
4486 { 4486 {
4487 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 4487 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4488 __might_sleep(__FILE__, __LINE__); 4488 __might_sleep(__FILE__, __LINE__);
4489 #endif 4489 #endif
4490 /* 4490 /*
4491 * The BKS might be reacquired before we have dropped 4491 * The BKS might be reacquired before we have dropped
4492 * PREEMPT_ACTIVE, which could trigger a second 4492 * PREEMPT_ACTIVE, which could trigger a second
4493 * cond_resched() call. 4493 * cond_resched() call.
4494 */ 4494 */
4495 do { 4495 do {
4496 add_preempt_count(PREEMPT_ACTIVE); 4496 add_preempt_count(PREEMPT_ACTIVE);
4497 schedule(); 4497 schedule();
4498 sub_preempt_count(PREEMPT_ACTIVE); 4498 sub_preempt_count(PREEMPT_ACTIVE);
4499 } while (need_resched()); 4499 } while (need_resched());
4500 } 4500 }
4501 4501
4502 int __sched cond_resched(void) 4502 int __sched cond_resched(void)
4503 { 4503 {
4504 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && 4504 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4505 system_state == SYSTEM_RUNNING) { 4505 system_state == SYSTEM_RUNNING) {
4506 __cond_resched(); 4506 __cond_resched();
4507 return 1; 4507 return 1;
4508 } 4508 }
4509 return 0; 4509 return 0;
4510 } 4510 }
4511 EXPORT_SYMBOL(cond_resched); 4511 EXPORT_SYMBOL(cond_resched);
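/*
 * Illustrative in-kernel sketch, not part of this diff: a long-running loop
 * typically calls cond_resched() once per iteration so that non-preemptible
 * kernels stay responsive. The helper name is made up for the example.
 */
static void example_long_scan(unsigned long *data, unsigned long n)
{
	unsigned long i;

	for (i = 0; i < n; i++) {
		data[i] = 0;		/* stand-in for real per-item work */
		cond_resched();		/* schedule here if a resched is pending */
	}
}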
4512 4512
4513 /* 4513 /*
4514 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 4514 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4515 * call schedule, and on return reacquire the lock. 4515 * call schedule, and on return reacquire the lock.
4516 * 4516 *
4517 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 4517 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4518 * operations here to prevent schedule() from being called twice (once via 4518 * operations here to prevent schedule() from being called twice (once via
4519 * spin_unlock(), once by hand). 4519 * spin_unlock(), once by hand).
4520 */ 4520 */
4521 int cond_resched_lock(spinlock_t *lock) 4521 int cond_resched_lock(spinlock_t *lock)
4522 { 4522 {
4523 int ret = 0; 4523 int ret = 0;
4524 4524
4525 if (need_lockbreak(lock)) { 4525 if (need_lockbreak(lock)) {
4526 spin_unlock(lock); 4526 spin_unlock(lock);
4527 cpu_relax(); 4527 cpu_relax();
4528 ret = 1; 4528 ret = 1;
4529 spin_lock(lock); 4529 spin_lock(lock);
4530 } 4530 }
4531 if (need_resched() && system_state == SYSTEM_RUNNING) { 4531 if (need_resched() && system_state == SYSTEM_RUNNING) {
4532 spin_release(&lock->dep_map, 1, _THIS_IP_); 4532 spin_release(&lock->dep_map, 1, _THIS_IP_);
4533 _raw_spin_unlock(lock); 4533 _raw_spin_unlock(lock);
4534 preempt_enable_no_resched(); 4534 preempt_enable_no_resched();
4535 __cond_resched(); 4535 __cond_resched();
4536 ret = 1; 4536 ret = 1;
4537 spin_lock(lock); 4537 spin_lock(lock);
4538 } 4538 }
4539 return ret; 4539 return ret;
4540 } 4540 }
4541 EXPORT_SYMBOL(cond_resched_lock); 4541 EXPORT_SYMBOL(cond_resched_lock);
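/*
 * Illustrative in-kernel sketch, not part of this diff: draining a long list
 * under a spinlock while letting cond_resched_lock() bound the lock hold
 * time. Because the lock may be dropped and retaken inside the loop, the
 * emptiness test is redone on every iteration. The helper name is made up.
 */
static void example_drain_list(spinlock_t *lock, struct list_head *head)
{
	spin_lock(lock);
	while (!list_empty(head)) {
		list_del_init(head->next);	/* the entry would be processed here */
		cond_resched_lock(lock);	/* may drop and reacquire *lock */
	}
	spin_unlock(lock);
}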
4542 4542
4543 int __sched cond_resched_softirq(void) 4543 int __sched cond_resched_softirq(void)
4544 { 4544 {
4545 BUG_ON(!in_softirq()); 4545 BUG_ON(!in_softirq());
4546 4546
4547 if (need_resched() && system_state == SYSTEM_RUNNING) { 4547 if (need_resched() && system_state == SYSTEM_RUNNING) {
4548 raw_local_irq_disable(); 4548 raw_local_irq_disable();
4549 _local_bh_enable(); 4549 _local_bh_enable();
4550 raw_local_irq_enable(); 4550 raw_local_irq_enable();
4551 __cond_resched(); 4551 __cond_resched();
4552 local_bh_disable(); 4552 local_bh_disable();
4553 return 1; 4553 return 1;
4554 } 4554 }
4555 return 0; 4555 return 0;
4556 } 4556 }
4557 EXPORT_SYMBOL(cond_resched_softirq); 4557 EXPORT_SYMBOL(cond_resched_softirq);
4558 4558
4559 /** 4559 /**
4560 * yield - yield the current processor to other threads. 4560 * yield - yield the current processor to other threads.
4561 * 4561 *
4562 * This is a shortcut for kernel-space yielding - it marks the 4562 * This is a shortcut for kernel-space yielding - it marks the
4563 * thread runnable and calls sys_sched_yield(). 4563 * thread runnable and calls sys_sched_yield().
4564 */ 4564 */
4565 void __sched yield(void) 4565 void __sched yield(void)
4566 { 4566 {
4567 set_current_state(TASK_RUNNING); 4567 set_current_state(TASK_RUNNING);
4568 sys_sched_yield(); 4568 sys_sched_yield();
4569 } 4569 }
4570 EXPORT_SYMBOL(yield); 4570 EXPORT_SYMBOL(yield);
4571 4571
4572 /* 4572 /*
4573 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 4573 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4574 * that process accounting knows that this is a task in IO wait state. 4574 * that process accounting knows that this is a task in IO wait state.
4575 * 4575 *
4576 * But don't do that if it is a deliberate, throttling IO wait (this task 4576 * But don't do that if it is a deliberate, throttling IO wait (this task
4577 * has set its backing_dev_info: the queue against which it should throttle) 4577 * has set its backing_dev_info: the queue against which it should throttle)
4578 */ 4578 */
4579 void __sched io_schedule(void) 4579 void __sched io_schedule(void)
4580 { 4580 {
4581 struct rq *rq = &__raw_get_cpu_var(runqueues); 4581 struct rq *rq = &__raw_get_cpu_var(runqueues);
4582 4582
4583 delayacct_blkio_start(); 4583 delayacct_blkio_start();
4584 atomic_inc(&rq->nr_iowait); 4584 atomic_inc(&rq->nr_iowait);
4585 schedule(); 4585 schedule();
4586 atomic_dec(&rq->nr_iowait); 4586 atomic_dec(&rq->nr_iowait);
4587 delayacct_blkio_end(); 4587 delayacct_blkio_end();
4588 } 4588 }
4589 EXPORT_SYMBOL(io_schedule); 4589 EXPORT_SYMBOL(io_schedule);
4590 4590
4591 long __sched io_schedule_timeout(long timeout) 4591 long __sched io_schedule_timeout(long timeout)
4592 { 4592 {
4593 struct rq *rq = &__raw_get_cpu_var(runqueues); 4593 struct rq *rq = &__raw_get_cpu_var(runqueues);
4594 long ret; 4594 long ret;
4595 4595
4596 delayacct_blkio_start(); 4596 delayacct_blkio_start();
4597 atomic_inc(&rq->nr_iowait); 4597 atomic_inc(&rq->nr_iowait);
4598 ret = schedule_timeout(timeout); 4598 ret = schedule_timeout(timeout);
4599 atomic_dec(&rq->nr_iowait); 4599 atomic_dec(&rq->nr_iowait);
4600 delayacct_blkio_end(); 4600 delayacct_blkio_end();
4601 return ret; 4601 return ret;
4602 } 4602 }
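/*
 * Illustrative in-kernel sketch, not part of this diff: sleeping for an I/O
 * completion via io_schedule() above, so the sleeper is accounted as iowait
 * rather than as an ordinary sleep. The helper name is made up.
 */
static void example_wait_for_io(wait_queue_head_t *wq, int *done)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		if (*done)
			break;
		io_schedule();
	}
	finish_wait(wq, &wait);
}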
4603 4603
4604 /** 4604 /**
4605 * sys_sched_get_priority_max - return maximum RT priority. 4605 * sys_sched_get_priority_max - return maximum RT priority.
4606 * @policy: scheduling class. 4606 * @policy: scheduling class.
4607 * 4607 *
4608 * this syscall returns the maximum rt_priority that can be used 4608 * this syscall returns the maximum rt_priority that can be used
4609 * by a given scheduling class. 4609 * by a given scheduling class.
4610 */ 4610 */
4611 asmlinkage long sys_sched_get_priority_max(int policy) 4611 asmlinkage long sys_sched_get_priority_max(int policy)
4612 { 4612 {
4613 int ret = -EINVAL; 4613 int ret = -EINVAL;
4614 4614
4615 switch (policy) { 4615 switch (policy) {
4616 case SCHED_FIFO: 4616 case SCHED_FIFO:
4617 case SCHED_RR: 4617 case SCHED_RR:
4618 ret = MAX_USER_RT_PRIO-1; 4618 ret = MAX_USER_RT_PRIO-1;
4619 break; 4619 break;
4620 case SCHED_NORMAL: 4620 case SCHED_NORMAL:
4621 case SCHED_BATCH: 4621 case SCHED_BATCH:
4622 ret = 0; 4622 ret = 0;
4623 break; 4623 break;
4624 } 4624 }
4625 return ret; 4625 return ret;
4626 } 4626 }
4627 4627
4628 /** 4628 /**
4629 * sys_sched_get_priority_min - return minimum RT priority. 4629 * sys_sched_get_priority_min - return minimum RT priority.
4630 * @policy: scheduling class. 4630 * @policy: scheduling class.
4631 * 4631 *
4632 * this syscall returns the minimum rt_priority that can be used 4632 * this syscall returns the minimum rt_priority that can be used
4633 * by a given scheduling class. 4633 * by a given scheduling class.
4634 */ 4634 */
4635 asmlinkage long sys_sched_get_priority_min(int policy) 4635 asmlinkage long sys_sched_get_priority_min(int policy)
4636 { 4636 {
4637 int ret = -EINVAL; 4637 int ret = -EINVAL;
4638 4638
4639 switch (policy) { 4639 switch (policy) {
4640 case SCHED_FIFO: 4640 case SCHED_FIFO:
4641 case SCHED_RR: 4641 case SCHED_RR:
4642 ret = 1; 4642 ret = 1;
4643 break; 4643 break;
4644 case SCHED_NORMAL: 4644 case SCHED_NORMAL:
4645 case SCHED_BATCH: 4645 case SCHED_BATCH:
4646 ret = 0; 4646 ret = 0;
4647 } 4647 }
4648 return ret; 4648 return ret;
4649 } 4649 }
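/*
 * Illustrative user-space sketch, not part of this diff: querying the valid
 * static priority range for SCHED_FIFO through the glibc wrappers for the
 * two syscalls above. The function name is made up for the example.
 */
#include <sched.h>
#include <stdio.h>

static void example_show_fifo_range(void)
{
	printf("SCHED_FIFO priorities: %d..%d\n",
	       sched_get_priority_min(SCHED_FIFO),
	       sched_get_priority_max(SCHED_FIFO));
}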
4650 4650
4651 /** 4651 /**
4652 * sys_sched_rr_get_interval - return the default timeslice of a process. 4652 * sys_sched_rr_get_interval - return the default timeslice of a process.
4653 * @pid: pid of the process. 4653 * @pid: pid of the process.
4654 * @interval: userspace pointer to the timeslice value. 4654 * @interval: userspace pointer to the timeslice value.
4655 * 4655 *
4656 * this syscall writes the default timeslice value of a given process 4656 * this syscall writes the default timeslice value of a given process
4657 * into the user-space timespec buffer. A value of '0' means infinity. 4657 * into the user-space timespec buffer. A value of '0' means infinity.
4658 */ 4658 */
4659 asmlinkage 4659 asmlinkage
4660 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) 4660 long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4661 { 4661 {
4662 struct task_struct *p; 4662 struct task_struct *p;
4663 int retval = -EINVAL; 4663 int retval = -EINVAL;
4664 struct timespec t; 4664 struct timespec t;
4665 4665
4666 if (pid < 0) 4666 if (pid < 0)
4667 goto out_nounlock; 4667 goto out_nounlock;
4668 4668
4669 retval = -ESRCH; 4669 retval = -ESRCH;
4670 read_lock(&tasklist_lock); 4670 read_lock(&tasklist_lock);
4671 p = find_process_by_pid(pid); 4671 p = find_process_by_pid(pid);
4672 if (!p) 4672 if (!p)
4673 goto out_unlock; 4673 goto out_unlock;
4674 4674
4675 retval = security_task_getscheduler(p); 4675 retval = security_task_getscheduler(p);
4676 if (retval) 4676 if (retval)
4677 goto out_unlock; 4677 goto out_unlock;
4678 4678
4679 jiffies_to_timespec(p->policy == SCHED_FIFO ? 4679 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4680 0 : task_timeslice(p), &t); 4680 0 : task_timeslice(p), &t);
4681 read_unlock(&tasklist_lock); 4681 read_unlock(&tasklist_lock);
4682 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4682 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4683 out_nounlock: 4683 out_nounlock:
4684 return retval; 4684 return retval;
4685 out_unlock: 4685 out_unlock:
4686 read_unlock(&tasklist_lock); 4686 read_unlock(&tasklist_lock);
4687 return retval; 4687 return retval;
4688 } 4688 }
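/*
 * Illustrative user-space sketch, not part of this diff: reading the default
 * timeslice of the calling process through the glibc wrapper for the syscall
 * above. The function name is made up for the example.
 */
#include <sched.h>
#include <stdio.h>
#include <time.h>

static void example_show_timeslice(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts) == 0)
		printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
}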
4689 4689
4690 static const char stat_nam[] = "RSDTtZX"; 4690 static const char stat_nam[] = "RSDTtZX";
4691 4691
4692 static void show_task(struct task_struct *p) 4692 static void show_task(struct task_struct *p)
4693 { 4693 {
4694 unsigned long free = 0; 4694 unsigned long free = 0;
4695 unsigned state; 4695 unsigned state;
4696 4696
4697 state = p->state ? __ffs(p->state) + 1 : 0; 4697 state = p->state ? __ffs(p->state) + 1 : 0;
4698 printk("%-13.13s %c", p->comm, 4698 printk("%-13.13s %c", p->comm,
4699 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 4699 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4700 #if (BITS_PER_LONG == 32) 4700 #if (BITS_PER_LONG == 32)
4701 if (state == TASK_RUNNING) 4701 if (state == TASK_RUNNING)
4702 printk(" running "); 4702 printk(" running ");
4703 else 4703 else
4704 printk(" %08lX ", thread_saved_pc(p)); 4704 printk(" %08lX ", thread_saved_pc(p));
4705 #else 4705 #else
4706 if (state == TASK_RUNNING) 4706 if (state == TASK_RUNNING)
4707 printk(" running task "); 4707 printk(" running task ");
4708 else 4708 else
4709 printk(" %016lx ", thread_saved_pc(p)); 4709 printk(" %016lx ", thread_saved_pc(p));
4710 #endif 4710 #endif
4711 #ifdef CONFIG_DEBUG_STACK_USAGE 4711 #ifdef CONFIG_DEBUG_STACK_USAGE
4712 { 4712 {
4713 unsigned long *n = end_of_stack(p); 4713 unsigned long *n = end_of_stack(p);
4714 while (!*n) 4714 while (!*n)
4715 n++; 4715 n++;
4716 free = (unsigned long)n - (unsigned long)end_of_stack(p); 4716 free = (unsigned long)n - (unsigned long)end_of_stack(p);
4717 } 4717 }
4718 #endif 4718 #endif
4719 printk("%5lu %5d %6d", free, p->pid, p->parent->pid); 4719 printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
4720 if (!p->mm) 4720 if (!p->mm)
4721 printk(" (L-TLB)\n"); 4721 printk(" (L-TLB)\n");
4722 else 4722 else
4723 printk(" (NOTLB)\n"); 4723 printk(" (NOTLB)\n");
4724 4724
4725 if (state != TASK_RUNNING) 4725 if (state != TASK_RUNNING)
4726 show_stack(p, NULL); 4726 show_stack(p, NULL);
4727 } 4727 }
4728 4728
4729 void show_state_filter(unsigned long state_filter) 4729 void show_state_filter(unsigned long state_filter)
4730 { 4730 {
4731 struct task_struct *g, *p; 4731 struct task_struct *g, *p;
4732 4732
4733 #if (BITS_PER_LONG == 32) 4733 #if (BITS_PER_LONG == 32)
4734 printk("\n" 4734 printk("\n"
4735 " free sibling\n"); 4735 " free sibling\n");
4736 printk(" task PC stack pid father child younger older\n"); 4736 printk(" task PC stack pid father child younger older\n");
4737 #else 4737 #else
4738 printk("\n" 4738 printk("\n"
4739 " free sibling\n"); 4739 " free sibling\n");
4740 printk(" task PC stack pid father child younger older\n"); 4740 printk(" task PC stack pid father child younger older\n");
4741 #endif 4741 #endif
4742 read_lock(&tasklist_lock); 4742 read_lock(&tasklist_lock);
4743 do_each_thread(g, p) { 4743 do_each_thread(g, p) {
4744 /* 4744 /*
4745 * reset the NMI-timeout, listing all files on a slow 4745 * reset the NMI-timeout, listing all files on a slow
 4746 * console might take a lot of time: 4746 * console might take a lot of time:
4747 */ 4747 */
4748 touch_nmi_watchdog(); 4748 touch_nmi_watchdog();
4749 if (!state_filter || (p->state & state_filter)) 4749 if (!state_filter || (p->state & state_filter))
4750 show_task(p); 4750 show_task(p);
4751 } while_each_thread(g, p); 4751 } while_each_thread(g, p);
4752 4752
4753 read_unlock(&tasklist_lock); 4753 read_unlock(&tasklist_lock);
4754 /* 4754 /*
4755 * Only show locks if all tasks are dumped: 4755 * Only show locks if all tasks are dumped:
4756 */ 4756 */
4757 if (state_filter == -1) 4757 if (state_filter == -1)
4758 debug_show_all_locks(); 4758 debug_show_all_locks();
4759 } 4759 }
4760 4760
4761 /** 4761 /**
4762 * init_idle - set up an idle thread for a given CPU 4762 * init_idle - set up an idle thread for a given CPU
4763 * @idle: task in question 4763 * @idle: task in question
4764 * @cpu: cpu the idle task belongs to 4764 * @cpu: cpu the idle task belongs to
4765 * 4765 *
4766 * NOTE: this function does not set the idle thread's NEED_RESCHED 4766 * NOTE: this function does not set the idle thread's NEED_RESCHED
4767 * flag, to make booting more robust. 4767 * flag, to make booting more robust.
4768 */ 4768 */
4769 void __cpuinit init_idle(struct task_struct *idle, int cpu) 4769 void __cpuinit init_idle(struct task_struct *idle, int cpu)
4770 { 4770 {
4771 struct rq *rq = cpu_rq(cpu); 4771 struct rq *rq = cpu_rq(cpu);
4772 unsigned long flags; 4772 unsigned long flags;
4773 4773
4774 idle->timestamp = sched_clock(); 4774 idle->timestamp = sched_clock();
4775 idle->sleep_avg = 0; 4775 idle->sleep_avg = 0;
4776 idle->array = NULL; 4776 idle->array = NULL;
4777 idle->prio = idle->normal_prio = MAX_PRIO; 4777 idle->prio = idle->normal_prio = MAX_PRIO;
4778 idle->state = TASK_RUNNING; 4778 idle->state = TASK_RUNNING;
4779 idle->cpus_allowed = cpumask_of_cpu(cpu); 4779 idle->cpus_allowed = cpumask_of_cpu(cpu);
4780 set_task_cpu(idle, cpu); 4780 set_task_cpu(idle, cpu);
4781 4781
4782 spin_lock_irqsave(&rq->lock, flags); 4782 spin_lock_irqsave(&rq->lock, flags);
4783 rq->curr = rq->idle = idle; 4783 rq->curr = rq->idle = idle;
4784 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 4784 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4785 idle->oncpu = 1; 4785 idle->oncpu = 1;
4786 #endif 4786 #endif
4787 spin_unlock_irqrestore(&rq->lock, flags); 4787 spin_unlock_irqrestore(&rq->lock, flags);
4788 4788
4789 /* Set the preempt count _outside_ the spinlocks! */ 4789 /* Set the preempt count _outside_ the spinlocks! */
4790 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) 4790 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4791 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); 4791 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4792 #else 4792 #else
4793 task_thread_info(idle)->preempt_count = 0; 4793 task_thread_info(idle)->preempt_count = 0;
4794 #endif 4794 #endif
4795 } 4795 }
4796 4796
4797 /* 4797 /*
4798 * In a system that switches off the HZ timer nohz_cpu_mask 4798 * In a system that switches off the HZ timer nohz_cpu_mask
4799 * indicates which cpus entered this state. This is used 4799 * indicates which cpus entered this state. This is used
 4800 * in the rcu update to wait only for active cpus. For systems 4800 * in the rcu update to wait only for active cpus. For systems
4801 * which do not switch off the HZ timer nohz_cpu_mask should 4801 * which do not switch off the HZ timer nohz_cpu_mask should
4802 * always be CPU_MASK_NONE. 4802 * always be CPU_MASK_NONE.
4803 */ 4803 */
4804 cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 4804 cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4805 4805
4806 #ifdef CONFIG_SMP 4806 #ifdef CONFIG_SMP
4807 /* 4807 /*
4808 * This is how migration works: 4808 * This is how migration works:
4809 * 4809 *
4810 * 1) we queue a struct migration_req structure in the source CPU's 4810 * 1) we queue a struct migration_req structure in the source CPU's
4811 * runqueue and wake up that CPU's migration thread. 4811 * runqueue and wake up that CPU's migration thread.
4812 * 2) we down() the locked semaphore => thread blocks. 4812 * 2) we down() the locked semaphore => thread blocks.
4813 * 3) migration thread wakes up (implicitly it forces the migrated 4813 * 3) migration thread wakes up (implicitly it forces the migrated
4814 * thread off the CPU) 4814 * thread off the CPU)
4815 * 4) it gets the migration request and checks whether the migrated 4815 * 4) it gets the migration request and checks whether the migrated
4816 * task is still in the wrong runqueue. 4816 * task is still in the wrong runqueue.
4817 * 5) if it's in the wrong runqueue then the migration thread removes 4817 * 5) if it's in the wrong runqueue then the migration thread removes
4818 * it and puts it into the right queue. 4818 * it and puts it into the right queue.
4819 * 6) migration thread up()s the semaphore. 4819 * 6) migration thread up()s the semaphore.
4820 * 7) we wake up and the migration is done. 4820 * 7) we wake up and the migration is done.
4821 */ 4821 */
4822 4822
4823 /* 4823 /*
4824 * Change a given task's CPU affinity. Migrate the thread to a 4824 * Change a given task's CPU affinity. Migrate the thread to a
4825 * proper CPU and schedule it away if the CPU it's executing on 4825 * proper CPU and schedule it away if the CPU it's executing on
4826 * is removed from the allowed bitmask. 4826 * is removed from the allowed bitmask.
4827 * 4827 *
4828 * NOTE: the caller must have a valid reference to the task, the 4828 * NOTE: the caller must have a valid reference to the task, the
4829 * task must not exit() & deallocate itself prematurely. The 4829 * task must not exit() & deallocate itself prematurely. The
4830 * call is not atomic; no spinlocks may be held. 4830 * call is not atomic; no spinlocks may be held.
4831 */ 4831 */
4832 int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) 4832 int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
4833 { 4833 {
4834 struct migration_req req; 4834 struct migration_req req;
4835 unsigned long flags; 4835 unsigned long flags;
4836 struct rq *rq; 4836 struct rq *rq;
4837 int ret = 0; 4837 int ret = 0;
4838 4838
4839 rq = task_rq_lock(p, &flags); 4839 rq = task_rq_lock(p, &flags);
4840 if (!cpus_intersects(new_mask, cpu_online_map)) { 4840 if (!cpus_intersects(new_mask, cpu_online_map)) {
4841 ret = -EINVAL; 4841 ret = -EINVAL;
4842 goto out; 4842 goto out;
4843 } 4843 }
4844 4844
4845 p->cpus_allowed = new_mask; 4845 p->cpus_allowed = new_mask;
4846 /* Can the task run on the task's current CPU? If so, we're done */ 4846 /* Can the task run on the task's current CPU? If so, we're done */
4847 if (cpu_isset(task_cpu(p), new_mask)) 4847 if (cpu_isset(task_cpu(p), new_mask))
4848 goto out; 4848 goto out;
4849 4849
4850 if (migrate_task(p, any_online_cpu(new_mask), &req)) { 4850 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4851 /* Need help from migration thread: drop lock and wait. */ 4851 /* Need help from migration thread: drop lock and wait. */
4852 task_rq_unlock(rq, &flags); 4852 task_rq_unlock(rq, &flags);
4853 wake_up_process(rq->migration_thread); 4853 wake_up_process(rq->migration_thread);
4854 wait_for_completion(&req.done); 4854 wait_for_completion(&req.done);
4855 tlb_migrate_finish(p->mm); 4855 tlb_migrate_finish(p->mm);
4856 return 0; 4856 return 0;
4857 } 4857 }
4858 out: 4858 out:
4859 task_rq_unlock(rq, &flags); 4859 task_rq_unlock(rq, &flags);
4860 4860
4861 return ret; 4861 return ret;
4862 } 4862 }
4863 EXPORT_SYMBOL_GPL(set_cpus_allowed); 4863 EXPORT_SYMBOL_GPL(set_cpus_allowed);
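/*
 * Illustrative in-kernel sketch, not part of this diff: a caller of the
 * exported set_cpus_allowed() above binding a helper thread to a single CPU.
 * The function name is made up for the example.
 */
static int example_bind_to_cpu(struct task_struct *tsk, int cpu)
{
	return set_cpus_allowed(tsk, cpumask_of_cpu(cpu));
}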
4864 4864
4865 /* 4865 /*
4866 * Move (not current) task off this cpu, onto dest cpu. We're doing 4866 * Move (not current) task off this cpu, onto dest cpu. We're doing
4867 * this because either it can't run here any more (set_cpus_allowed() 4867 * this because either it can't run here any more (set_cpus_allowed()
4868 * away from this CPU, or CPU going down), or because we're 4868 * away from this CPU, or CPU going down), or because we're
4869 * attempting to rebalance this task on exec (sched_exec). 4869 * attempting to rebalance this task on exec (sched_exec).
4870 * 4870 *
4871 * So we race with normal scheduler movements, but that's OK, as long 4871 * So we race with normal scheduler movements, but that's OK, as long
4872 * as the task is no longer on this CPU. 4872 * as the task is no longer on this CPU.
4873 * 4873 *
4874 * Returns non-zero if task was successfully migrated. 4874 * Returns non-zero if task was successfully migrated.
4875 */ 4875 */
4876 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4876 static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4877 { 4877 {
4878 struct rq *rq_dest, *rq_src; 4878 struct rq *rq_dest, *rq_src;
4879 int ret = 0; 4879 int ret = 0;
4880 4880
4881 if (unlikely(cpu_is_offline(dest_cpu))) 4881 if (unlikely(cpu_is_offline(dest_cpu)))
4882 return ret; 4882 return ret;
4883 4883
4884 rq_src = cpu_rq(src_cpu); 4884 rq_src = cpu_rq(src_cpu);
4885 rq_dest = cpu_rq(dest_cpu); 4885 rq_dest = cpu_rq(dest_cpu);
4886 4886
4887 double_rq_lock(rq_src, rq_dest); 4887 double_rq_lock(rq_src, rq_dest);
4888 /* Already moved. */ 4888 /* Already moved. */
4889 if (task_cpu(p) != src_cpu) 4889 if (task_cpu(p) != src_cpu)
4890 goto out; 4890 goto out;
4891 /* Affinity changed (again). */ 4891 /* Affinity changed (again). */
4892 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 4892 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4893 goto out; 4893 goto out;
4894 4894
4895 set_task_cpu(p, dest_cpu); 4895 set_task_cpu(p, dest_cpu);
4896 if (p->array) { 4896 if (p->array) {
4897 /* 4897 /*
4898 * Sync timestamp with rq_dest's before activating. 4898 * Sync timestamp with rq_dest's before activating.
4899 * The same thing could be achieved by doing this step 4899 * The same thing could be achieved by doing this step
4900 * afterwards, and pretending it was a local activate. 4900 * afterwards, and pretending it was a local activate.
4901 * This way is cleaner and logically correct. 4901 * This way is cleaner and logically correct.
4902 */ 4902 */
4903 p->timestamp = p->timestamp - rq_src->most_recent_timestamp 4903 p->timestamp = p->timestamp - rq_src->most_recent_timestamp
4904 + rq_dest->most_recent_timestamp; 4904 + rq_dest->most_recent_timestamp;
4905 deactivate_task(p, rq_src); 4905 deactivate_task(p, rq_src);
4906 __activate_task(p, rq_dest); 4906 __activate_task(p, rq_dest);
4907 if (TASK_PREEMPTS_CURR(p, rq_dest)) 4907 if (TASK_PREEMPTS_CURR(p, rq_dest))
4908 resched_task(rq_dest->curr); 4908 resched_task(rq_dest->curr);
4909 } 4909 }
4910 ret = 1; 4910 ret = 1;
4911 out: 4911 out:
4912 double_rq_unlock(rq_src, rq_dest); 4912 double_rq_unlock(rq_src, rq_dest);
4913 return ret; 4913 return ret;
4914 } 4914 }
4915 4915
4916 /* 4916 /*
4917 * migration_thread - this is a highprio system thread that performs 4917 * migration_thread - this is a highprio system thread that performs
 4918 * thread migration by bumping a thread off its CPU and then 'pushing' it onto 4918 * thread migration by bumping a thread off its CPU and then 'pushing' it onto
4919 * another runqueue. 4919 * another runqueue.
4920 */ 4920 */
4921 static int migration_thread(void *data) 4921 static int migration_thread(void *data)
4922 { 4922 {
4923 int cpu = (long)data; 4923 int cpu = (long)data;
4924 struct rq *rq; 4924 struct rq *rq;
4925 4925
4926 rq = cpu_rq(cpu); 4926 rq = cpu_rq(cpu);
4927 BUG_ON(rq->migration_thread != current); 4927 BUG_ON(rq->migration_thread != current);
4928 4928
4929 set_current_state(TASK_INTERRUPTIBLE); 4929 set_current_state(TASK_INTERRUPTIBLE);
4930 while (!kthread_should_stop()) { 4930 while (!kthread_should_stop()) {
4931 struct migration_req *req; 4931 struct migration_req *req;
4932 struct list_head *head; 4932 struct list_head *head;
4933 4933
4934 try_to_freeze(); 4934 try_to_freeze();
4935 4935
4936 spin_lock_irq(&rq->lock); 4936 spin_lock_irq(&rq->lock);
4937 4937
4938 if (cpu_is_offline(cpu)) { 4938 if (cpu_is_offline(cpu)) {
4939 spin_unlock_irq(&rq->lock); 4939 spin_unlock_irq(&rq->lock);
4940 goto wait_to_die; 4940 goto wait_to_die;
4941 } 4941 }
4942 4942
4943 if (rq->active_balance) { 4943 if (rq->active_balance) {
4944 active_load_balance(rq, cpu); 4944 active_load_balance(rq, cpu);
4945 rq->active_balance = 0; 4945 rq->active_balance = 0;
4946 } 4946 }
4947 4947
4948 head = &rq->migration_queue; 4948 head = &rq->migration_queue;
4949 4949
4950 if (list_empty(head)) { 4950 if (list_empty(head)) {
4951 spin_unlock_irq(&rq->lock); 4951 spin_unlock_irq(&rq->lock);
4952 schedule(); 4952 schedule();
4953 set_current_state(TASK_INTERRUPTIBLE); 4953 set_current_state(TASK_INTERRUPTIBLE);
4954 continue; 4954 continue;
4955 } 4955 }
4956 req = list_entry(head->next, struct migration_req, list); 4956 req = list_entry(head->next, struct migration_req, list);
4957 list_del_init(head->next); 4957 list_del_init(head->next);
4958 4958
4959 spin_unlock(&rq->lock); 4959 spin_unlock(&rq->lock);
4960 __migrate_task(req->task, cpu, req->dest_cpu); 4960 __migrate_task(req->task, cpu, req->dest_cpu);
4961 local_irq_enable(); 4961 local_irq_enable();
4962 4962
4963 complete(&req->done); 4963 complete(&req->done);
4964 } 4964 }
4965 __set_current_state(TASK_RUNNING); 4965 __set_current_state(TASK_RUNNING);
4966 return 0; 4966 return 0;
4967 4967
4968 wait_to_die: 4968 wait_to_die:
4969 /* Wait for kthread_stop */ 4969 /* Wait for kthread_stop */
4970 set_current_state(TASK_INTERRUPTIBLE); 4970 set_current_state(TASK_INTERRUPTIBLE);
4971 while (!kthread_should_stop()) { 4971 while (!kthread_should_stop()) {
4972 schedule(); 4972 schedule();
4973 set_current_state(TASK_INTERRUPTIBLE); 4973 set_current_state(TASK_INTERRUPTIBLE);
4974 } 4974 }
4975 __set_current_state(TASK_RUNNING); 4975 __set_current_state(TASK_RUNNING);
4976 return 0; 4976 return 0;
4977 } 4977 }
4978 4978
4979 #ifdef CONFIG_HOTPLUG_CPU 4979 #ifdef CONFIG_HOTPLUG_CPU
4980 /* 4980 /*
 4981 * Figure out where a task on a dead CPU should go, use force if necessary. 4981 * Figure out where a task on a dead CPU should go, use force if necessary.
4982 * NOTE: interrupts should be disabled by the caller 4982 * NOTE: interrupts should be disabled by the caller
4983 */ 4983 */
4984 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 4984 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
4985 { 4985 {
4986 unsigned long flags; 4986 unsigned long flags;
4987 cpumask_t mask; 4987 cpumask_t mask;
4988 struct rq *rq; 4988 struct rq *rq;
4989 int dest_cpu; 4989 int dest_cpu;
4990 4990
4991 restart: 4991 restart:
4992 /* On same node? */ 4992 /* On same node? */
4993 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 4993 mask = node_to_cpumask(cpu_to_node(dead_cpu));
4994 cpus_and(mask, mask, p->cpus_allowed); 4994 cpus_and(mask, mask, p->cpus_allowed);
4995 dest_cpu = any_online_cpu(mask); 4995 dest_cpu = any_online_cpu(mask);
4996 4996
4997 /* On any allowed CPU? */ 4997 /* On any allowed CPU? */
4998 if (dest_cpu == NR_CPUS) 4998 if (dest_cpu == NR_CPUS)
4999 dest_cpu = any_online_cpu(p->cpus_allowed); 4999 dest_cpu = any_online_cpu(p->cpus_allowed);
5000 5000
5001 /* No more Mr. Nice Guy. */ 5001 /* No more Mr. Nice Guy. */
5002 if (dest_cpu == NR_CPUS) { 5002 if (dest_cpu == NR_CPUS) {
5003 rq = task_rq_lock(p, &flags); 5003 rq = task_rq_lock(p, &flags);
5004 cpus_setall(p->cpus_allowed); 5004 cpus_setall(p->cpus_allowed);
5005 dest_cpu = any_online_cpu(p->cpus_allowed); 5005 dest_cpu = any_online_cpu(p->cpus_allowed);
5006 task_rq_unlock(rq, &flags); 5006 task_rq_unlock(rq, &flags);
5007 5007
5008 /* 5008 /*
5009 * Don't tell them about moving exiting tasks or 5009 * Don't tell them about moving exiting tasks or
5010 * kernel threads (both mm NULL), since they never 5010 * kernel threads (both mm NULL), since they never
 5011 * leave the kernel. 5011 * leave the kernel.
5012 */ 5012 */
5013 if (p->mm && printk_ratelimit()) 5013 if (p->mm && printk_ratelimit())
5014 printk(KERN_INFO "process %d (%s) no " 5014 printk(KERN_INFO "process %d (%s) no "
5015 "longer affine to cpu%d\n", 5015 "longer affine to cpu%d\n",
5016 p->pid, p->comm, dead_cpu); 5016 p->pid, p->comm, dead_cpu);
5017 } 5017 }
5018 if (!__migrate_task(p, dead_cpu, dest_cpu)) 5018 if (!__migrate_task(p, dead_cpu, dest_cpu))
5019 goto restart; 5019 goto restart;
5020 } 5020 }
5021 5021
5022 /* 5022 /*
5023 * While a dead CPU has no uninterruptible tasks queued at this point, 5023 * While a dead CPU has no uninterruptible tasks queued at this point,
5024 * it might still have a nonzero ->nr_uninterruptible counter, because 5024 * it might still have a nonzero ->nr_uninterruptible counter, because
 5025 * for performance reasons the counter is not strictly tracking tasks to 5025 * for performance reasons the counter is not strictly tracking tasks to
5026 * their home CPUs. So we just add the counter to another CPU's counter, 5026 * their home CPUs. So we just add the counter to another CPU's counter,
5027 * to keep the global sum constant after CPU-down: 5027 * to keep the global sum constant after CPU-down:
5028 */ 5028 */
5029 static void migrate_nr_uninterruptible(struct rq *rq_src) 5029 static void migrate_nr_uninterruptible(struct rq *rq_src)
5030 { 5030 {
5031 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); 5031 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
5032 unsigned long flags; 5032 unsigned long flags;
5033 5033
5034 local_irq_save(flags); 5034 local_irq_save(flags);
5035 double_rq_lock(rq_src, rq_dest); 5035 double_rq_lock(rq_src, rq_dest);
5036 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5036 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5037 rq_src->nr_uninterruptible = 0; 5037 rq_src->nr_uninterruptible = 0;
5038 double_rq_unlock(rq_src, rq_dest); 5038 double_rq_unlock(rq_src, rq_dest);
5039 local_irq_restore(flags); 5039 local_irq_restore(flags);
5040 } 5040 }
5041 5041
5042 /* Run through task list and migrate tasks from the dead cpu. */ 5042 /* Run through task list and migrate tasks from the dead cpu. */
5043 static void migrate_live_tasks(int src_cpu) 5043 static void migrate_live_tasks(int src_cpu)
5044 { 5044 {
5045 struct task_struct *p, *t; 5045 struct task_struct *p, *t;
5046 5046
5047 write_lock_irq(&tasklist_lock); 5047 write_lock_irq(&tasklist_lock);
5048 5048
5049 do_each_thread(t, p) { 5049 do_each_thread(t, p) {
5050 if (p == current) 5050 if (p == current)
5051 continue; 5051 continue;
5052 5052
5053 if (task_cpu(p) == src_cpu) 5053 if (task_cpu(p) == src_cpu)
5054 move_task_off_dead_cpu(src_cpu, p); 5054 move_task_off_dead_cpu(src_cpu, p);
5055 } while_each_thread(t, p); 5055 } while_each_thread(t, p);
5056 5056
5057 write_unlock_irq(&tasklist_lock); 5057 write_unlock_irq(&tasklist_lock);
5058 } 5058 }
5059 5059
5060 /* Schedules idle task to be the next runnable task on current CPU. 5060 /* Schedules idle task to be the next runnable task on current CPU.
5061 * It does so by boosting its priority to highest possible and adding it to 5061 * It does so by boosting its priority to highest possible and adding it to
5062 * the _front_ of the runqueue. Used by CPU offline code. 5062 * the _front_ of the runqueue. Used by CPU offline code.
5063 */ 5063 */
5064 void sched_idle_next(void) 5064 void sched_idle_next(void)
5065 { 5065 {
5066 int this_cpu = smp_processor_id(); 5066 int this_cpu = smp_processor_id();
5067 struct rq *rq = cpu_rq(this_cpu); 5067 struct rq *rq = cpu_rq(this_cpu);
5068 struct task_struct *p = rq->idle; 5068 struct task_struct *p = rq->idle;
5069 unsigned long flags; 5069 unsigned long flags;
5070 5070
5071 /* cpu has to be offline */ 5071 /* cpu has to be offline */
5072 BUG_ON(cpu_online(this_cpu)); 5072 BUG_ON(cpu_online(this_cpu));
5073 5073
5074 /* 5074 /*
5075 * Strictly not necessary since rest of the CPUs are stopped by now 5075 * Strictly not necessary since rest of the CPUs are stopped by now
5076 * and interrupts disabled on the current cpu. 5076 * and interrupts disabled on the current cpu.
5077 */ 5077 */
5078 spin_lock_irqsave(&rq->lock, flags); 5078 spin_lock_irqsave(&rq->lock, flags);
5079 5079
5080 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 5080 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
5081 5081
5082 /* Add idle task to the _front_ of its priority queue: */ 5082 /* Add idle task to the _front_ of its priority queue: */
5083 __activate_idle_task(p, rq); 5083 __activate_idle_task(p, rq);
5084 5084
5085 spin_unlock_irqrestore(&rq->lock, flags); 5085 spin_unlock_irqrestore(&rq->lock, flags);
5086 } 5086 }
5087 5087
5088 /* 5088 /*
5089 * Ensures that the idle task is using init_mm right before its cpu goes 5089 * Ensures that the idle task is using init_mm right before its cpu goes
5090 * offline. 5090 * offline.
5091 */ 5091 */
5092 void idle_task_exit(void) 5092 void idle_task_exit(void)
5093 { 5093 {
5094 struct mm_struct *mm = current->active_mm; 5094 struct mm_struct *mm = current->active_mm;
5095 5095
5096 BUG_ON(cpu_online(smp_processor_id())); 5096 BUG_ON(cpu_online(smp_processor_id()));
5097 5097
5098 if (mm != &init_mm) 5098 if (mm != &init_mm)
5099 switch_mm(mm, &init_mm, current); 5099 switch_mm(mm, &init_mm, current);
5100 mmdrop(mm); 5100 mmdrop(mm);
5101 } 5101 }
5102 5102
5103 /* called under rq->lock with disabled interrupts */ 5103 /* called under rq->lock with disabled interrupts */
5104 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) 5104 static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5105 { 5105 {
5106 struct rq *rq = cpu_rq(dead_cpu); 5106 struct rq *rq = cpu_rq(dead_cpu);
5107 5107
5108 /* Must be exiting, otherwise would be on tasklist. */ 5108 /* Must be exiting, otherwise would be on tasklist. */
5109 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD); 5109 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
5110 5110
5111 /* Cannot have done final schedule yet: would have vanished. */ 5111 /* Cannot have done final schedule yet: would have vanished. */
5112 BUG_ON(p->state == TASK_DEAD); 5112 BUG_ON(p->state == TASK_DEAD);
5113 5113
5114 get_task_struct(p); 5114 get_task_struct(p);
5115 5115
5116 /* 5116 /*
5117 * Drop lock around migration; if someone else moves it, 5117 * Drop lock around migration; if someone else moves it,
5118 * that's OK. No task can be added to this CPU, so iteration is 5118 * that's OK. No task can be added to this CPU, so iteration is
5119 * fine. 5119 * fine.
5120 * NOTE: interrupts should be left disabled --dev@ 5120 * NOTE: interrupts should be left disabled --dev@
5121 */ 5121 */
5122 spin_unlock(&rq->lock); 5122 spin_unlock(&rq->lock);
5123 move_task_off_dead_cpu(dead_cpu, p); 5123 move_task_off_dead_cpu(dead_cpu, p);
5124 spin_lock(&rq->lock); 5124 spin_lock(&rq->lock);
5125 5125
5126 put_task_struct(p); 5126 put_task_struct(p);
5127 } 5127 }
5128 5128
5129 /* release_task() removes task from tasklist, so we won't find dead tasks. */ 5129 /* release_task() removes task from tasklist, so we won't find dead tasks. */
5130 static void migrate_dead_tasks(unsigned int dead_cpu) 5130 static void migrate_dead_tasks(unsigned int dead_cpu)
5131 { 5131 {
5132 struct rq *rq = cpu_rq(dead_cpu); 5132 struct rq *rq = cpu_rq(dead_cpu);
5133 unsigned int arr, i; 5133 unsigned int arr, i;
5134 5134
5135 for (arr = 0; arr < 2; arr++) { 5135 for (arr = 0; arr < 2; arr++) {
5136 for (i = 0; i < MAX_PRIO; i++) { 5136 for (i = 0; i < MAX_PRIO; i++) {
5137 struct list_head *list = &rq->arrays[arr].queue[i]; 5137 struct list_head *list = &rq->arrays[arr].queue[i];
5138 5138
5139 while (!list_empty(list)) 5139 while (!list_empty(list))
5140 migrate_dead(dead_cpu, list_entry(list->next, 5140 migrate_dead(dead_cpu, list_entry(list->next,
5141 struct task_struct, run_list)); 5141 struct task_struct, run_list));
5142 } 5142 }
5143 } 5143 }
5144 } 5144 }
5145 #endif /* CONFIG_HOTPLUG_CPU */ 5145 #endif /* CONFIG_HOTPLUG_CPU */
5146 5146
5147 /* 5147 /*
5148 * migration_call - callback that gets triggered when a CPU is added. 5148 * migration_call - callback that gets triggered when a CPU is added.
5149 * Here we can start up the necessary migration thread for the new CPU. 5149 * Here we can start up the necessary migration thread for the new CPU.
5150 */ 5150 */
5151 static int __cpuinit 5151 static int __cpuinit
5152 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5152 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5153 { 5153 {
5154 struct task_struct *p; 5154 struct task_struct *p;
5155 int cpu = (long)hcpu; 5155 int cpu = (long)hcpu;
5156 unsigned long flags; 5156 unsigned long flags;
5157 struct rq *rq; 5157 struct rq *rq;
5158 5158
5159 switch (action) { 5159 switch (action) {
5160 case CPU_UP_PREPARE: 5160 case CPU_UP_PREPARE:
5161 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); 5161 p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
5162 if (IS_ERR(p)) 5162 if (IS_ERR(p))
5163 return NOTIFY_BAD; 5163 return NOTIFY_BAD;
5164 p->flags |= PF_NOFREEZE; 5164 p->flags |= PF_NOFREEZE;
5165 kthread_bind(p, cpu); 5165 kthread_bind(p, cpu);
5166 /* Must be high prio: stop_machine expects to yield to it. */ 5166 /* Must be high prio: stop_machine expects to yield to it. */
5167 rq = task_rq_lock(p, &flags); 5167 rq = task_rq_lock(p, &flags);
5168 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 5168 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
5169 task_rq_unlock(rq, &flags); 5169 task_rq_unlock(rq, &flags);
5170 cpu_rq(cpu)->migration_thread = p; 5170 cpu_rq(cpu)->migration_thread = p;
5171 break; 5171 break;
5172 5172
5173 case CPU_ONLINE: 5173 case CPU_ONLINE:
5174 /* Strictly unnecessary, as first user will wake it. */ 5174 /* Strictly unnecessary, as first user will wake it. */
5175 wake_up_process(cpu_rq(cpu)->migration_thread); 5175 wake_up_process(cpu_rq(cpu)->migration_thread);
5176 break; 5176 break;
5177 5177
5178 #ifdef CONFIG_HOTPLUG_CPU 5178 #ifdef CONFIG_HOTPLUG_CPU
5179 case CPU_UP_CANCELED: 5179 case CPU_UP_CANCELED:
5180 if (!cpu_rq(cpu)->migration_thread) 5180 if (!cpu_rq(cpu)->migration_thread)
5181 break; 5181 break;
5182 /* Unbind it from offline cpu so it can run. Fall thru. */ 5182 /* Unbind it from offline cpu so it can run. Fall thru. */
5183 kthread_bind(cpu_rq(cpu)->migration_thread, 5183 kthread_bind(cpu_rq(cpu)->migration_thread,
5184 any_online_cpu(cpu_online_map)); 5184 any_online_cpu(cpu_online_map));
5185 kthread_stop(cpu_rq(cpu)->migration_thread); 5185 kthread_stop(cpu_rq(cpu)->migration_thread);
5186 cpu_rq(cpu)->migration_thread = NULL; 5186 cpu_rq(cpu)->migration_thread = NULL;
5187 break; 5187 break;
5188 5188
5189 case CPU_DEAD: 5189 case CPU_DEAD:
5190 migrate_live_tasks(cpu); 5190 migrate_live_tasks(cpu);
5191 rq = cpu_rq(cpu); 5191 rq = cpu_rq(cpu);
5192 kthread_stop(rq->migration_thread); 5192 kthread_stop(rq->migration_thread);
5193 rq->migration_thread = NULL; 5193 rq->migration_thread = NULL;
5194 /* Idle task back to normal (off runqueue, low prio) */ 5194 /* Idle task back to normal (off runqueue, low prio) */
5195 rq = task_rq_lock(rq->idle, &flags); 5195 rq = task_rq_lock(rq->idle, &flags);
5196 deactivate_task(rq->idle, rq); 5196 deactivate_task(rq->idle, rq);
5197 rq->idle->static_prio = MAX_PRIO; 5197 rq->idle->static_prio = MAX_PRIO;
5198 __setscheduler(rq->idle, SCHED_NORMAL, 0); 5198 __setscheduler(rq->idle, SCHED_NORMAL, 0);
5199 migrate_dead_tasks(cpu); 5199 migrate_dead_tasks(cpu);
5200 task_rq_unlock(rq, &flags); 5200 task_rq_unlock(rq, &flags);
5201 migrate_nr_uninterruptible(rq); 5201 migrate_nr_uninterruptible(rq);
5202 BUG_ON(rq->nr_running != 0); 5202 BUG_ON(rq->nr_running != 0);
5203 5203
5204 /* No need to migrate the tasks: it was best-effort if 5204 /* No need to migrate the tasks: it was best-effort if
5205 * they didn't do lock_cpu_hotplug(). Just wake up 5205 * they didn't do lock_cpu_hotplug(). Just wake up
5206 * the requestors. */ 5206 * the requestors. */
5207 spin_lock_irq(&rq->lock); 5207 spin_lock_irq(&rq->lock);
5208 while (!list_empty(&rq->migration_queue)) { 5208 while (!list_empty(&rq->migration_queue)) {
5209 struct migration_req *req; 5209 struct migration_req *req;
5210 5210
5211 req = list_entry(rq->migration_queue.next, 5211 req = list_entry(rq->migration_queue.next,
5212 struct migration_req, list); 5212 struct migration_req, list);
5213 list_del_init(&req->list); 5213 list_del_init(&req->list);
5214 complete(&req->done); 5214 complete(&req->done);
5215 } 5215 }
5216 spin_unlock_irq(&rq->lock); 5216 spin_unlock_irq(&rq->lock);
5217 break; 5217 break;
5218 #endif 5218 #endif
5219 } 5219 }
5220 return NOTIFY_OK; 5220 return NOTIFY_OK;
5221 } 5221 }
5222 5222
5223 /* Register at highest priority so that task migration (migrate_all_tasks) 5223 /* Register at highest priority so that task migration (migrate_all_tasks)
5224 * happens before everything else. 5224 * happens before everything else.
5225 */ 5225 */
5226 static struct notifier_block __cpuinitdata migration_notifier = { 5226 static struct notifier_block __cpuinitdata migration_notifier = {
5227 .notifier_call = migration_call, 5227 .notifier_call = migration_call,
5228 .priority = 10 5228 .priority = 10
5229 }; 5229 };
5230 5230
5231 int __init migration_init(void) 5231 int __init migration_init(void)
5232 { 5232 {
5233 void *cpu = (void *)(long)smp_processor_id(); 5233 void *cpu = (void *)(long)smp_processor_id();
5234 int err; 5234 int err;
5235 5235
5236 /* Start one for the boot CPU: */ 5236 /* Start one for the boot CPU: */
5237 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 5237 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5238 BUG_ON(err == NOTIFY_BAD); 5238 BUG_ON(err == NOTIFY_BAD);
5239 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5239 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5240 register_cpu_notifier(&migration_notifier); 5240 register_cpu_notifier(&migration_notifier);
5241 5241
5242 return 0; 5242 return 0;
5243 } 5243 }
5244 #endif 5244 #endif
5245 5245
5246 #ifdef CONFIG_SMP 5246 #ifdef CONFIG_SMP
5247
5248 /* Number of possible processor ids */
5249 int nr_cpu_ids __read_mostly = NR_CPUS;
5250 EXPORT_SYMBOL(nr_cpu_ids);
5251
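
The hunk above is the scheduler side of this commit: nr_cpu_ids now lives in sched.c and starts out at NR_CPUS, so any accidental user that runs before sched_init() trims it can only over-allocate, never under-allocate. A minimal sketch of such an early-boot consumer follows; the function and array names are illustrative only and not part of this commit:

	#include <linux/init.h>
	#include <linux/bootmem.h>

	extern int nr_cpu_ids;	/* provided by the kernel, as added above */

	static void **early_slots;

	/* Hypothetical early-boot user: size a per-cpu slot array by id count. */
	static void __init early_slots_init(void)
	{
		/*
		 * Before sched_init() trims nr_cpu_ids down to the real
		 * highest possible CPU id + 1, it still reads NR_CPUS, so
		 * this allocation is at worst too large; with the old
		 * initial value of 0 the array would have been empty.
		 */
		early_slots = alloc_bootmem(nr_cpu_ids * sizeof(void *));
	}
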
5247 #undef SCHED_DOMAIN_DEBUG 5252 #undef SCHED_DOMAIN_DEBUG
5248 #ifdef SCHED_DOMAIN_DEBUG 5253 #ifdef SCHED_DOMAIN_DEBUG
5249 static void sched_domain_debug(struct sched_domain *sd, int cpu) 5254 static void sched_domain_debug(struct sched_domain *sd, int cpu)
5250 { 5255 {
5251 int level = 0; 5256 int level = 0;
5252 5257
5253 if (!sd) { 5258 if (!sd) {
5254 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 5259 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5255 return; 5260 return;
5256 } 5261 }
5257 5262
5258 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 5263 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5259 5264
5260 do { 5265 do {
5261 int i; 5266 int i;
5262 char str[NR_CPUS]; 5267 char str[NR_CPUS];
5263 struct sched_group *group = sd->groups; 5268 struct sched_group *group = sd->groups;
5264 cpumask_t groupmask; 5269 cpumask_t groupmask;
5265 5270
5266 cpumask_scnprintf(str, NR_CPUS, sd->span); 5271 cpumask_scnprintf(str, NR_CPUS, sd->span);
5267 cpus_clear(groupmask); 5272 cpus_clear(groupmask);
5268 5273
5269 printk(KERN_DEBUG); 5274 printk(KERN_DEBUG);
5270 for (i = 0; i < level + 1; i++) 5275 for (i = 0; i < level + 1; i++)
5271 printk(" "); 5276 printk(" ");
5272 printk("domain %d: ", level); 5277 printk("domain %d: ", level);
5273 5278
5274 if (!(sd->flags & SD_LOAD_BALANCE)) { 5279 if (!(sd->flags & SD_LOAD_BALANCE)) {
5275 printk("does not load-balance\n"); 5280 printk("does not load-balance\n");
5276 if (sd->parent) 5281 if (sd->parent)
5277 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 5282 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5278 " has parent"); 5283 " has parent");
5279 break; 5284 break;
5280 } 5285 }
5281 5286
5282 printk("span %s\n", str); 5287 printk("span %s\n", str);
5283 5288
5284 if (!cpu_isset(cpu, sd->span)) 5289 if (!cpu_isset(cpu, sd->span))
5285 printk(KERN_ERR "ERROR: domain->span does not contain " 5290 printk(KERN_ERR "ERROR: domain->span does not contain "
5286 "CPU%d\n", cpu); 5291 "CPU%d\n", cpu);
5287 if (!cpu_isset(cpu, group->cpumask)) 5292 if (!cpu_isset(cpu, group->cpumask))
5288 printk(KERN_ERR "ERROR: domain->groups does not contain" 5293 printk(KERN_ERR "ERROR: domain->groups does not contain"
5289 " CPU%d\n", cpu); 5294 " CPU%d\n", cpu);
5290 5295
5291 printk(KERN_DEBUG); 5296 printk(KERN_DEBUG);
5292 for (i = 0; i < level + 2; i++) 5297 for (i = 0; i < level + 2; i++)
5293 printk(" "); 5298 printk(" ");
5294 printk("groups:"); 5299 printk("groups:");
5295 do { 5300 do {
5296 if (!group) { 5301 if (!group) {
5297 printk("\n"); 5302 printk("\n");
5298 printk(KERN_ERR "ERROR: group is NULL\n"); 5303 printk(KERN_ERR "ERROR: group is NULL\n");
5299 break; 5304 break;
5300 } 5305 }
5301 5306
5302 if (!group->cpu_power) { 5307 if (!group->cpu_power) {
5303 printk("\n"); 5308 printk("\n");
5304 printk(KERN_ERR "ERROR: domain->cpu_power not " 5309 printk(KERN_ERR "ERROR: domain->cpu_power not "
5305 "set\n"); 5310 "set\n");
5306 } 5311 }
5307 5312
5308 if (!cpus_weight(group->cpumask)) { 5313 if (!cpus_weight(group->cpumask)) {
5309 printk("\n"); 5314 printk("\n");
5310 printk(KERN_ERR "ERROR: empty group\n"); 5315 printk(KERN_ERR "ERROR: empty group\n");
5311 } 5316 }
5312 5317
5313 if (cpus_intersects(groupmask, group->cpumask)) { 5318 if (cpus_intersects(groupmask, group->cpumask)) {
5314 printk("\n"); 5319 printk("\n");
5315 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5320 printk(KERN_ERR "ERROR: repeated CPUs\n");
5316 } 5321 }
5317 5322
5318 cpus_or(groupmask, groupmask, group->cpumask); 5323 cpus_or(groupmask, groupmask, group->cpumask);
5319 5324
5320 cpumask_scnprintf(str, NR_CPUS, group->cpumask); 5325 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5321 printk(" %s", str); 5326 printk(" %s", str);
5322 5327
5323 group = group->next; 5328 group = group->next;
5324 } while (group != sd->groups); 5329 } while (group != sd->groups);
5325 printk("\n"); 5330 printk("\n");
5326 5331
5327 if (!cpus_equal(sd->span, groupmask)) 5332 if (!cpus_equal(sd->span, groupmask))
5328 printk(KERN_ERR "ERROR: groups don't span " 5333 printk(KERN_ERR "ERROR: groups don't span "
5329 "domain->span\n"); 5334 "domain->span\n");
5330 5335
5331 level++; 5336 level++;
5332 sd = sd->parent; 5337 sd = sd->parent;
5333 if (!sd) 5338 if (!sd)
5334 continue; 5339 continue;
5335 5340
5336 if (!cpus_subset(groupmask, sd->span)) 5341 if (!cpus_subset(groupmask, sd->span))
5337 printk(KERN_ERR "ERROR: parent span is not a superset " 5342 printk(KERN_ERR "ERROR: parent span is not a superset "
5338 "of domain->span\n"); 5343 "of domain->span\n");
5339 5344
5340 } while (sd); 5345 } while (sd);
5341 } 5346 }
5342 #else 5347 #else
5343 # define sched_domain_debug(sd, cpu) do { } while (0) 5348 # define sched_domain_debug(sd, cpu) do { } while (0)
5344 #endif 5349 #endif
5345 5350
5346 static int sd_degenerate(struct sched_domain *sd) 5351 static int sd_degenerate(struct sched_domain *sd)
5347 { 5352 {
5348 if (cpus_weight(sd->span) == 1) 5353 if (cpus_weight(sd->span) == 1)
5349 return 1; 5354 return 1;
5350 5355
5351 /* Following flags need at least 2 groups */ 5356 /* Following flags need at least 2 groups */
5352 if (sd->flags & (SD_LOAD_BALANCE | 5357 if (sd->flags & (SD_LOAD_BALANCE |
5353 SD_BALANCE_NEWIDLE | 5358 SD_BALANCE_NEWIDLE |
5354 SD_BALANCE_FORK | 5359 SD_BALANCE_FORK |
5355 SD_BALANCE_EXEC | 5360 SD_BALANCE_EXEC |
5356 SD_SHARE_CPUPOWER | 5361 SD_SHARE_CPUPOWER |
5357 SD_SHARE_PKG_RESOURCES)) { 5362 SD_SHARE_PKG_RESOURCES)) {
5358 if (sd->groups != sd->groups->next) 5363 if (sd->groups != sd->groups->next)
5359 return 0; 5364 return 0;
5360 } 5365 }
5361 5366
5362 /* Following flags don't use groups */ 5367 /* Following flags don't use groups */
5363 if (sd->flags & (SD_WAKE_IDLE | 5368 if (sd->flags & (SD_WAKE_IDLE |
5364 SD_WAKE_AFFINE | 5369 SD_WAKE_AFFINE |
5365 SD_WAKE_BALANCE)) 5370 SD_WAKE_BALANCE))
5366 return 0; 5371 return 0;
5367 5372
5368 return 1; 5373 return 1;
5369 } 5374 }
5370 5375
5371 static int 5376 static int
5372 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) 5377 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5373 { 5378 {
5374 unsigned long cflags = sd->flags, pflags = parent->flags; 5379 unsigned long cflags = sd->flags, pflags = parent->flags;
5375 5380
5376 if (sd_degenerate(parent)) 5381 if (sd_degenerate(parent))
5377 return 1; 5382 return 1;
5378 5383
5379 if (!cpus_equal(sd->span, parent->span)) 5384 if (!cpus_equal(sd->span, parent->span))
5380 return 0; 5385 return 0;
5381 5386
5382 /* Does parent contain flags not in child? */ 5387 /* Does parent contain flags not in child? */
5383 /* WAKE_BALANCE is a subset of WAKE_AFFINE */ 5388 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5384 if (cflags & SD_WAKE_AFFINE) 5389 if (cflags & SD_WAKE_AFFINE)
5385 pflags &= ~SD_WAKE_BALANCE; 5390 pflags &= ~SD_WAKE_BALANCE;
5386 /* Flags needing groups don't count if only 1 group in parent */ 5391 /* Flags needing groups don't count if only 1 group in parent */
5387 if (parent->groups == parent->groups->next) { 5392 if (parent->groups == parent->groups->next) {
5388 pflags &= ~(SD_LOAD_BALANCE | 5393 pflags &= ~(SD_LOAD_BALANCE |
5389 SD_BALANCE_NEWIDLE | 5394 SD_BALANCE_NEWIDLE |
5390 SD_BALANCE_FORK | 5395 SD_BALANCE_FORK |
5391 SD_BALANCE_EXEC | 5396 SD_BALANCE_EXEC |
5392 SD_SHARE_CPUPOWER | 5397 SD_SHARE_CPUPOWER |
5393 SD_SHARE_PKG_RESOURCES); 5398 SD_SHARE_PKG_RESOURCES);
5394 } 5399 }
5395 if (~cflags & pflags) 5400 if (~cflags & pflags)
5396 return 0; 5401 return 0;
5397 5402
5398 return 1; 5403 return 1;
5399 } 5404 }
5400 5405
5401 /* 5406 /*
5402 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5407 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5403 * hold the hotplug lock. 5408 * hold the hotplug lock.
5404 */ 5409 */
5405 static void cpu_attach_domain(struct sched_domain *sd, int cpu) 5410 static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5406 { 5411 {
5407 struct rq *rq = cpu_rq(cpu); 5412 struct rq *rq = cpu_rq(cpu);
5408 struct sched_domain *tmp; 5413 struct sched_domain *tmp;
5409 5414
5410 /* Remove the sched domains which do not contribute to scheduling. */ 5415 /* Remove the sched domains which do not contribute to scheduling. */
5411 for (tmp = sd; tmp; tmp = tmp->parent) { 5416 for (tmp = sd; tmp; tmp = tmp->parent) {
5412 struct sched_domain *parent = tmp->parent; 5417 struct sched_domain *parent = tmp->parent;
5413 if (!parent) 5418 if (!parent)
5414 break; 5419 break;
5415 if (sd_parent_degenerate(tmp, parent)) { 5420 if (sd_parent_degenerate(tmp, parent)) {
5416 tmp->parent = parent->parent; 5421 tmp->parent = parent->parent;
5417 if (parent->parent) 5422 if (parent->parent)
5418 parent->parent->child = tmp; 5423 parent->parent->child = tmp;
5419 } 5424 }
5420 } 5425 }
5421 5426
5422 if (sd && sd_degenerate(sd)) { 5427 if (sd && sd_degenerate(sd)) {
5423 sd = sd->parent; 5428 sd = sd->parent;
5424 if (sd) 5429 if (sd)
5425 sd->child = NULL; 5430 sd->child = NULL;
5426 } 5431 }
5427 5432
5428 sched_domain_debug(sd, cpu); 5433 sched_domain_debug(sd, cpu);
5429 5434
5430 rcu_assign_pointer(rq->sd, sd); 5435 rcu_assign_pointer(rq->sd, sd);
5431 } 5436 }
5432 5437
5433 /* cpus with isolated domains */ 5438 /* cpus with isolated domains */
5434 static cpumask_t cpu_isolated_map = CPU_MASK_NONE; 5439 static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
5435 5440
5436 /* Setup the mask of cpus configured for isolated domains */ 5441 /* Setup the mask of cpus configured for isolated domains */
5437 static int __init isolated_cpu_setup(char *str) 5442 static int __init isolated_cpu_setup(char *str)
5438 { 5443 {
5439 int ints[NR_CPUS], i; 5444 int ints[NR_CPUS], i;
5440 5445
5441 str = get_options(str, ARRAY_SIZE(ints), ints); 5446 str = get_options(str, ARRAY_SIZE(ints), ints);
5442 cpus_clear(cpu_isolated_map); 5447 cpus_clear(cpu_isolated_map);
5443 for (i = 1; i <= ints[0]; i++) 5448 for (i = 1; i <= ints[0]; i++)
5444 if (ints[i] < NR_CPUS) 5449 if (ints[i] < NR_CPUS)
5445 cpu_set(ints[i], cpu_isolated_map); 5450 cpu_set(ints[i], cpu_isolated_map);
5446 return 1; 5451 return 1;
5447 } 5452 }
5448 5453
5449 __setup ("isolcpus=", isolated_cpu_setup); 5454 __setup ("isolcpus=", isolated_cpu_setup);
5450 5455
5451 /* 5456 /*
5452 * init_sched_build_groups takes the cpumask we wish to span, and a pointer 5457 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5453 * to a function which identifies what group (along with sched group) a CPU 5458 * to a function which identifies what group (along with sched group) a CPU
5454 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS 5459 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
5455 * (due to the fact that we keep track of groups covered with a cpumask_t). 5460 * (due to the fact that we keep track of groups covered with a cpumask_t).
5456 * 5461 *
5457 * init_sched_build_groups will build a circular linked list of the groups 5462 * init_sched_build_groups will build a circular linked list of the groups
5458 * covered by the given span, and will set each group's ->cpumask correctly, 5463 * covered by the given span, and will set each group's ->cpumask correctly,
5459 * and ->cpu_power to 0. 5464 * and ->cpu_power to 0.
5460 */ 5465 */
5461 static void 5466 static void
5462 init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, 5467 init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5463 int (*group_fn)(int cpu, const cpumask_t *cpu_map, 5468 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5464 struct sched_group **sg)) 5469 struct sched_group **sg))
5465 { 5470 {
5466 struct sched_group *first = NULL, *last = NULL; 5471 struct sched_group *first = NULL, *last = NULL;
5467 cpumask_t covered = CPU_MASK_NONE; 5472 cpumask_t covered = CPU_MASK_NONE;
5468 int i; 5473 int i;
5469 5474
5470 for_each_cpu_mask(i, span) { 5475 for_each_cpu_mask(i, span) {
5471 struct sched_group *sg; 5476 struct sched_group *sg;
5472 int group = group_fn(i, cpu_map, &sg); 5477 int group = group_fn(i, cpu_map, &sg);
5473 int j; 5478 int j;
5474 5479
5475 if (cpu_isset(i, covered)) 5480 if (cpu_isset(i, covered))
5476 continue; 5481 continue;
5477 5482
5478 sg->cpumask = CPU_MASK_NONE; 5483 sg->cpumask = CPU_MASK_NONE;
5479 sg->cpu_power = 0; 5484 sg->cpu_power = 0;
5480 5485
5481 for_each_cpu_mask(j, span) { 5486 for_each_cpu_mask(j, span) {
5482 if (group_fn(j, cpu_map, NULL) != group) 5487 if (group_fn(j, cpu_map, NULL) != group)
5483 continue; 5488 continue;
5484 5489
5485 cpu_set(j, covered); 5490 cpu_set(j, covered);
5486 cpu_set(j, sg->cpumask); 5491 cpu_set(j, sg->cpumask);
5487 } 5492 }
5488 if (!first) 5493 if (!first)
5489 first = sg; 5494 first = sg;
5490 if (last) 5495 if (last)
5491 last->next = sg; 5496 last->next = sg;
5492 last = sg; 5497 last = sg;
5493 } 5498 }
5494 last->next = first; 5499 last->next = first;
5495 } 5500 }
5496 5501
5497 #define SD_NODES_PER_DOMAIN 16 5502 #define SD_NODES_PER_DOMAIN 16
5498 5503
5499 /* 5504 /*
5500 * Self-tuning task migration cost measurement between source and target CPUs. 5505 * Self-tuning task migration cost measurement between source and target CPUs.
5501 * 5506 *
5502 * This is done by measuring the cost of manipulating buffers of varying 5507 * This is done by measuring the cost of manipulating buffers of varying
5503 * sizes. For a given buffer-size here are the steps that are taken: 5508 * sizes. For a given buffer-size here are the steps that are taken:
5504 * 5509 *
5505 * 1) the source CPU reads+dirties a shared buffer 5510 * 1) the source CPU reads+dirties a shared buffer
5506 * 2) the target CPU reads+dirties the same shared buffer 5511 * 2) the target CPU reads+dirties the same shared buffer
5507 * 5512 *
5508 * We measure how long they take, in the following 4 scenarios: 5513 * We measure how long they take, in the following 4 scenarios:
5509 * 5514 *
5510 * - source: CPU1, target: CPU2 | cost1 5515 * - source: CPU1, target: CPU2 | cost1
5511 * - source: CPU2, target: CPU1 | cost2 5516 * - source: CPU2, target: CPU1 | cost2
5512 * - source: CPU1, target: CPU1 | cost3 5517 * - source: CPU1, target: CPU1 | cost3
5513 * - source: CPU2, target: CPU2 | cost4 5518 * - source: CPU2, target: CPU2 | cost4
5514 * 5519 *
5515 * We then calculate the cost1+cost2-cost3-cost4 difference - this is 5520 * We then calculate the cost1+cost2-cost3-cost4 difference - this is
5516 * the cost of migration. 5521 * the cost of migration.
5517 * 5522 *
5518 * We then start off from a small buffer-size and iterate up to larger 5523 * We then start off from a small buffer-size and iterate up to larger
5519 * buffer sizes, in 5% steps - measuring each buffer-size separately, and 5524 * buffer sizes, in 5% steps - measuring each buffer-size separately, and
5520 * doing a maximum search for the cost. (The maximum cost for a migration 5525 * doing a maximum search for the cost. (The maximum cost for a migration
5521 * normally occurs when the working set size is around the effective cache 5526 * normally occurs when the working set size is around the effective cache
5522 * size.) 5527 * size.)
5523 */ 5528 */
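
Reading the scenario table together with measure_cost() below: the same-CPU runs (cost3, cost4) approximate the raw cost of touching the buffer, so subtracting them from the cross-CPU runs (cost1, cost2) leaves the cache penalty a migration pays. A hedged restatement in code, with names of my own choosing:

	/*
	 * Illustrative only - not part of the diff. measure_cost() below folds
	 * cost1+cost2 into one accumulator and cost3+cost4 into another before
	 * subtracting, which is the same arithmetic.
	 */
	static unsigned long long migration_penalty(unsigned long long cost1,
						    unsigned long long cost2,
						    unsigned long long cost3,
						    unsigned long long cost4)
	{
		/* cross-CPU total minus same-CPU total */
		return (cost1 + cost2) - (cost3 + cost4);
	}
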
5524 #define SEARCH_SCOPE 2 5529 #define SEARCH_SCOPE 2
5525 #define MIN_CACHE_SIZE (64*1024U) 5530 #define MIN_CACHE_SIZE (64*1024U)
5526 #define DEFAULT_CACHE_SIZE (5*1024*1024U) 5531 #define DEFAULT_CACHE_SIZE (5*1024*1024U)
5527 #define ITERATIONS 1 5532 #define ITERATIONS 1
5528 #define SIZE_THRESH 130 5533 #define SIZE_THRESH 130
5529 #define COST_THRESH 130 5534 #define COST_THRESH 130
5530 5535
5531 /* 5536 /*
5532 * The migration cost is a function of 'domain distance'. Domain 5537 * The migration cost is a function of 'domain distance'. Domain
5533 * distance is the number of steps a CPU has to iterate down its 5538 * distance is the number of steps a CPU has to iterate down its
5534 * domain tree to share a domain with the other CPU. The farther 5539 * domain tree to share a domain with the other CPU. The farther
5535 * two CPUs are from each other, the larger the distance gets. 5540 * two CPUs are from each other, the larger the distance gets.
5536 * 5541 *
5537 * Note that we use the distance only to cache measurement results, 5542 * Note that we use the distance only to cache measurement results,
5538 * the distance value is not used numerically otherwise. When two 5543 * the distance value is not used numerically otherwise. When two
5539 * CPUs have the same distance it is assumed that the migration 5544 * CPUs have the same distance it is assumed that the migration
5540 * cost is the same. (this is a simplification but quite practical) 5545 * cost is the same. (this is a simplification but quite practical)
5541 */ 5546 */
5542 #define MAX_DOMAIN_DISTANCE 32 5547 #define MAX_DOMAIN_DISTANCE 32
5543 5548
5544 static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = 5549 static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
5545 { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = 5550 { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
5546 /* 5551 /*
5547 * Architectures may override the migration cost and thus avoid 5552 * Architectures may override the migration cost and thus avoid
5548 * boot-time calibration. Unit is nanoseconds. Mostly useful for 5553 * boot-time calibration. Unit is nanoseconds. Mostly useful for
5549 * virtualized hardware: 5554 * virtualized hardware:
5550 */ 5555 */
5551 #ifdef CONFIG_DEFAULT_MIGRATION_COST 5556 #ifdef CONFIG_DEFAULT_MIGRATION_COST
5552 CONFIG_DEFAULT_MIGRATION_COST 5557 CONFIG_DEFAULT_MIGRATION_COST
5553 #else 5558 #else
5554 -1LL 5559 -1LL
5555 #endif 5560 #endif
5556 }; 5561 };
5557 5562
5558 /* 5563 /*
5559 * Allow override of migration cost - in units of microseconds. 5564 * Allow override of migration cost - in units of microseconds.
5560 * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost 5565 * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost
5561 * of 1 msec, level-2 cost of 2 msecs and level-3 cost of 3 msecs: 5566 * of 1 msec, level-2 cost of 2 msecs and level-3 cost of 3 msecs:
5562 */ 5567 */
5563 static int __init migration_cost_setup(char *str) 5568 static int __init migration_cost_setup(char *str)
5564 { 5569 {
5565 int ints[MAX_DOMAIN_DISTANCE+1], i; 5570 int ints[MAX_DOMAIN_DISTANCE+1], i;
5566 5571
5567 str = get_options(str, ARRAY_SIZE(ints), ints); 5572 str = get_options(str, ARRAY_SIZE(ints), ints);
5568 5573
5569 printk("#ints: %d\n", ints[0]); 5574 printk("#ints: %d\n", ints[0]);
5570 for (i = 1; i <= ints[0]; i++) { 5575 for (i = 1; i <= ints[0]; i++) {
5571 migration_cost[i-1] = (unsigned long long)ints[i]*1000; 5576 migration_cost[i-1] = (unsigned long long)ints[i]*1000;
5572 printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]); 5577 printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]);
5573 } 5578 }
5574 return 1; 5579 return 1;
5575 } 5580 }
5576 5581
5577 __setup ("migration_cost=", migration_cost_setup); 5582 __setup ("migration_cost=", migration_cost_setup);
5578 5583
5579 /* 5584 /*
5580 * Global multiplier (divisor) for migration-cutoff values, 5585 * Global multiplier (divisor) for migration-cutoff values,
5581 * in percent. E.g. use a value of 150 to get 1.5 times 5586 * in percent. E.g. use a value of 150 to get 1.5 times
5582 * longer cache-hot cutoff times. 5587 * longer cache-hot cutoff times.
5583 * 5588 *
5584 * (We scale it from 100 to 128 to make long long handling easier.) 5589 * (We scale it from 100 to 128 to make long long handling easier.)
5585 */ 5590 */
5586 5591
5587 #define MIGRATION_FACTOR_SCALE 128 5592 #define MIGRATION_FACTOR_SCALE 128
5588 5593
5589 static unsigned int migration_factor = MIGRATION_FACTOR_SCALE; 5594 static unsigned int migration_factor = MIGRATION_FACTOR_SCALE;
5590 5595
5591 static int __init setup_migration_factor(char *str) 5596 static int __init setup_migration_factor(char *str)
5592 { 5597 {
5593 get_option(&str, &migration_factor); 5598 get_option(&str, &migration_factor);
5594 migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100; 5599 migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100;
5595 return 1; 5600 return 1;
5596 } 5601 }
5597 5602
5598 __setup("migration_factor=", setup_migration_factor); 5603 __setup("migration_factor=", setup_migration_factor);
5599 5604
5600 /* 5605 /*
5601 * Estimated distance of two CPUs, measured via the number of domains 5606 * Estimated distance of two CPUs, measured via the number of domains
5602 * we have to pass for the two CPUs to be in the same span: 5607 * we have to pass for the two CPUs to be in the same span:
5603 */ 5608 */
5604 static unsigned long domain_distance(int cpu1, int cpu2) 5609 static unsigned long domain_distance(int cpu1, int cpu2)
5605 { 5610 {
5606 unsigned long distance = 0; 5611 unsigned long distance = 0;
5607 struct sched_domain *sd; 5612 struct sched_domain *sd;
5608 5613
5609 for_each_domain(cpu1, sd) { 5614 for_each_domain(cpu1, sd) {
5610 WARN_ON(!cpu_isset(cpu1, sd->span)); 5615 WARN_ON(!cpu_isset(cpu1, sd->span));
5611 if (cpu_isset(cpu2, sd->span)) 5616 if (cpu_isset(cpu2, sd->span))
5612 return distance; 5617 return distance;
5613 distance++; 5618 distance++;
5614 } 5619 }
5615 if (distance >= MAX_DOMAIN_DISTANCE) { 5620 if (distance >= MAX_DOMAIN_DISTANCE) {
5616 WARN_ON(1); 5621 WARN_ON(1);
5617 distance = MAX_DOMAIN_DISTANCE-1; 5622 distance = MAX_DOMAIN_DISTANCE-1;
5618 } 5623 }
5619 5624
5620 return distance; 5625 return distance;
5621 } 5626 }
5622 5627
5623 static unsigned int migration_debug; 5628 static unsigned int migration_debug;
5624 5629
5625 static int __init setup_migration_debug(char *str) 5630 static int __init setup_migration_debug(char *str)
5626 { 5631 {
5627 get_option(&str, &migration_debug); 5632 get_option(&str, &migration_debug);
5628 return 1; 5633 return 1;
5629 } 5634 }
5630 5635
5631 __setup("migration_debug=", setup_migration_debug); 5636 __setup("migration_debug=", setup_migration_debug);
5632 5637
5633 /* 5638 /*
5634 * Maximum cache-size that the scheduler should try to measure. 5639 * Maximum cache-size that the scheduler should try to measure.
5635 * Architectures with larger caches should tune this up during 5640 * Architectures with larger caches should tune this up during
5636 * bootup. Gets used in the domain-setup code (i.e. during SMP 5641 * bootup. Gets used in the domain-setup code (i.e. during SMP
5637 * bootup). 5642 * bootup).
5638 */ 5643 */
5639 unsigned int max_cache_size; 5644 unsigned int max_cache_size;
5640 5645
5641 static int __init setup_max_cache_size(char *str) 5646 static int __init setup_max_cache_size(char *str)
5642 { 5647 {
5643 get_option(&str, &max_cache_size); 5648 get_option(&str, &max_cache_size);
5644 return 1; 5649 return 1;
5645 } 5650 }
5646 5651
5647 __setup("max_cache_size=", setup_max_cache_size); 5652 __setup("max_cache_size=", setup_max_cache_size);
5648 5653
5649 /* 5654 /*
5650 * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This 5655 * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This
5651 * is the operation that is timed, so we try to generate unpredictable 5656 * is the operation that is timed, so we try to generate unpredictable
5652 * cachemisses that still end up filling the L2 cache: 5657 * cachemisses that still end up filling the L2 cache:
5653 */ 5658 */
5654 static void touch_cache(void *__cache, unsigned long __size) 5659 static void touch_cache(void *__cache, unsigned long __size)
5655 { 5660 {
5656 unsigned long size = __size / sizeof(long); 5661 unsigned long size = __size / sizeof(long);
5657 unsigned long chunk1 = size / 3; 5662 unsigned long chunk1 = size / 3;
5658 unsigned long chunk2 = 2 * size / 3; 5663 unsigned long chunk2 = 2 * size / 3;
5659 unsigned long *cache = __cache; 5664 unsigned long *cache = __cache;
5660 int i; 5665 int i;
5661 5666
5662 for (i = 0; i < size/6; i += 8) { 5667 for (i = 0; i < size/6; i += 8) {
5663 switch (i % 6) { 5668 switch (i % 6) {
5664 case 0: cache[i]++; 5669 case 0: cache[i]++;
5665 case 1: cache[size-1-i]++; 5670 case 1: cache[size-1-i]++;
5666 case 2: cache[chunk1-i]++; 5671 case 2: cache[chunk1-i]++;
5667 case 3: cache[chunk1+i]++; 5672 case 3: cache[chunk1+i]++;
5668 case 4: cache[chunk2-i]++; 5673 case 4: cache[chunk2-i]++;
5669 case 5: cache[chunk2+i]++; 5674 case 5: cache[chunk2+i]++;
5670 } 5675 }
5671 } 5676 }
5672 } 5677 }
5673 5678
5674 /* 5679 /*
5675 * Measure the cache-cost of one task migration. Returns in units of nsec. 5680 * Measure the cache-cost of one task migration. Returns in units of nsec.
5676 */ 5681 */
5677 static unsigned long long 5682 static unsigned long long
5678 measure_one(void *cache, unsigned long size, int source, int target) 5683 measure_one(void *cache, unsigned long size, int source, int target)
5679 { 5684 {
5680 cpumask_t mask, saved_mask; 5685 cpumask_t mask, saved_mask;
5681 unsigned long long t0, t1, t2, t3, cost; 5686 unsigned long long t0, t1, t2, t3, cost;
5682 5687
5683 saved_mask = current->cpus_allowed; 5688 saved_mask = current->cpus_allowed;
5684 5689
5685 /* 5690 /*
5686 * Flush source caches to RAM and invalidate them: 5691 * Flush source caches to RAM and invalidate them:
5687 */ 5692 */
5688 sched_cacheflush(); 5693 sched_cacheflush();
5689 5694
5690 /* 5695 /*
5691 * Migrate to the source CPU: 5696 * Migrate to the source CPU:
5692 */ 5697 */
5693 mask = cpumask_of_cpu(source); 5698 mask = cpumask_of_cpu(source);
5694 set_cpus_allowed(current, mask); 5699 set_cpus_allowed(current, mask);
5695 WARN_ON(smp_processor_id() != source); 5700 WARN_ON(smp_processor_id() != source);
5696 5701
5697 /* 5702 /*
5698 * Dirty the working set: 5703 * Dirty the working set:
5699 */ 5704 */
5700 t0 = sched_clock(); 5705 t0 = sched_clock();
5701 touch_cache(cache, size); 5706 touch_cache(cache, size);
5702 t1 = sched_clock(); 5707 t1 = sched_clock();
5703 5708
5704 /* 5709 /*
5705 * Migrate to the target CPU, dirty the L2 cache and access 5710 * Migrate to the target CPU, dirty the L2 cache and access
5706 * the shared buffer. (which represents the working set 5711 * the shared buffer. (which represents the working set
5707 * of a migrated task.) 5712 * of a migrated task.)
5708 */ 5713 */
5709 mask = cpumask_of_cpu(target); 5714 mask = cpumask_of_cpu(target);
5710 set_cpus_allowed(current, mask); 5715 set_cpus_allowed(current, mask);
5711 WARN_ON(smp_processor_id() != target); 5716 WARN_ON(smp_processor_id() != target);
5712 5717
5713 t2 = sched_clock(); 5718 t2 = sched_clock();
5714 touch_cache(cache, size); 5719 touch_cache(cache, size);
5715 t3 = sched_clock(); 5720 t3 = sched_clock();
5716 5721
5717 cost = t1-t0 + t3-t2; 5722 cost = t1-t0 + t3-t2;
5718 5723
5719 if (migration_debug >= 2) 5724 if (migration_debug >= 2)
5720 printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n", 5725 printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n",
5721 source, target, t1-t0, t1-t0, t3-t2, cost); 5726 source, target, t1-t0, t1-t0, t3-t2, cost);
5722 /* 5727 /*
5723 * Flush target caches to RAM and invalidate them: 5728 * Flush target caches to RAM and invalidate them:
5724 */ 5729 */
5725 sched_cacheflush(); 5730 sched_cacheflush();
5726 5731
5727 set_cpus_allowed(current, saved_mask); 5732 set_cpus_allowed(current, saved_mask);
5728 5733
5729 return cost; 5734 return cost;
5730 } 5735 }
5731 5736
5732 /* 5737 /*
5733 * Measure a series of task migrations and return the average 5738 * Measure a series of task migrations and return the average
5734 * result. Since this code runs early during bootup the system 5739 * result. Since this code runs early during bootup the system
5735 * is 'undisturbed' and the average latency makes sense. 5740 * is 'undisturbed' and the average latency makes sense.
5736 * 5741 *
5737 * The algorithm in essence auto-detects the relevant cache-size, 5742 * The algorithm in essence auto-detects the relevant cache-size,
5738 * so it will properly detect different cachesizes for different 5743 * so it will properly detect different cachesizes for different
5739 * cache-hierarchies, depending on how the CPUs are connected. 5744 * cache-hierarchies, depending on how the CPUs are connected.
5740 * 5745 *
5741 * Architectures can prime the upper limit of the search range via 5746 * Architectures can prime the upper limit of the search range via
5742 * max_cache_size, otherwise the search range defaults to 20MB...64K. 5747 * max_cache_size, otherwise the search range defaults to 20MB...64K.
5743 */ 5748 */
5744 static unsigned long long 5749 static unsigned long long
5745 measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) 5750 measure_cost(int cpu1, int cpu2, void *cache, unsigned int size)
5746 { 5751 {
5747 unsigned long long cost1, cost2; 5752 unsigned long long cost1, cost2;
5748 int i; 5753 int i;
5749 5754
5750 /* 5755 /*
5751 * Measure the migration cost of 'size' bytes, over an 5756 * Measure the migration cost of 'size' bytes, over an
5752 * average of 10 runs: 5757 * average of 10 runs:
5753 * 5758 *
5754 * (We perturb the cache size by a small (0..4k) 5759 * (We perturb the cache size by a small (0..4k)
5755 * value to compensate size/alignment related artifacts. 5760 * value to compensate size/alignment related artifacts.
5756 * We also subtract the cost of the operation done on 5761 * We also subtract the cost of the operation done on
5757 * the same CPU.) 5762 * the same CPU.)
5758 */ 5763 */
5759 cost1 = 0; 5764 cost1 = 0;
5760 5765
5761 /* 5766 /*
5762 * dry run, to make sure we start off cache-cold on cpu1, 5767 * dry run, to make sure we start off cache-cold on cpu1,
5763 * and to get any vmalloc pagefaults in advance: 5768 * and to get any vmalloc pagefaults in advance:
5764 */ 5769 */
5765 measure_one(cache, size, cpu1, cpu2); 5770 measure_one(cache, size, cpu1, cpu2);
5766 for (i = 0; i < ITERATIONS; i++) 5771 for (i = 0; i < ITERATIONS; i++)
5767 cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2); 5772 cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2);
5768 5773
5769 measure_one(cache, size, cpu2, cpu1); 5774 measure_one(cache, size, cpu2, cpu1);
5770 for (i = 0; i < ITERATIONS; i++) 5775 for (i = 0; i < ITERATIONS; i++)
5771 cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1); 5776 cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1);
5772 5777
5773 /* 5778 /*
5774 * (We measure the non-migrating [cached] cost on both 5779 * (We measure the non-migrating [cached] cost on both
5775 * cpu1 and cpu2, to handle CPUs with different speeds) 5780 * cpu1 and cpu2, to handle CPUs with different speeds)
5776 */ 5781 */
5777 cost2 = 0; 5782 cost2 = 0;
5778 5783
5779 measure_one(cache, size, cpu1, cpu1); 5784 measure_one(cache, size, cpu1, cpu1);
5780 for (i = 0; i < ITERATIONS; i++) 5785 for (i = 0; i < ITERATIONS; i++)
5781 cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1); 5786 cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1);
5782 5787
5783 measure_one(cache, size, cpu2, cpu2); 5788 measure_one(cache, size, cpu2, cpu2);
5784 for (i = 0; i < ITERATIONS; i++) 5789 for (i = 0; i < ITERATIONS; i++)
5785 cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2); 5790 cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2);
5786 5791
5787 /* 5792 /*
5788 * Get the per-iteration migration cost: 5793 * Get the per-iteration migration cost:
5789 */ 5794 */
5790 do_div(cost1, 2 * ITERATIONS); 5795 do_div(cost1, 2 * ITERATIONS);
5791 do_div(cost2, 2 * ITERATIONS); 5796 do_div(cost2, 2 * ITERATIONS);
5792 5797
5793 return cost1 - cost2; 5798 return cost1 - cost2;
5794 } 5799 }
5795 5800
5796 static unsigned long long measure_migration_cost(int cpu1, int cpu2) 5801 static unsigned long long measure_migration_cost(int cpu1, int cpu2)
5797 { 5802 {
5798 unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0; 5803 unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0;
5799 unsigned int max_size, size, size_found = 0; 5804 unsigned int max_size, size, size_found = 0;
5800 long long cost = 0, prev_cost; 5805 long long cost = 0, prev_cost;
5801 void *cache; 5806 void *cache;
5802 5807
5803 /* 5808 /*
5804 * Search from max_cache_size*5 down to 64K - the real relevant 5809 * Search from max_cache_size*5 down to 64K - the real relevant
5805 * cachesize has to lie somewhere in between. 5810 * cachesize has to lie somewhere in between.
5806 */ 5811 */
5807 if (max_cache_size) { 5812 if (max_cache_size) {
5808 max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE); 5813 max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE);
5809 size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE); 5814 size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE);
5810 } else { 5815 } else {
5811 /* 5816 /*
5812 * Since we have no estimation about the relevant 5817 * Since we have no estimation about the relevant
5813 * search range 5818 * search range
5814 */ 5819 */
5815 max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE; 5820 max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE;
5816 size = MIN_CACHE_SIZE; 5821 size = MIN_CACHE_SIZE;
5817 } 5822 }
5818 5823
5819 if (!cpu_online(cpu1) || !cpu_online(cpu2)) { 5824 if (!cpu_online(cpu1) || !cpu_online(cpu2)) {
5820 printk("cpu %d and %d not both online!\n", cpu1, cpu2); 5825 printk("cpu %d and %d not both online!\n", cpu1, cpu2);
5821 return 0; 5826 return 0;
5822 } 5827 }
5823 5828
5824 /* 5829 /*
5825 * Allocate the working set: 5830 * Allocate the working set:
5826 */ 5831 */
5827 cache = vmalloc(max_size); 5832 cache = vmalloc(max_size);
5828 if (!cache) { 5833 if (!cache) {
5829 printk("could not vmalloc %d bytes for cache!\n", 2 * max_size); 5834 printk("could not vmalloc %d bytes for cache!\n", 2 * max_size);
5830 return 1000000; /* return 1 msec on very small boxen */ 5835 return 1000000; /* return 1 msec on very small boxen */
5831 } 5836 }
5832 5837
5833 while (size <= max_size) { 5838 while (size <= max_size) {
5834 prev_cost = cost; 5839 prev_cost = cost;
5835 cost = measure_cost(cpu1, cpu2, cache, size); 5840 cost = measure_cost(cpu1, cpu2, cache, size);
5836 5841
5837 /* 5842 /*
5838 * Update the max: 5843 * Update the max:
5839 */ 5844 */
5840 if (cost > 0) { 5845 if (cost > 0) {
5841 if (max_cost < cost) { 5846 if (max_cost < cost) {
5842 max_cost = cost; 5847 max_cost = cost;
5843 size_found = size; 5848 size_found = size;
5844 } 5849 }
5845 } 5850 }
5846 /* 5851 /*
5847 * Calculate average fluctuation, we use this to prevent 5852 * Calculate average fluctuation, we use this to prevent
5848 * noise from triggering an early break out of the loop: 5853 * noise from triggering an early break out of the loop:
5849 */ 5854 */
5850 fluct = abs(cost - prev_cost); 5855 fluct = abs(cost - prev_cost);
5851 avg_fluct = (avg_fluct + fluct)/2; 5856 avg_fluct = (avg_fluct + fluct)/2;
5852 5857
5853 if (migration_debug) 5858 if (migration_debug)
5854 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): " 5859 printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): "
5855 "(%8Ld %8Ld)\n", 5860 "(%8Ld %8Ld)\n",
5856 cpu1, cpu2, size, 5861 cpu1, cpu2, size,
5857 (long)cost / 1000000, 5862 (long)cost / 1000000,
5858 ((long)cost / 100000) % 10, 5863 ((long)cost / 100000) % 10,
5859 (long)max_cost / 1000000, 5864 (long)max_cost / 1000000,
5860 ((long)max_cost / 100000) % 10, 5865 ((long)max_cost / 100000) % 10,
5861 domain_distance(cpu1, cpu2), 5866 domain_distance(cpu1, cpu2),
5862 cost, avg_fluct); 5867 cost, avg_fluct);
5863 5868
5864 /* 5869 /*
5865 * If we iterated at least 20% past the previous maximum, 5870 * If we iterated at least 20% past the previous maximum,
5866 * and the cost has dropped by more than 20% already, 5871 * and the cost has dropped by more than 20% already,
5867 * (taking fluctuations into account) then we assume to 5872 * (taking fluctuations into account) then we assume to
5868 * have found the maximum and break out of the loop early: 5873 * have found the maximum and break out of the loop early:
5869 */ 5874 */
5870 if (size_found && (size*100 > size_found*SIZE_THRESH)) 5875 if (size_found && (size*100 > size_found*SIZE_THRESH))
5871 if (cost+avg_fluct <= 0 || 5876 if (cost+avg_fluct <= 0 ||
5872 max_cost*100 > (cost+avg_fluct)*COST_THRESH) { 5877 max_cost*100 > (cost+avg_fluct)*COST_THRESH) {
5873 5878
5874 if (migration_debug) 5879 if (migration_debug)
5875 printk("-> found max.\n"); 5880 printk("-> found max.\n");
5876 break; 5881 break;
5877 } 5882 }
5878 /* 5883 /*
5879 * Increase the cachesize in 10% steps: 5884 * Increase the cachesize in 10% steps:
5880 */ 5885 */
5881 size = size * 10 / 9; 5886 size = size * 10 / 9;
5882 } 5887 }
5883 5888
5884 if (migration_debug) 5889 if (migration_debug)
5885 printk("[%d][%d] working set size found: %d, cost: %Ld\n", 5890 printk("[%d][%d] working set size found: %d, cost: %Ld\n",
5886 cpu1, cpu2, size_found, max_cost); 5891 cpu1, cpu2, size_found, max_cost);
5887 5892
5888 vfree(cache); 5893 vfree(cache);
5889 5894
5890 /* 5895 /*
5891 * A task is considered 'cache cold' if at least 2 times 5896 * A task is considered 'cache cold' if at least 2 times
5892 * the worst-case cost of migration has passed. 5897 * the worst-case cost of migration has passed.
5893 * 5898 *
5894 * (this limit is only listened to if the load-balancing 5899 * (this limit is only listened to if the load-balancing
5895 * situation is 'nice' - if there is a large imbalance we 5900 * situation is 'nice' - if there is a large imbalance we
5896 * ignore it for the sake of CPU utilization and 5901 * ignore it for the sake of CPU utilization and
5897 * processing fairness.) 5902 * processing fairness.)
5898 */ 5903 */
5899 return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE; 5904 return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE;
5900 } 5905 }
5901 5906
5902 static void calibrate_migration_costs(const cpumask_t *cpu_map) 5907 static void calibrate_migration_costs(const cpumask_t *cpu_map)
5903 { 5908 {
5904 int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id(); 5909 int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id();
5905 unsigned long j0, j1, distance, max_distance = 0; 5910 unsigned long j0, j1, distance, max_distance = 0;
5906 struct sched_domain *sd; 5911 struct sched_domain *sd;
5907 5912
5908 j0 = jiffies; 5913 j0 = jiffies;
5909 5914
5910 /* 5915 /*
5911 * First pass - calculate the cacheflush times: 5916 * First pass - calculate the cacheflush times:
5912 */ 5917 */
5913 for_each_cpu_mask(cpu1, *cpu_map) { 5918 for_each_cpu_mask(cpu1, *cpu_map) {
5914 for_each_cpu_mask(cpu2, *cpu_map) { 5919 for_each_cpu_mask(cpu2, *cpu_map) {
5915 if (cpu1 == cpu2) 5920 if (cpu1 == cpu2)
5916 continue; 5921 continue;
5917 distance = domain_distance(cpu1, cpu2); 5922 distance = domain_distance(cpu1, cpu2);
5918 max_distance = max(max_distance, distance); 5923 max_distance = max(max_distance, distance);
5919 /* 5924 /*
5920 * No result cached yet? 5925 * No result cached yet?
5921 */ 5926 */
5922 if (migration_cost[distance] == -1LL) 5927 if (migration_cost[distance] == -1LL)
5923 migration_cost[distance] = 5928 migration_cost[distance] =
5924 measure_migration_cost(cpu1, cpu2); 5929 measure_migration_cost(cpu1, cpu2);
5925 } 5930 }
5926 } 5931 }
5927 /* 5932 /*
5928 * Second pass - update the sched domain hierarchy with 5933 * Second pass - update the sched domain hierarchy with
5929 * the new cache-hot-time estimations: 5934 * the new cache-hot-time estimations:
5930 */ 5935 */
5931 for_each_cpu_mask(cpu, *cpu_map) { 5936 for_each_cpu_mask(cpu, *cpu_map) {
5932 distance = 0; 5937 distance = 0;
5933 for_each_domain(cpu, sd) { 5938 for_each_domain(cpu, sd) {
5934 sd->cache_hot_time = migration_cost[distance]; 5939 sd->cache_hot_time = migration_cost[distance];
5935 distance++; 5940 distance++;
5936 } 5941 }
5937 } 5942 }
5938 /* 5943 /*
5939 * Print the matrix: 5944 * Print the matrix:
5940 */ 5945 */
5941 if (migration_debug) 5946 if (migration_debug)
5942 printk("migration: max_cache_size: %d, cpu: %d MHz:\n", 5947 printk("migration: max_cache_size: %d, cpu: %d MHz:\n",
5943 max_cache_size, 5948 max_cache_size,
5944 #ifdef CONFIG_X86 5949 #ifdef CONFIG_X86
5945 cpu_khz/1000 5950 cpu_khz/1000
5946 #else 5951 #else
5947 -1 5952 -1
5948 #endif 5953 #endif
5949 ); 5954 );
5950 if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) { 5955 if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) {
5951 printk("migration_cost="); 5956 printk("migration_cost=");
5952 for (distance = 0; distance <= max_distance; distance++) { 5957 for (distance = 0; distance <= max_distance; distance++) {
5953 if (distance) 5958 if (distance)
5954 printk(","); 5959 printk(",");
5955 printk("%ld", (long)migration_cost[distance] / 1000); 5960 printk("%ld", (long)migration_cost[distance] / 1000);
5956 } 5961 }
5957 printk("\n"); 5962 printk("\n");
5958 } 5963 }
5959 j1 = jiffies; 5964 j1 = jiffies;
5960 if (migration_debug) 5965 if (migration_debug)
5961 printk("migration: %ld seconds\n", (j1-j0) / HZ); 5966 printk("migration: %ld seconds\n", (j1-j0) / HZ);
5962 5967
5963 /* 5968 /*
5964 * Move back to the original CPU. NUMA-Q gets confused 5969 * Move back to the original CPU. NUMA-Q gets confused
5965 * if we migrate to another quad during bootup. 5970 * if we migrate to another quad during bootup.
5966 */ 5971 */
5967 if (raw_smp_processor_id() != orig_cpu) { 5972 if (raw_smp_processor_id() != orig_cpu) {
5968 cpumask_t mask = cpumask_of_cpu(orig_cpu), 5973 cpumask_t mask = cpumask_of_cpu(orig_cpu),
5969 saved_mask = current->cpus_allowed; 5974 saved_mask = current->cpus_allowed;
5970 5975
5971 set_cpus_allowed(current, mask); 5976 set_cpus_allowed(current, mask);
5972 set_cpus_allowed(current, saved_mask); 5977 set_cpus_allowed(current, saved_mask);
5973 } 5978 }
5974 } 5979 }
5975 5980
5976 #ifdef CONFIG_NUMA 5981 #ifdef CONFIG_NUMA
5977 5982
5978 /** 5983 /**
5979 * find_next_best_node - find the next node to include in a sched_domain 5984 * find_next_best_node - find the next node to include in a sched_domain
5980 * @node: node whose sched_domain we're building 5985 * @node: node whose sched_domain we're building
5981 * @used_nodes: nodes already in the sched_domain 5986 * @used_nodes: nodes already in the sched_domain
5982 * 5987 *
5983 * Find the next node to include in a given scheduling domain. Simply 5988 * Find the next node to include in a given scheduling domain. Simply
5984 * finds the closest node not already in the @used_nodes map. 5989 * finds the closest node not already in the @used_nodes map.
5985 * 5990 *
5986 * Should use nodemask_t. 5991 * Should use nodemask_t.
5987 */ 5992 */
5988 static int find_next_best_node(int node, unsigned long *used_nodes) 5993 static int find_next_best_node(int node, unsigned long *used_nodes)
5989 { 5994 {
5990 int i, n, val, min_val, best_node = 0; 5995 int i, n, val, min_val, best_node = 0;
5991 5996
5992 min_val = INT_MAX; 5997 min_val = INT_MAX;
5993 5998
5994 for (i = 0; i < MAX_NUMNODES; i++) { 5999 for (i = 0; i < MAX_NUMNODES; i++) {
5995 /* Start at @node */ 6000 /* Start at @node */
5996 n = (node + i) % MAX_NUMNODES; 6001 n = (node + i) % MAX_NUMNODES;
5997 6002
5998 if (!nr_cpus_node(n)) 6003 if (!nr_cpus_node(n))
5999 continue; 6004 continue;
6000 6005
6001 /* Skip already used nodes */ 6006 /* Skip already used nodes */
6002 if (test_bit(n, used_nodes)) 6007 if (test_bit(n, used_nodes))
6003 continue; 6008 continue;
6004 6009
6005 /* Simple min distance search */ 6010 /* Simple min distance search */
6006 val = node_distance(node, n); 6011 val = node_distance(node, n);
6007 6012
6008 if (val < min_val) { 6013 if (val < min_val) {
6009 min_val = val; 6014 min_val = val;
6010 best_node = n; 6015 best_node = n;
6011 } 6016 }
6012 } 6017 }
6013 6018
6014 set_bit(best_node, used_nodes); 6019 set_bit(best_node, used_nodes);
6015 return best_node; 6020 return best_node;
6016 } 6021 }
6017 6022
6018 /** 6023 /**
6019 * sched_domain_node_span - get a cpumask for a node's sched_domain 6024 * sched_domain_node_span - get a cpumask for a node's sched_domain
6020 * @node: node whose cpumask we're constructing 6025 * @node: node whose cpumask we're constructing
6021 * @size: number of nodes to include in this span 6026 * @size: number of nodes to include in this span
6022 * 6027 *
6023 * Given a node, construct a good cpumask for its sched_domain to span. It 6028 * Given a node, construct a good cpumask for its sched_domain to span. It
6024 * should be one that prevents unnecessary balancing, but also spreads tasks 6029 * should be one that prevents unnecessary balancing, but also spreads tasks
6025 * out optimally. 6030 * out optimally.
6026 */ 6031 */
6027 static cpumask_t sched_domain_node_span(int node) 6032 static cpumask_t sched_domain_node_span(int node)
6028 { 6033 {
6029 DECLARE_BITMAP(used_nodes, MAX_NUMNODES); 6034 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
6030 cpumask_t span, nodemask; 6035 cpumask_t span, nodemask;
6031 int i; 6036 int i;
6032 6037
6033 cpus_clear(span); 6038 cpus_clear(span);
6034 bitmap_zero(used_nodes, MAX_NUMNODES); 6039 bitmap_zero(used_nodes, MAX_NUMNODES);
6035 6040
6036 nodemask = node_to_cpumask(node); 6041 nodemask = node_to_cpumask(node);
6037 cpus_or(span, span, nodemask); 6042 cpus_or(span, span, nodemask);
6038 set_bit(node, used_nodes); 6043 set_bit(node, used_nodes);
6039 6044
6040 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6045 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6041 int next_node = find_next_best_node(node, used_nodes); 6046 int next_node = find_next_best_node(node, used_nodes);
6042 6047
6043 nodemask = node_to_cpumask(next_node); 6048 nodemask = node_to_cpumask(next_node);
6044 cpus_or(span, span, nodemask); 6049 cpus_or(span, span, nodemask);
6045 } 6050 }
6046 6051
6047 return span; 6052 return span;
6048 } 6053 }
6049 #endif 6054 #endif
6050 6055
6051 int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6056 int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6052 6057
6053 /* 6058 /*
6054 * SMT sched-domains: 6059 * SMT sched-domains:
6055 */ 6060 */
6056 #ifdef CONFIG_SCHED_SMT 6061 #ifdef CONFIG_SCHED_SMT
6057 static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 6062 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
6058 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); 6063 static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
6059 6064
6060 static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, 6065 static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
6061 struct sched_group **sg) 6066 struct sched_group **sg)
6062 { 6067 {
6063 if (sg) 6068 if (sg)
6064 *sg = &per_cpu(sched_group_cpus, cpu); 6069 *sg = &per_cpu(sched_group_cpus, cpu);
6065 return cpu; 6070 return cpu;
6066 } 6071 }
6067 #endif 6072 #endif
6068 6073
6069 /* 6074 /*
6070 * multi-core sched-domains: 6075 * multi-core sched-domains:
6071 */ 6076 */
6072 #ifdef CONFIG_SCHED_MC 6077 #ifdef CONFIG_SCHED_MC
6073 static DEFINE_PER_CPU(struct sched_domain, core_domains); 6078 static DEFINE_PER_CPU(struct sched_domain, core_domains);
6074 static DEFINE_PER_CPU(struct sched_group, sched_group_core); 6079 static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6075 #endif 6080 #endif
6076 6081
6077 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6082 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6078 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, 6083 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
6079 struct sched_group **sg) 6084 struct sched_group **sg)
6080 { 6085 {
6081 int group; 6086 int group;
6082 cpumask_t mask = cpu_sibling_map[cpu]; 6087 cpumask_t mask = cpu_sibling_map[cpu];
6083 cpus_and(mask, mask, *cpu_map); 6088 cpus_and(mask, mask, *cpu_map);
6084 group = first_cpu(mask); 6089 group = first_cpu(mask);
6085 if (sg) 6090 if (sg)
6086 *sg = &per_cpu(sched_group_core, group); 6091 *sg = &per_cpu(sched_group_core, group);
6087 return group; 6092 return group;
6088 } 6093 }
6089 #elif defined(CONFIG_SCHED_MC) 6094 #elif defined(CONFIG_SCHED_MC)
6090 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, 6095 static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
6091 struct sched_group **sg) 6096 struct sched_group **sg)
6092 { 6097 {
6093 if (sg) 6098 if (sg)
6094 *sg = &per_cpu(sched_group_core, cpu); 6099 *sg = &per_cpu(sched_group_core, cpu);
6095 return cpu; 6100 return cpu;
6096 } 6101 }
6097 #endif 6102 #endif
6098 6103
6099 static DEFINE_PER_CPU(struct sched_domain, phys_domains); 6104 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
6100 static DEFINE_PER_CPU(struct sched_group, sched_group_phys); 6105 static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
6101 6106
6102 static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, 6107 static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
6103 struct sched_group **sg) 6108 struct sched_group **sg)
6104 { 6109 {
6105 int group; 6110 int group;
6106 #ifdef CONFIG_SCHED_MC 6111 #ifdef CONFIG_SCHED_MC
6107 cpumask_t mask = cpu_coregroup_map(cpu); 6112 cpumask_t mask = cpu_coregroup_map(cpu);
6108 cpus_and(mask, mask, *cpu_map); 6113 cpus_and(mask, mask, *cpu_map);
6109 group = first_cpu(mask); 6114 group = first_cpu(mask);
6110 #elif defined(CONFIG_SCHED_SMT) 6115 #elif defined(CONFIG_SCHED_SMT)
6111 cpumask_t mask = cpu_sibling_map[cpu]; 6116 cpumask_t mask = cpu_sibling_map[cpu];
6112 cpus_and(mask, mask, *cpu_map); 6117 cpus_and(mask, mask, *cpu_map);
6113 group = first_cpu(mask); 6118 group = first_cpu(mask);
6114 #else 6119 #else
6115 group = cpu; 6120 group = cpu;
6116 #endif 6121 #endif
6117 if (sg) 6122 if (sg)
6118 *sg = &per_cpu(sched_group_phys, group); 6123 *sg = &per_cpu(sched_group_phys, group);
6119 return group; 6124 return group;
6120 } 6125 }
6121 6126
6122 #ifdef CONFIG_NUMA 6127 #ifdef CONFIG_NUMA
6123 /* 6128 /*
6124 * The init_sched_build_groups can't handle what we want to do with node 6129 * The init_sched_build_groups can't handle what we want to do with node
6125 * groups, so roll our own. Now each node has its own list of groups which 6130 * groups, so roll our own. Now each node has its own list of groups which
6126 * gets dynamically allocated. 6131 * gets dynamically allocated.
6127 */ 6132 */
6128 static DEFINE_PER_CPU(struct sched_domain, node_domains); 6133 static DEFINE_PER_CPU(struct sched_domain, node_domains);
6129 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; 6134 static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
6130 6135
6131 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); 6136 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
6132 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); 6137 static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
6133 6138
6134 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, 6139 static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
6135 struct sched_group **sg) 6140 struct sched_group **sg)
6136 { 6141 {
6137 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); 6142 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
6138 int group; 6143 int group;
6139 6144
6140 cpus_and(nodemask, nodemask, *cpu_map); 6145 cpus_and(nodemask, nodemask, *cpu_map);
6141 group = first_cpu(nodemask); 6146 group = first_cpu(nodemask);
6142 6147
6143 if (sg) 6148 if (sg)
6144 *sg = &per_cpu(sched_group_allnodes, group); 6149 *sg = &per_cpu(sched_group_allnodes, group);
6145 return group; 6150 return group;
6146 } 6151 }
6147 6152
6148 static void init_numa_sched_groups_power(struct sched_group *group_head) 6153 static void init_numa_sched_groups_power(struct sched_group *group_head)
6149 { 6154 {
6150 struct sched_group *sg = group_head; 6155 struct sched_group *sg = group_head;
6151 int j; 6156 int j;
6152 6157
6153 if (!sg) 6158 if (!sg)
6154 return; 6159 return;
6155 next_sg: 6160 next_sg:
6156 for_each_cpu_mask(j, sg->cpumask) { 6161 for_each_cpu_mask(j, sg->cpumask) {
6157 struct sched_domain *sd; 6162 struct sched_domain *sd;
6158 6163
6159 sd = &per_cpu(phys_domains, j); 6164 sd = &per_cpu(phys_domains, j);
6160 if (j != first_cpu(sd->groups->cpumask)) { 6165 if (j != first_cpu(sd->groups->cpumask)) {
6161 /* 6166 /*
6162 * Only add "power" once for each 6167 * Only add "power" once for each
6163 * physical package. 6168 * physical package.
6164 */ 6169 */
6165 continue; 6170 continue;
6166 } 6171 }
6167 6172
6168 sg->cpu_power += sd->groups->cpu_power; 6173 sg->cpu_power += sd->groups->cpu_power;
6169 } 6174 }
6170 sg = sg->next; 6175 sg = sg->next;
6171 if (sg != group_head) 6176 if (sg != group_head)
6172 goto next_sg; 6177 goto next_sg;
6173 } 6178 }
6174 #endif 6179 #endif
6175 6180
6176 #ifdef CONFIG_NUMA 6181 #ifdef CONFIG_NUMA
6177 /* Free memory allocated for various sched_group structures */ 6182 /* Free memory allocated for various sched_group structures */
6178 static void free_sched_groups(const cpumask_t *cpu_map) 6183 static void free_sched_groups(const cpumask_t *cpu_map)
6179 { 6184 {
6180 int cpu, i; 6185 int cpu, i;
6181 6186
6182 for_each_cpu_mask(cpu, *cpu_map) { 6187 for_each_cpu_mask(cpu, *cpu_map) {
6183 struct sched_group **sched_group_nodes 6188 struct sched_group **sched_group_nodes
6184 = sched_group_nodes_bycpu[cpu]; 6189 = sched_group_nodes_bycpu[cpu];
6185 6190
6186 if (!sched_group_nodes) 6191 if (!sched_group_nodes)
6187 continue; 6192 continue;
6188 6193
6189 for (i = 0; i < MAX_NUMNODES; i++) { 6194 for (i = 0; i < MAX_NUMNODES; i++) {
6190 cpumask_t nodemask = node_to_cpumask(i); 6195 cpumask_t nodemask = node_to_cpumask(i);
6191 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 6196 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6192 6197
6193 cpus_and(nodemask, nodemask, *cpu_map); 6198 cpus_and(nodemask, nodemask, *cpu_map);
6194 if (cpus_empty(nodemask)) 6199 if (cpus_empty(nodemask))
6195 continue; 6200 continue;
6196 6201
6197 if (sg == NULL) 6202 if (sg == NULL)
6198 continue; 6203 continue;
6199 sg = sg->next; 6204 sg = sg->next;
6200 next_sg: 6205 next_sg:
6201 oldsg = sg; 6206 oldsg = sg;
6202 sg = sg->next; 6207 sg = sg->next;
6203 kfree(oldsg); 6208 kfree(oldsg);
6204 if (oldsg != sched_group_nodes[i]) 6209 if (oldsg != sched_group_nodes[i])
6205 goto next_sg; 6210 goto next_sg;
6206 } 6211 }
6207 kfree(sched_group_nodes); 6212 kfree(sched_group_nodes);
6208 sched_group_nodes_bycpu[cpu] = NULL; 6213 sched_group_nodes_bycpu[cpu] = NULL;
6209 } 6214 }
6210 } 6215 }
6211 #else 6216 #else
6212 static void free_sched_groups(const cpumask_t *cpu_map) 6217 static void free_sched_groups(const cpumask_t *cpu_map)
6213 { 6218 {
6214 } 6219 }
6215 #endif 6220 #endif
6216 6221
6217 /* 6222 /*
6218 * Initialize sched groups cpu_power. 6223 * Initialize sched groups cpu_power.
6219 * 6224 *
6220 * cpu_power indicates the capacity of sched group, which is used while 6225 * cpu_power indicates the capacity of sched group, which is used while
6221 * distributing the load between different sched groups in a sched domain. 6226 * distributing the load between different sched groups in a sched domain.
6222 * Typically, cpu_power for all the groups in a sched domain will be the same 6227 * Typically, cpu_power for all the groups in a sched domain will be the same
6223 * unless there are asymmetries in the topology. If there are asymmetries, a 6228 * unless there are asymmetries in the topology. If there are asymmetries, a
6224 * group with more cpu_power will pick up more load than a group with 6229 * group with more cpu_power will pick up more load than a group with
6225 * less cpu_power. 6230 * less cpu_power.
6226 * 6231 *
6227 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents 6232 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
6228 * the maximum number of tasks a group can handle in the presence of other idle 6233 * the maximum number of tasks a group can handle in the presence of other idle
6229 * or lightly loaded groups in the same sched domain. 6234 * or lightly loaded groups in the same sched domain.
6230 */ 6235 */
6231 static void init_sched_groups_power(int cpu, struct sched_domain *sd) 6236 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6232 { 6237 {
6233 struct sched_domain *child; 6238 struct sched_domain *child;
6234 struct sched_group *group; 6239 struct sched_group *group;
6235 6240
6236 WARN_ON(!sd || !sd->groups); 6241 WARN_ON(!sd || !sd->groups);
6237 6242
6238 if (cpu != first_cpu(sd->groups->cpumask)) 6243 if (cpu != first_cpu(sd->groups->cpumask))
6239 return; 6244 return;
6240 6245
6241 child = sd->child; 6246 child = sd->child;
6242 6247
6243 /* 6248 /*
6244 * For perf policy, if the groups in child domain share resources 6249 * For perf policy, if the groups in child domain share resources
6245 * (for example cores sharing some portions of the cache hierarchy 6250 * (for example cores sharing some portions of the cache hierarchy
6246 * or SMT), then set this domain's groups' cpu_power such that each group 6251 * or SMT), then set this domain's groups' cpu_power such that each group
6247 * can handle only one task, when there are other idle groups in the 6252 * can handle only one task, when there are other idle groups in the
6248 * same sched domain. 6253 * same sched domain.
6249 */ 6254 */
6250 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && 6255 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
6251 (child->flags & 6256 (child->flags &
6252 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { 6257 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
6253 sd->groups->cpu_power = SCHED_LOAD_SCALE; 6258 sd->groups->cpu_power = SCHED_LOAD_SCALE;
6254 return; 6259 return;
6255 } 6260 }
6256 6261
6257 sd->groups->cpu_power = 0; 6262 sd->groups->cpu_power = 0;
6258 6263
6259 /* 6264 /*
6260 * add the cpu_power of each child group to this group's cpu_power 6265 * add the cpu_power of each child group to this group's cpu_power
6261 */ 6266 */
6262 group = child->groups; 6267 group = child->groups;
6263 do { 6268 do {
6264 sd->groups->cpu_power += group->cpu_power; 6269 sd->groups->cpu_power += group->cpu_power;
6265 group = group->next; 6270 group = group->next;
6266 } while (group != child->groups); 6271 } while (group != child->groups);
6267 } 6272 }
6268 6273
6269 /* 6274 /*
6270 * Build sched domains for a given set of cpus and attach the sched domains 6275 * Build sched domains for a given set of cpus and attach the sched domains
6271 * to the individual cpus 6276 * to the individual cpus
6272 */ 6277 */
6273 static int build_sched_domains(const cpumask_t *cpu_map) 6278 static int build_sched_domains(const cpumask_t *cpu_map)
6274 { 6279 {
6275 int i; 6280 int i;
6276 struct sched_domain *sd; 6281 struct sched_domain *sd;
6277 #ifdef CONFIG_NUMA 6282 #ifdef CONFIG_NUMA
6278 struct sched_group **sched_group_nodes = NULL; 6283 struct sched_group **sched_group_nodes = NULL;
6279 int sd_allnodes = 0; 6284 int sd_allnodes = 0;
6280 6285
6281 /* 6286 /*
6282 * Allocate the per-node list of sched groups 6287 * Allocate the per-node list of sched groups
6283 */ 6288 */
6284 sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, 6289 sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
6285 GFP_KERNEL); 6290 GFP_KERNEL);
6286 if (!sched_group_nodes) { 6291 if (!sched_group_nodes) {
6287 printk(KERN_WARNING "Can not alloc sched group node list\n"); 6292 printk(KERN_WARNING "Can not alloc sched group node list\n");
6288 return -ENOMEM; 6293 return -ENOMEM;
6289 } 6294 }
6290 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 6295 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
6291 #endif 6296 #endif
6292 6297
6293 /* 6298 /*
6294 * Set up domains for cpus specified by the cpu_map. 6299 * Set up domains for cpus specified by the cpu_map.
6295 */ 6300 */
6296 for_each_cpu_mask(i, *cpu_map) { 6301 for_each_cpu_mask(i, *cpu_map) {
6297 struct sched_domain *sd = NULL, *p; 6302 struct sched_domain *sd = NULL, *p;
6298 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); 6303 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
6299 6304
6300 cpus_and(nodemask, nodemask, *cpu_map); 6305 cpus_and(nodemask, nodemask, *cpu_map);
6301 6306
6302 #ifdef CONFIG_NUMA 6307 #ifdef CONFIG_NUMA
6303 if (cpus_weight(*cpu_map) 6308 if (cpus_weight(*cpu_map)
6304 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { 6309 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
6305 sd = &per_cpu(allnodes_domains, i); 6310 sd = &per_cpu(allnodes_domains, i);
6306 *sd = SD_ALLNODES_INIT; 6311 *sd = SD_ALLNODES_INIT;
6307 sd->span = *cpu_map; 6312 sd->span = *cpu_map;
6308 cpu_to_allnodes_group(i, cpu_map, &sd->groups); 6313 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
6309 p = sd; 6314 p = sd;
6310 sd_allnodes = 1; 6315 sd_allnodes = 1;
6311 } else 6316 } else
6312 p = NULL; 6317 p = NULL;
6313 6318
6314 sd = &per_cpu(node_domains, i); 6319 sd = &per_cpu(node_domains, i);
6315 *sd = SD_NODE_INIT; 6320 *sd = SD_NODE_INIT;
6316 sd->span = sched_domain_node_span(cpu_to_node(i)); 6321 sd->span = sched_domain_node_span(cpu_to_node(i));
6317 sd->parent = p; 6322 sd->parent = p;
6318 if (p) 6323 if (p)
6319 p->child = sd; 6324 p->child = sd;
6320 cpus_and(sd->span, sd->span, *cpu_map); 6325 cpus_and(sd->span, sd->span, *cpu_map);
6321 #endif 6326 #endif
6322 6327
6323 p = sd; 6328 p = sd;
6324 sd = &per_cpu(phys_domains, i); 6329 sd = &per_cpu(phys_domains, i);
6325 *sd = SD_CPU_INIT; 6330 *sd = SD_CPU_INIT;
6326 sd->span = nodemask; 6331 sd->span = nodemask;
6327 sd->parent = p; 6332 sd->parent = p;
6328 if (p) 6333 if (p)
6329 p->child = sd; 6334 p->child = sd;
6330 cpu_to_phys_group(i, cpu_map, &sd->groups); 6335 cpu_to_phys_group(i, cpu_map, &sd->groups);
6331 6336
6332 #ifdef CONFIG_SCHED_MC 6337 #ifdef CONFIG_SCHED_MC
6333 p = sd; 6338 p = sd;
6334 sd = &per_cpu(core_domains, i); 6339 sd = &per_cpu(core_domains, i);
6335 *sd = SD_MC_INIT; 6340 *sd = SD_MC_INIT;
6336 sd->span = cpu_coregroup_map(i); 6341 sd->span = cpu_coregroup_map(i);
6337 cpus_and(sd->span, sd->span, *cpu_map); 6342 cpus_and(sd->span, sd->span, *cpu_map);
6338 sd->parent = p; 6343 sd->parent = p;
6339 p->child = sd; 6344 p->child = sd;
6340 cpu_to_core_group(i, cpu_map, &sd->groups); 6345 cpu_to_core_group(i, cpu_map, &sd->groups);
6341 #endif 6346 #endif
6342 6347
6343 #ifdef CONFIG_SCHED_SMT 6348 #ifdef CONFIG_SCHED_SMT
6344 p = sd; 6349 p = sd;
6345 sd = &per_cpu(cpu_domains, i); 6350 sd = &per_cpu(cpu_domains, i);
6346 *sd = SD_SIBLING_INIT; 6351 *sd = SD_SIBLING_INIT;
6347 sd->span = cpu_sibling_map[i]; 6352 sd->span = cpu_sibling_map[i];
6348 cpus_and(sd->span, sd->span, *cpu_map); 6353 cpus_and(sd->span, sd->span, *cpu_map);
6349 sd->parent = p; 6354 sd->parent = p;
6350 p->child = sd; 6355 p->child = sd;
6351 cpu_to_cpu_group(i, cpu_map, &sd->groups); 6356 cpu_to_cpu_group(i, cpu_map, &sd->groups);
6352 #endif 6357 #endif
6353 } 6358 }
6354 6359
6355 #ifdef CONFIG_SCHED_SMT 6360 #ifdef CONFIG_SCHED_SMT
6356 /* Set up CPU (sibling) groups */ 6361 /* Set up CPU (sibling) groups */
6357 for_each_cpu_mask(i, *cpu_map) { 6362 for_each_cpu_mask(i, *cpu_map) {
6358 cpumask_t this_sibling_map = cpu_sibling_map[i]; 6363 cpumask_t this_sibling_map = cpu_sibling_map[i];
6359 cpus_and(this_sibling_map, this_sibling_map, *cpu_map); 6364 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
6360 if (i != first_cpu(this_sibling_map)) 6365 if (i != first_cpu(this_sibling_map))
6361 continue; 6366 continue;
6362 6367
6363 init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); 6368 init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group);
6364 } 6369 }
6365 #endif 6370 #endif
6366 6371
6367 #ifdef CONFIG_SCHED_MC 6372 #ifdef CONFIG_SCHED_MC
6368 /* Set up multi-core groups */ 6373 /* Set up multi-core groups */
6369 for_each_cpu_mask(i, *cpu_map) { 6374 for_each_cpu_mask(i, *cpu_map) {
6370 cpumask_t this_core_map = cpu_coregroup_map(i); 6375 cpumask_t this_core_map = cpu_coregroup_map(i);
6371 cpus_and(this_core_map, this_core_map, *cpu_map); 6376 cpus_and(this_core_map, this_core_map, *cpu_map);
6372 if (i != first_cpu(this_core_map)) 6377 if (i != first_cpu(this_core_map))
6373 continue; 6378 continue;
6374 init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); 6379 init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group);
6375 } 6380 }
6376 #endif 6381 #endif
6377 6382
6378 6383
6379 /* Set up physical groups */ 6384 /* Set up physical groups */
6380 for (i = 0; i < MAX_NUMNODES; i++) { 6385 for (i = 0; i < MAX_NUMNODES; i++) {
6381 cpumask_t nodemask = node_to_cpumask(i); 6386 cpumask_t nodemask = node_to_cpumask(i);
6382 6387
6383 cpus_and(nodemask, nodemask, *cpu_map); 6388 cpus_and(nodemask, nodemask, *cpu_map);
6384 if (cpus_empty(nodemask)) 6389 if (cpus_empty(nodemask))
6385 continue; 6390 continue;
6386 6391
6387 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); 6392 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
6388 } 6393 }
6389 6394
6390 #ifdef CONFIG_NUMA 6395 #ifdef CONFIG_NUMA
6391 /* Set up node groups */ 6396 /* Set up node groups */
6392 if (sd_allnodes) 6397 if (sd_allnodes)
6393 init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); 6398 init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group);
6394 6399
6395 for (i = 0; i < MAX_NUMNODES; i++) { 6400 for (i = 0; i < MAX_NUMNODES; i++) {
6396 /* Set up node groups */ 6401 /* Set up node groups */
6397 struct sched_group *sg, *prev; 6402 struct sched_group *sg, *prev;
6398 cpumask_t nodemask = node_to_cpumask(i); 6403 cpumask_t nodemask = node_to_cpumask(i);
6399 cpumask_t domainspan; 6404 cpumask_t domainspan;
6400 cpumask_t covered = CPU_MASK_NONE; 6405 cpumask_t covered = CPU_MASK_NONE;
6401 int j; 6406 int j;
6402 6407
6403 cpus_and(nodemask, nodemask, *cpu_map); 6408 cpus_and(nodemask, nodemask, *cpu_map);
6404 if (cpus_empty(nodemask)) { 6409 if (cpus_empty(nodemask)) {
6405 sched_group_nodes[i] = NULL; 6410 sched_group_nodes[i] = NULL;
6406 continue; 6411 continue;
6407 } 6412 }
6408 6413
6409 domainspan = sched_domain_node_span(i); 6414 domainspan = sched_domain_node_span(i);
6410 cpus_and(domainspan, domainspan, *cpu_map); 6415 cpus_and(domainspan, domainspan, *cpu_map);
6411 6416
6412 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); 6417 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
6413 if (!sg) { 6418 if (!sg) {
6414 printk(KERN_WARNING "Can not alloc domain group for " 6419 printk(KERN_WARNING "Can not alloc domain group for "
6415 "node %d\n", i); 6420 "node %d\n", i);
6416 goto error; 6421 goto error;
6417 } 6422 }
6418 sched_group_nodes[i] = sg; 6423 sched_group_nodes[i] = sg;
6419 for_each_cpu_mask(j, nodemask) { 6424 for_each_cpu_mask(j, nodemask) {
6420 struct sched_domain *sd; 6425 struct sched_domain *sd;
6421 sd = &per_cpu(node_domains, j); 6426 sd = &per_cpu(node_domains, j);
6422 sd->groups = sg; 6427 sd->groups = sg;
6423 } 6428 }
6424 sg->cpu_power = 0; 6429 sg->cpu_power = 0;
6425 sg->cpumask = nodemask; 6430 sg->cpumask = nodemask;
6426 sg->next = sg; 6431 sg->next = sg;
6427 cpus_or(covered, covered, nodemask); 6432 cpus_or(covered, covered, nodemask);
6428 prev = sg; 6433 prev = sg;
6429 6434
6430 for (j = 0; j < MAX_NUMNODES; j++) { 6435 for (j = 0; j < MAX_NUMNODES; j++) {
6431 cpumask_t tmp, notcovered; 6436 cpumask_t tmp, notcovered;
6432 int n = (i + j) % MAX_NUMNODES; 6437 int n = (i + j) % MAX_NUMNODES;
6433 6438
6434 cpus_complement(notcovered, covered); 6439 cpus_complement(notcovered, covered);
6435 cpus_and(tmp, notcovered, *cpu_map); 6440 cpus_and(tmp, notcovered, *cpu_map);
6436 cpus_and(tmp, tmp, domainspan); 6441 cpus_and(tmp, tmp, domainspan);
6437 if (cpus_empty(tmp)) 6442 if (cpus_empty(tmp))
6438 break; 6443 break;
6439 6444
6440 nodemask = node_to_cpumask(n); 6445 nodemask = node_to_cpumask(n);
6441 cpus_and(tmp, tmp, nodemask); 6446 cpus_and(tmp, tmp, nodemask);
6442 if (cpus_empty(tmp)) 6447 if (cpus_empty(tmp))
6443 continue; 6448 continue;
6444 6449
6445 sg = kmalloc_node(sizeof(struct sched_group), 6450 sg = kmalloc_node(sizeof(struct sched_group),
6446 GFP_KERNEL, i); 6451 GFP_KERNEL, i);
6447 if (!sg) { 6452 if (!sg) {
6448 printk(KERN_WARNING 6453 printk(KERN_WARNING
6449 "Can not alloc domain group for node %d\n", j); 6454 "Can not alloc domain group for node %d\n", j);
6450 goto error; 6455 goto error;
6451 } 6456 }
6452 sg->cpu_power = 0; 6457 sg->cpu_power = 0;
6453 sg->cpumask = tmp; 6458 sg->cpumask = tmp;
6454 sg->next = prev->next; 6459 sg->next = prev->next;
6455 cpus_or(covered, covered, tmp); 6460 cpus_or(covered, covered, tmp);
6456 prev->next = sg; 6461 prev->next = sg;
6457 prev = sg; 6462 prev = sg;
6458 } 6463 }
6459 } 6464 }
6460 #endif 6465 #endif
6461 6466
6462 /* Calculate CPU power for physical packages and nodes */ 6467 /* Calculate CPU power for physical packages and nodes */
6463 #ifdef CONFIG_SCHED_SMT 6468 #ifdef CONFIG_SCHED_SMT
6464 for_each_cpu_mask(i, *cpu_map) { 6469 for_each_cpu_mask(i, *cpu_map) {
6465 sd = &per_cpu(cpu_domains, i); 6470 sd = &per_cpu(cpu_domains, i);
6466 init_sched_groups_power(i, sd); 6471 init_sched_groups_power(i, sd);
6467 } 6472 }
6468 #endif 6473 #endif
6469 #ifdef CONFIG_SCHED_MC 6474 #ifdef CONFIG_SCHED_MC
6470 for_each_cpu_mask(i, *cpu_map) { 6475 for_each_cpu_mask(i, *cpu_map) {
6471 sd = &per_cpu(core_domains, i); 6476 sd = &per_cpu(core_domains, i);
6472 init_sched_groups_power(i, sd); 6477 init_sched_groups_power(i, sd);
6473 } 6478 }
6474 #endif 6479 #endif
6475 6480
6476 for_each_cpu_mask(i, *cpu_map) { 6481 for_each_cpu_mask(i, *cpu_map) {
6477 sd = &per_cpu(phys_domains, i); 6482 sd = &per_cpu(phys_domains, i);
6478 init_sched_groups_power(i, sd); 6483 init_sched_groups_power(i, sd);
6479 } 6484 }
6480 6485
6481 #ifdef CONFIG_NUMA 6486 #ifdef CONFIG_NUMA
6482 for (i = 0; i < MAX_NUMNODES; i++) 6487 for (i = 0; i < MAX_NUMNODES; i++)
6483 init_numa_sched_groups_power(sched_group_nodes[i]); 6488 init_numa_sched_groups_power(sched_group_nodes[i]);
6484 6489
6485 if (sd_allnodes) { 6490 if (sd_allnodes) {
6486 struct sched_group *sg; 6491 struct sched_group *sg;
6487 6492
6488 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); 6493 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
6489 init_numa_sched_groups_power(sg); 6494 init_numa_sched_groups_power(sg);
6490 } 6495 }
6491 #endif 6496 #endif
6492 6497
6493 /* Attach the domains */ 6498 /* Attach the domains */
6494 for_each_cpu_mask(i, *cpu_map) { 6499 for_each_cpu_mask(i, *cpu_map) {
6495 struct sched_domain *sd; 6500 struct sched_domain *sd;
6496 #ifdef CONFIG_SCHED_SMT 6501 #ifdef CONFIG_SCHED_SMT
6497 sd = &per_cpu(cpu_domains, i); 6502 sd = &per_cpu(cpu_domains, i);
6498 #elif defined(CONFIG_SCHED_MC) 6503 #elif defined(CONFIG_SCHED_MC)
6499 sd = &per_cpu(core_domains, i); 6504 sd = &per_cpu(core_domains, i);
6500 #else 6505 #else
6501 sd = &per_cpu(phys_domains, i); 6506 sd = &per_cpu(phys_domains, i);
6502 #endif 6507 #endif
6503 cpu_attach_domain(sd, i); 6508 cpu_attach_domain(sd, i);
6504 } 6509 }
6505 /* 6510 /*
6506 * Tune cache-hot values: 6511 * Tune cache-hot values:
6507 */ 6512 */
6508 calibrate_migration_costs(cpu_map); 6513 calibrate_migration_costs(cpu_map);
6509 6514
6510 return 0; 6515 return 0;
6511 6516
6512 #ifdef CONFIG_NUMA 6517 #ifdef CONFIG_NUMA
6513 error: 6518 error:
6514 free_sched_groups(cpu_map); 6519 free_sched_groups(cpu_map);
6515 return -ENOMEM; 6520 return -ENOMEM;
6516 #endif 6521 #endif
6517 } 6522 }
6518 /* 6523 /*
6519 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6524 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6520 */ 6525 */
6521 static int arch_init_sched_domains(const cpumask_t *cpu_map) 6526 static int arch_init_sched_domains(const cpumask_t *cpu_map)
6522 { 6527 {
6523 cpumask_t cpu_default_map; 6528 cpumask_t cpu_default_map;
6524 int err; 6529 int err;
6525 6530
6526 /* 6531 /*
6527 * Setup mask for cpus without special case scheduling requirements. 6532 * Setup mask for cpus without special case scheduling requirements.
6528 * For now this just excludes isolated cpus, but could be used to 6533 * For now this just excludes isolated cpus, but could be used to
6529 * exclude other special cases in the future. 6534 * exclude other special cases in the future.
6530 */ 6535 */
6531 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); 6536 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6532 6537
6533 err = build_sched_domains(&cpu_default_map); 6538 err = build_sched_domains(&cpu_default_map);
6534 6539
6535 return err; 6540 return err;
6536 } 6541 }
6537 6542
6538 static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 6543 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
6539 { 6544 {
6540 free_sched_groups(cpu_map); 6545 free_sched_groups(cpu_map);
6541 } 6546 }
6542 6547
6543 /* 6548 /*
6544 * Detach sched domains from a group of cpus specified in cpu_map 6549 * Detach sched domains from a group of cpus specified in cpu_map
6545 * These cpus will now be attached to the NULL domain 6550 * These cpus will now be attached to the NULL domain
6546 */ 6551 */
6547 static void detach_destroy_domains(const cpumask_t *cpu_map) 6552 static void detach_destroy_domains(const cpumask_t *cpu_map)
6548 { 6553 {
6549 int i; 6554 int i;
6550 6555
6551 for_each_cpu_mask(i, *cpu_map) 6556 for_each_cpu_mask(i, *cpu_map)
6552 cpu_attach_domain(NULL, i); 6557 cpu_attach_domain(NULL, i);
6553 synchronize_sched(); 6558 synchronize_sched();
6554 arch_destroy_sched_domains(cpu_map); 6559 arch_destroy_sched_domains(cpu_map);
6555 } 6560 }
6556 6561
6557 /* 6562 /*
6558 * Partition sched domains as specified by the cpumasks below. 6563 * Partition sched domains as specified by the cpumasks below.
6559 * This attaches all cpus from the cpumasks to the NULL domain, 6564 * This attaches all cpus from the cpumasks to the NULL domain,
6560 * waits for an RCU quiescent period, recalculates sched 6565 * waits for an RCU quiescent period, recalculates sched
6561 * domain information and then attaches them back to the 6566 * domain information and then attaches them back to the
6562 * correct sched domains. 6567 * correct sched domains.
6563 * Call with the hotplug lock held. 6568 * Call with the hotplug lock held.
6564 */ 6569 */
6565 int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) 6570 int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6566 { 6571 {
6567 cpumask_t change_map; 6572 cpumask_t change_map;
6568 int err = 0; 6573 int err = 0;
6569 6574
6570 cpus_and(*partition1, *partition1, cpu_online_map); 6575 cpus_and(*partition1, *partition1, cpu_online_map);
6571 cpus_and(*partition2, *partition2, cpu_online_map); 6576 cpus_and(*partition2, *partition2, cpu_online_map);
6572 cpus_or(change_map, *partition1, *partition2); 6577 cpus_or(change_map, *partition1, *partition2);
6573 6578
6574 /* Detach sched domains from all of the affected cpus */ 6579 /* Detach sched domains from all of the affected cpus */
6575 detach_destroy_domains(&change_map); 6580 detach_destroy_domains(&change_map);
6576 if (!cpus_empty(*partition1)) 6581 if (!cpus_empty(*partition1))
6577 err = build_sched_domains(partition1); 6582 err = build_sched_domains(partition1);
6578 if (!err && !cpus_empty(*partition2)) 6583 if (!err && !cpus_empty(*partition2))
6579 err = build_sched_domains(partition2); 6584 err = build_sched_domains(partition2);
6580 6585
6581 return err; 6586 return err;
6582 } 6587 }
6583 6588
6584 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 6589 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6585 int arch_reinit_sched_domains(void) 6590 int arch_reinit_sched_domains(void)
6586 { 6591 {
6587 int err; 6592 int err;
6588 6593
6589 lock_cpu_hotplug(); 6594 lock_cpu_hotplug();
6590 detach_destroy_domains(&cpu_online_map); 6595 detach_destroy_domains(&cpu_online_map);
6591 err = arch_init_sched_domains(&cpu_online_map); 6596 err = arch_init_sched_domains(&cpu_online_map);
6592 unlock_cpu_hotplug(); 6597 unlock_cpu_hotplug();
6593 6598
6594 return err; 6599 return err;
6595 } 6600 }
6596 6601
6597 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 6602 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6598 { 6603 {
6599 int ret; 6604 int ret;
6600 6605
6601 if (buf[0] != '0' && buf[0] != '1') 6606 if (buf[0] != '0' && buf[0] != '1')
6602 return -EINVAL; 6607 return -EINVAL;
6603 6608
6604 if (smt) 6609 if (smt)
6605 sched_smt_power_savings = (buf[0] == '1'); 6610 sched_smt_power_savings = (buf[0] == '1');
6606 else 6611 else
6607 sched_mc_power_savings = (buf[0] == '1'); 6612 sched_mc_power_savings = (buf[0] == '1');
6608 6613
6609 ret = arch_reinit_sched_domains(); 6614 ret = arch_reinit_sched_domains();
6610 6615
6611 return ret ? ret : count; 6616 return ret ? ret : count;
6612 } 6617 }
6613 6618
6614 int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) 6619 int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6615 { 6620 {
6616 int err = 0; 6621 int err = 0;
6617 6622
6618 #ifdef CONFIG_SCHED_SMT 6623 #ifdef CONFIG_SCHED_SMT
6619 if (smt_capable()) 6624 if (smt_capable())
6620 err = sysfs_create_file(&cls->kset.kobj, 6625 err = sysfs_create_file(&cls->kset.kobj,
6621 &attr_sched_smt_power_savings.attr); 6626 &attr_sched_smt_power_savings.attr);
6622 #endif 6627 #endif
6623 #ifdef CONFIG_SCHED_MC 6628 #ifdef CONFIG_SCHED_MC
6624 if (!err && mc_capable()) 6629 if (!err && mc_capable())
6625 err = sysfs_create_file(&cls->kset.kobj, 6630 err = sysfs_create_file(&cls->kset.kobj,
6626 &attr_sched_mc_power_savings.attr); 6631 &attr_sched_mc_power_savings.attr);
6627 #endif 6632 #endif
6628 return err; 6633 return err;
6629 } 6634 }
6630 #endif 6635 #endif
6631 6636
6632 #ifdef CONFIG_SCHED_MC 6637 #ifdef CONFIG_SCHED_MC
6633 static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) 6638 static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6634 { 6639 {
6635 return sprintf(page, "%u\n", sched_mc_power_savings); 6640 return sprintf(page, "%u\n", sched_mc_power_savings);
6636 } 6641 }
6637 static ssize_t sched_mc_power_savings_store(struct sys_device *dev, 6642 static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6638 const char *buf, size_t count) 6643 const char *buf, size_t count)
6639 { 6644 {
6640 return sched_power_savings_store(buf, count, 0); 6645 return sched_power_savings_store(buf, count, 0);
6641 } 6646 }
6642 SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, 6647 SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6643 sched_mc_power_savings_store); 6648 sched_mc_power_savings_store);
6644 #endif 6649 #endif
6645 6650
6646 #ifdef CONFIG_SCHED_SMT 6651 #ifdef CONFIG_SCHED_SMT
6647 static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) 6652 static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6648 { 6653 {
6649 return sprintf(page, "%u\n", sched_smt_power_savings); 6654 return sprintf(page, "%u\n", sched_smt_power_savings);
6650 } 6655 }
6651 static ssize_t sched_smt_power_savings_store(struct sys_device *dev, 6656 static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6652 const char *buf, size_t count) 6657 const char *buf, size_t count)
6653 { 6658 {
6654 return sched_power_savings_store(buf, count, 1); 6659 return sched_power_savings_store(buf, count, 1);
6655 } 6660 }
6656 SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, 6661 SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6657 sched_smt_power_savings_store); 6662 sched_smt_power_savings_store);
6658 #endif 6663 #endif
6659 6664
6660 /* 6665 /*
6661 * Force a reinitialization of the sched domains hierarchy. The domains 6666 * Force a reinitialization of the sched domains hierarchy. The domains
6662 * and groups cannot be updated in place without racing with the balancing 6667 * and groups cannot be updated in place without racing with the balancing
6663 * code, so we temporarily attach all running cpus to the NULL domain 6668 * code, so we temporarily attach all running cpus to the NULL domain
6664 * which will prevent rebalancing while the sched domains are recalculated. 6669 * which will prevent rebalancing while the sched domains are recalculated.
6665 */ 6670 */
6666 static int update_sched_domains(struct notifier_block *nfb, 6671 static int update_sched_domains(struct notifier_block *nfb,
6667 unsigned long action, void *hcpu) 6672 unsigned long action, void *hcpu)
6668 { 6673 {
6669 switch (action) { 6674 switch (action) {
6670 case CPU_UP_PREPARE: 6675 case CPU_UP_PREPARE:
6671 case CPU_DOWN_PREPARE: 6676 case CPU_DOWN_PREPARE:
6672 detach_destroy_domains(&cpu_online_map); 6677 detach_destroy_domains(&cpu_online_map);
6673 return NOTIFY_OK; 6678 return NOTIFY_OK;
6674 6679
6675 case CPU_UP_CANCELED: 6680 case CPU_UP_CANCELED:
6676 case CPU_DOWN_FAILED: 6681 case CPU_DOWN_FAILED:
6677 case CPU_ONLINE: 6682 case CPU_ONLINE:
6678 case CPU_DEAD: 6683 case CPU_DEAD:
6679 /* 6684 /*
6680 * Fall through and re-initialise the domains. 6685 * Fall through and re-initialise the domains.
6681 */ 6686 */
6682 break; 6687 break;
6683 default: 6688 default:
6684 return NOTIFY_DONE; 6689 return NOTIFY_DONE;
6685 } 6690 }
6686 6691
6687 /* The hotplug lock is already held by cpu_up/cpu_down */ 6692 /* The hotplug lock is already held by cpu_up/cpu_down */
6688 arch_init_sched_domains(&cpu_online_map); 6693 arch_init_sched_domains(&cpu_online_map);
6689 6694
6690 return NOTIFY_OK; 6695 return NOTIFY_OK;
6691 } 6696 }
6692 6697
6693 void __init sched_init_smp(void) 6698 void __init sched_init_smp(void)
6694 { 6699 {
6695 cpumask_t non_isolated_cpus; 6700 cpumask_t non_isolated_cpus;
6696 6701
6697 lock_cpu_hotplug(); 6702 lock_cpu_hotplug();
6698 arch_init_sched_domains(&cpu_online_map); 6703 arch_init_sched_domains(&cpu_online_map);
6699 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 6704 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6700 if (cpus_empty(non_isolated_cpus)) 6705 if (cpus_empty(non_isolated_cpus))
6701 cpu_set(smp_processor_id(), non_isolated_cpus); 6706 cpu_set(smp_processor_id(), non_isolated_cpus);
6702 unlock_cpu_hotplug(); 6707 unlock_cpu_hotplug();
6703 /* XXX: Theoretical race here - CPU may be hotplugged now */ 6708 /* XXX: Theoretical race here - CPU may be hotplugged now */
6704 hotcpu_notifier(update_sched_domains, 0); 6709 hotcpu_notifier(update_sched_domains, 0);
6705 6710
6706 /* Move init over to a non-isolated CPU */ 6711 /* Move init over to a non-isolated CPU */
6707 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 6712 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6708 BUG(); 6713 BUG();
6709 } 6714 }
6710 #else 6715 #else
6711 void __init sched_init_smp(void) 6716 void __init sched_init_smp(void)
6712 { 6717 {
6713 } 6718 }
6714 #endif /* CONFIG_SMP */ 6719 #endif /* CONFIG_SMP */
6715 6720
6716 int in_sched_functions(unsigned long addr) 6721 int in_sched_functions(unsigned long addr)
6717 { 6722 {
6718 /* Linker adds these: start and end of __sched functions */ 6723 /* Linker adds these: start and end of __sched functions */
6719 extern char __sched_text_start[], __sched_text_end[]; 6724 extern char __sched_text_start[], __sched_text_end[];
6720 6725
6721 return in_lock_functions(addr) || 6726 return in_lock_functions(addr) ||
6722 (addr >= (unsigned long)__sched_text_start 6727 (addr >= (unsigned long)__sched_text_start
6723 && addr < (unsigned long)__sched_text_end); 6728 && addr < (unsigned long)__sched_text_end);
6724 } 6729 }
6725 6730
6726 void __init sched_init(void) 6731 void __init sched_init(void)
6727 { 6732 {
6728 int i, j, k; 6733 int i, j, k;
6734 int highest_cpu = 0;
6729 6735
6730 for_each_possible_cpu(i) { 6736 for_each_possible_cpu(i) {
6731 struct prio_array *array; 6737 struct prio_array *array;
6732 struct rq *rq; 6738 struct rq *rq;
6733 6739
6734 rq = cpu_rq(i); 6740 rq = cpu_rq(i);
6735 spin_lock_init(&rq->lock); 6741 spin_lock_init(&rq->lock);
6736 lockdep_set_class(&rq->lock, &rq->rq_lock_key); 6742 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
6737 rq->nr_running = 0; 6743 rq->nr_running = 0;
6738 rq->active = rq->arrays; 6744 rq->active = rq->arrays;
6739 rq->expired = rq->arrays + 1; 6745 rq->expired = rq->arrays + 1;
6740 rq->best_expired_prio = MAX_PRIO; 6746 rq->best_expired_prio = MAX_PRIO;
6741 6747
6742 #ifdef CONFIG_SMP 6748 #ifdef CONFIG_SMP
6743 rq->sd = NULL; 6749 rq->sd = NULL;
6744 for (j = 1; j < 3; j++) 6750 for (j = 1; j < 3; j++)
6745 rq->cpu_load[j] = 0; 6751 rq->cpu_load[j] = 0;
6746 rq->active_balance = 0; 6752 rq->active_balance = 0;
6747 rq->push_cpu = 0; 6753 rq->push_cpu = 0;
6748 rq->cpu = i; 6754 rq->cpu = i;
6749 rq->migration_thread = NULL; 6755 rq->migration_thread = NULL;
6750 INIT_LIST_HEAD(&rq->migration_queue); 6756 INIT_LIST_HEAD(&rq->migration_queue);
6751 #endif 6757 #endif
6752 atomic_set(&rq->nr_iowait, 0); 6758 atomic_set(&rq->nr_iowait, 0);
6753 6759
6754 for (j = 0; j < 2; j++) { 6760 for (j = 0; j < 2; j++) {
6755 array = rq->arrays + j; 6761 array = rq->arrays + j;
6756 for (k = 0; k < MAX_PRIO; k++) { 6762 for (k = 0; k < MAX_PRIO; k++) {
6757 INIT_LIST_HEAD(array->queue + k); 6763 INIT_LIST_HEAD(array->queue + k);
6758 __clear_bit(k, array->bitmap); 6764 __clear_bit(k, array->bitmap);
6759 } 6765 }
6760 // delimiter for bitsearch 6766 // delimiter for bitsearch
6761 __set_bit(MAX_PRIO, array->bitmap); 6767 __set_bit(MAX_PRIO, array->bitmap);
6762 } 6768 }
6769 highest_cpu = i;
6763 } 6770 }
6764 6771
6765 set_load_weight(&init_task); 6772 set_load_weight(&init_task);
6766 6773
6767 #ifdef CONFIG_SMP 6774 #ifdef CONFIG_SMP
6775 nr_cpu_ids = highest_cpu + 1;
6768 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); 6776 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6769 #endif 6777 #endif
6770 6778
6771 #ifdef CONFIG_RT_MUTEXES 6779 #ifdef CONFIG_RT_MUTEXES
6772 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); 6780 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6773 #endif 6781 #endif
6774 6782
6775 /* 6783 /*
6776 * The boot idle thread does lazy MMU switching as well: 6784 * The boot idle thread does lazy MMU switching as well:
6777 */ 6785 */
6778 atomic_inc(&init_mm.mm_count); 6786 atomic_inc(&init_mm.mm_count);
6779 enter_lazy_tlb(&init_mm, current); 6787 enter_lazy_tlb(&init_mm, current);
6780 6788
6781 /* 6789 /*
6782 * Make us the idle thread. Technically, schedule() should not be 6790 * Make us the idle thread. Technically, schedule() should not be
6783 * called from this thread, however somewhere below it might be, 6791 * called from this thread, however somewhere below it might be,
6784 * but because we are the idle thread, we just pick up running again 6792 * but because we are the idle thread, we just pick up running again
6785 * when this runqueue becomes "idle". 6793 * when this runqueue becomes "idle".
6786 */ 6794 */
6787 init_idle(current, smp_processor_id()); 6795 init_idle(current, smp_processor_id());
6788 } 6796 }
6789 6797
6790 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6798 #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6791 void __might_sleep(char *file, int line) 6799 void __might_sleep(char *file, int line)
6792 { 6800 {
6793 #ifdef in_atomic 6801 #ifdef in_atomic
6794 static unsigned long prev_jiffy; /* ratelimiting */ 6802 static unsigned long prev_jiffy; /* ratelimiting */
6795 6803
6796 if ((in_atomic() || irqs_disabled()) && 6804 if ((in_atomic() || irqs_disabled()) &&
6797 system_state == SYSTEM_RUNNING && !oops_in_progress) { 6805 system_state == SYSTEM_RUNNING && !oops_in_progress) {
6798 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 6806 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6799 return; 6807 return;
6800 prev_jiffy = jiffies; 6808 prev_jiffy = jiffies;
6801 printk(KERN_ERR "BUG: sleeping function called from invalid" 6809 printk(KERN_ERR "BUG: sleeping function called from invalid"
6802 " context at %s:%d\n", file, line); 6810 " context at %s:%d\n", file, line);
6803 printk("in_atomic():%d, irqs_disabled():%d\n", 6811 printk("in_atomic():%d, irqs_disabled():%d\n",
6804 in_atomic(), irqs_disabled()); 6812 in_atomic(), irqs_disabled());
6805 debug_show_held_locks(current); 6813 debug_show_held_locks(current);
6806 if (irqs_disabled()) 6814 if (irqs_disabled())
6807 print_irqtrace_events(current); 6815 print_irqtrace_events(current);
6808 dump_stack(); 6816 dump_stack();
6809 } 6817 }
6810 #endif 6818 #endif
6811 } 6819 }
6812 EXPORT_SYMBOL(__might_sleep); 6820 EXPORT_SYMBOL(__might_sleep);
6813 #endif 6821 #endif
6814 6822
6815 #ifdef CONFIG_MAGIC_SYSRQ 6823 #ifdef CONFIG_MAGIC_SYSRQ
6816 void normalize_rt_tasks(void) 6824 void normalize_rt_tasks(void)
6817 { 6825 {
6818 struct prio_array *array; 6826 struct prio_array *array;
6819 struct task_struct *p; 6827 struct task_struct *p;
6820 unsigned long flags; 6828 unsigned long flags;
6821 struct rq *rq; 6829 struct rq *rq;
6822 6830
6823 read_lock_irq(&tasklist_lock); 6831 read_lock_irq(&tasklist_lock);
6824 for_each_process(p) { 6832 for_each_process(p) {
6825 if (!rt_task(p)) 6833 if (!rt_task(p))
6826 continue; 6834 continue;
6827 6835
6828 spin_lock_irqsave(&p->pi_lock, flags); 6836 spin_lock_irqsave(&p->pi_lock, flags);
6829 rq = __task_rq_lock(p); 6837 rq = __task_rq_lock(p);
6830 6838
6831 array = p->array; 6839 array = p->array;
6832 if (array) 6840 if (array)
6833 deactivate_task(p, task_rq(p)); 6841 deactivate_task(p, task_rq(p));
6834 __setscheduler(p, SCHED_NORMAL, 0); 6842 __setscheduler(p, SCHED_NORMAL, 0);
6835 if (array) { 6843 if (array) {
6836 __activate_task(p, task_rq(p)); 6844 __activate_task(p, task_rq(p));
6837 resched_task(rq->curr); 6845 resched_task(rq->curr);
6838 } 6846 }
6839 6847
6840 __task_rq_unlock(rq); 6848 __task_rq_unlock(rq);
6841 spin_unlock_irqrestore(&p->pi_lock, flags); 6849 spin_unlock_irqrestore(&p->pi_lock, flags);
6842 } 6850 }
6843 read_unlock_irq(&tasklist_lock); 6851 read_unlock_irq(&tasklist_lock);
6844 } 6852 }
6845 6853
6846 #endif /* CONFIG_MAGIC_SYSRQ */ 6854 #endif /* CONFIG_MAGIC_SYSRQ */
6847 6855
6848 #ifdef CONFIG_IA64 6856 #ifdef CONFIG_IA64
6849 /* 6857 /*
6850 * These functions are only useful for the IA64 MCA handling. 6858 * These functions are only useful for the IA64 MCA handling.
6851 * 6859 *
6852 * They can only be called when the whole system has been 6860 * They can only be called when the whole system has been
6853 * stopped - every CPU needs to be quiescent, and no scheduling 6861 * stopped - every CPU needs to be quiescent, and no scheduling
6854 * activity can take place. Using them for anything else would 6862 * activity can take place. Using them for anything else would
6855 * be a serious bug, and as a result, they aren't even visible 6863 * be a serious bug, and as a result, they aren't even visible
6856 * under any other configuration. 6864 * under any other configuration.
6857 */ 6865 */
6858 6866
6859 /** 6867 /**
6860 * curr_task - return the current task for a given cpu. 6868 * curr_task - return the current task for a given cpu.
6861 * @cpu: the processor in question. 6869 * @cpu: the processor in question.
6862 * 6870 *
6863 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6871 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6864 */ 6872 */
6865 struct task_struct *curr_task(int cpu) 6873 struct task_struct *curr_task(int cpu)
6866 { 6874 {
6867 return cpu_curr(cpu); 6875 return cpu_curr(cpu);
6868 } 6876 }
6869 6877
6870 /** 6878 /**
6871 * set_curr_task - set the current task for a given cpu. 6879 * set_curr_task - set the current task for a given cpu.
6872 * @cpu: the processor in question. 6880 * @cpu: the processor in question.
6873 * @p: the task pointer to set. 6881 * @p: the task pointer to set.
6874 * 6882 *
6875 * Description: This function must only be used when non-maskable interrupts 6883 * Description: This function must only be used when non-maskable interrupts
6876 * are serviced on a separate stack. It allows the architecture to switch the 6884 * are serviced on a separate stack. It allows the architecture to switch the
6877 * notion of the current task on a cpu in a non-blocking manner. This function 6885 * notion of the current task on a cpu in a non-blocking manner. This function
6878 * must be called with all CPUs synchronized and interrupts disabled; the 6886 * must be called with all CPUs synchronized and interrupts disabled; the
6879 * caller must save the original value of the current task (see 6887 * caller must save the original value of the current task (see
6880 * curr_task() above) and restore that value before reenabling interrupts and 6888 * curr_task() above) and restore that value before reenabling interrupts and
6881 * re-starting the system. 6889 * re-starting the system.
6882 * 6890 *
6883 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6891 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6884 */ 6892 */
6885 void set_curr_task(int cpu, struct task_struct *p) 6893 void set_curr_task(int cpu, struct task_struct *p)
6886 { 6894 {
6887 cpu_curr(cpu) = p; 6895 cpu_curr(cpu) = p;
6888 } 6896 }
6889 6897
6890 #endif 6898 #endif
6891 6899
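The sched_init() hunk above records the highest CPU index seen while walking the possible-CPU map and, under CONFIG_SMP, sets nr_cpu_ids to highest_cpu + 1. The user-space sketch below illustrates why the value is derived from the highest index rather than from a count of possible CPUs; the array-based bitmap and the NR_CPUS value here are illustrative stand-ins, not the kernel's cpumask API.

#include <stdio.h>

#define NR_CPUS 8                       /* illustrative compile-time limit */

/* toy possible-CPU map: CPUs 0, 1 and 5 exist, so the numbering is sparse */
static const int cpu_possible[NR_CPUS] = { 1, 1, 0, 0, 0, 1, 0, 0 };

int main(void)
{
        int i, highest_cpu = 0, nr_cpu_ids;

        /* same shape as the sched_init() loop: remember the highest index */
        for (i = 0; i < NR_CPUS; i++)
                if (cpu_possible[i])
                        highest_cpu = i;

        nr_cpu_ids = highest_cpu + 1;

        /* prints 6, not 3: ids must cover 0..highest_cpu, holes included */
        printf("nr_cpu_ids = %d\n", nr_cpu_ids);
        return 0;
}

Because CPU numbering may be sparse, an array indexed by CPU id needs highest_cpu + 1 slots; the number of set bits alone could be too small.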
1 #include <linux/kernel.h> 1 #include <linux/kernel.h>
2 #include <linux/bitops.h> 2 #include <linux/bitops.h>
3 #include <linux/cpumask.h> 3 #include <linux/cpumask.h>
4 #include <linux/module.h> 4 #include <linux/module.h>
5 5
6 int __first_cpu(const cpumask_t *srcp) 6 int __first_cpu(const cpumask_t *srcp)
7 { 7 {
8 return min_t(int, NR_CPUS, find_first_bit(srcp->bits, NR_CPUS)); 8 return min_t(int, NR_CPUS, find_first_bit(srcp->bits, NR_CPUS));
9 } 9 }
10 EXPORT_SYMBOL(__first_cpu); 10 EXPORT_SYMBOL(__first_cpu);
11 11
12 int __next_cpu(int n, const cpumask_t *srcp) 12 int __next_cpu(int n, const cpumask_t *srcp)
13 { 13 {
14 return min_t(int, NR_CPUS, find_next_bit(srcp->bits, NR_CPUS, n+1)); 14 return min_t(int, NR_CPUS, find_next_bit(srcp->bits, NR_CPUS, n+1));
15 } 15 }
16 EXPORT_SYMBOL(__next_cpu); 16 EXPORT_SYMBOL(__next_cpu);
17 17
18 int nr_cpu_ids;
19 EXPORT_SYMBOL(nr_cpu_ids);
20
21 int __any_online_cpu(const cpumask_t *mask) 18 int __any_online_cpu(const cpumask_t *mask)
22 { 19 {
23 int cpu; 20 int cpu;
24 21
25 for_each_cpu_mask(cpu, *mask) { 22 for_each_cpu_mask(cpu, *mask) {
26 if (cpu_online(cpu)) 23 if (cpu_online(cpu))
27 break; 24 break;
28 } 25 }
29 return cpu; 26 return cpu;
30 } 27 }
31 EXPORT_SYMBOL(__any_online_cpu); 28 EXPORT_SYMBOL(__any_online_cpu);
32 29
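The hunk above drops the nr_cpu_ids definition and its EXPORT_SYMBOL from lib/cpumask.c. The replacement definition is not visible in this excerpt; presumably it is re-homed next to the scheduler code that now assigns it, along the lines of the sketch below (the placement and the __read_mostly annotation are assumptions, not shown in this diff).

/* plausible relocated definition (not part of the excerpt above);
 * initialized to the compile-time maximum and tightened to
 * highest_cpu + 1 by the sched_init() loop shown earlier.
 */
int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);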
1 /* 1 /*
2 * linux/mm/page_alloc.c 2 * linux/mm/page_alloc.c
3 * 3 *
4 * Manages the free list, the system allocates free pages here. 4 * Manages the free list, the system allocates free pages here.
5 * Note that kmalloc() lives in slab.c 5 * Note that kmalloc() lives in slab.c
6 * 6 *
7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds 7 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
8 * Swap reorganised 29.12.95, Stephen Tweedie 8 * Swap reorganised 29.12.95, Stephen Tweedie
9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 9 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 10 * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 11 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 12 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000
13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 13 * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton) 14 * (lots of bits borrowed from Ingo Molnar & Andrew Morton)
15 */ 15 */
16 16
17 #include <linux/stddef.h> 17 #include <linux/stddef.h>
18 #include <linux/mm.h> 18 #include <linux/mm.h>
19 #include <linux/swap.h> 19 #include <linux/swap.h>
20 #include <linux/interrupt.h> 20 #include <linux/interrupt.h>
21 #include <linux/pagemap.h> 21 #include <linux/pagemap.h>
22 #include <linux/bootmem.h> 22 #include <linux/bootmem.h>
23 #include <linux/compiler.h> 23 #include <linux/compiler.h>
24 #include <linux/kernel.h> 24 #include <linux/kernel.h>
25 #include <linux/module.h> 25 #include <linux/module.h>
26 #include <linux/suspend.h> 26 #include <linux/suspend.h>
27 #include <linux/pagevec.h> 27 #include <linux/pagevec.h>
28 #include <linux/blkdev.h> 28 #include <linux/blkdev.h>
29 #include <linux/slab.h> 29 #include <linux/slab.h>
30 #include <linux/notifier.h> 30 #include <linux/notifier.h>
31 #include <linux/topology.h> 31 #include <linux/topology.h>
32 #include <linux/sysctl.h> 32 #include <linux/sysctl.h>
33 #include <linux/cpu.h> 33 #include <linux/cpu.h>
34 #include <linux/cpuset.h> 34 #include <linux/cpuset.h>
35 #include <linux/memory_hotplug.h> 35 #include <linux/memory_hotplug.h>
36 #include <linux/nodemask.h> 36 #include <linux/nodemask.h>
37 #include <linux/vmalloc.h> 37 #include <linux/vmalloc.h>
38 #include <linux/mempolicy.h> 38 #include <linux/mempolicy.h>
39 #include <linux/stop_machine.h> 39 #include <linux/stop_machine.h>
40 #include <linux/sort.h> 40 #include <linux/sort.h>
41 #include <linux/pfn.h> 41 #include <linux/pfn.h>
42 #include <linux/backing-dev.h> 42 #include <linux/backing-dev.h>
43 #include <linux/fault-inject.h> 43 #include <linux/fault-inject.h>
44 44
45 #include <asm/tlbflush.h> 45 #include <asm/tlbflush.h>
46 #include <asm/div64.h> 46 #include <asm/div64.h>
47 #include "internal.h" 47 #include "internal.h"
48 48
49 /* 49 /*
50 * MCD - HACK: Find somewhere to initialize this EARLY, or make this 50 * MCD - HACK: Find somewhere to initialize this EARLY, or make this
51 * initializer cleaner 51 * initializer cleaner
52 */ 52 */
53 nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; 53 nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
54 EXPORT_SYMBOL(node_online_map); 54 EXPORT_SYMBOL(node_online_map);
55 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; 55 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
56 EXPORT_SYMBOL(node_possible_map); 56 EXPORT_SYMBOL(node_possible_map);
57 unsigned long totalram_pages __read_mostly; 57 unsigned long totalram_pages __read_mostly;
58 unsigned long totalreserve_pages __read_mostly; 58 unsigned long totalreserve_pages __read_mostly;
59 long nr_swap_pages; 59 long nr_swap_pages;
60 int percpu_pagelist_fraction; 60 int percpu_pagelist_fraction;
61 61
62 static void __free_pages_ok(struct page *page, unsigned int order); 62 static void __free_pages_ok(struct page *page, unsigned int order);
63 63
64 /* 64 /*
65 * results with 256, 32 in the lowmem_reserve sysctl: 65 * results with 256, 32 in the lowmem_reserve sysctl:
66 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 66 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
67 * 1G machine -> (16M dma, 784M normal, 224M high) 67 * 1G machine -> (16M dma, 784M normal, 224M high)
68 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 68 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
69 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 69 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
70 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 70 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
71 * 71 *
72 * TBD: should special case ZONE_DMA32 machines here - in those we normally 72 * TBD: should special case ZONE_DMA32 machines here - in those we normally
73 * don't need any ZONE_NORMAL reservation 73 * don't need any ZONE_NORMAL reservation
74 */ 74 */
75 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 75 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
76 #ifdef CONFIG_ZONE_DMA 76 #ifdef CONFIG_ZONE_DMA
77 256, 77 256,
78 #endif 78 #endif
79 #ifdef CONFIG_ZONE_DMA32 79 #ifdef CONFIG_ZONE_DMA32
80 256, 80 256,
81 #endif 81 #endif
82 #ifdef CONFIG_HIGHMEM 82 #ifdef CONFIG_HIGHMEM
83 32 83 32
84 #endif 84 #endif
85 }; 85 };
86 86
87 EXPORT_SYMBOL(totalram_pages); 87 EXPORT_SYMBOL(totalram_pages);
88 88
89 static char * const zone_names[MAX_NR_ZONES] = { 89 static char * const zone_names[MAX_NR_ZONES] = {
90 #ifdef CONFIG_ZONE_DMA 90 #ifdef CONFIG_ZONE_DMA
91 "DMA", 91 "DMA",
92 #endif 92 #endif
93 #ifdef CONFIG_ZONE_DMA32 93 #ifdef CONFIG_ZONE_DMA32
94 "DMA32", 94 "DMA32",
95 #endif 95 #endif
96 "Normal", 96 "Normal",
97 #ifdef CONFIG_HIGHMEM 97 #ifdef CONFIG_HIGHMEM
98 "HighMem" 98 "HighMem"
99 #endif 99 #endif
100 }; 100 };
101 101
102 int min_free_kbytes = 1024; 102 int min_free_kbytes = 1024;
103 103
104 unsigned long __meminitdata nr_kernel_pages; 104 unsigned long __meminitdata nr_kernel_pages;
105 unsigned long __meminitdata nr_all_pages; 105 unsigned long __meminitdata nr_all_pages;
106 static unsigned long __initdata dma_reserve; 106 static unsigned long __initdata dma_reserve;
107 107
108 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 108 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
109 /* 109 /*
110 * MAX_ACTIVE_REGIONS determines the maximum number of distinct 110 * MAX_ACTIVE_REGIONS determines the maximum number of distinct
111 * ranges of memory (RAM) that may be registered with add_active_range(). 111 * ranges of memory (RAM) that may be registered with add_active_range().
112 * Ranges passed to add_active_range() will be merged if possible 112 * Ranges passed to add_active_range() will be merged if possible
113 * so the number of times add_active_range() can be called is 113 * so the number of times add_active_range() can be called is
114 * related to the number of nodes and the number of holes 114 * related to the number of nodes and the number of holes
115 */ 115 */
116 #ifdef CONFIG_MAX_ACTIVE_REGIONS 116 #ifdef CONFIG_MAX_ACTIVE_REGIONS
117 /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ 117 /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */
118 #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS 118 #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS
119 #else 119 #else
120 #if MAX_NUMNODES >= 32 120 #if MAX_NUMNODES >= 32
121 /* If there can be many nodes, allow up to 50 holes per node */ 121 /* If there can be many nodes, allow up to 50 holes per node */
122 #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) 122 #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50)
123 #else 123 #else
124 /* By default, allow up to 256 distinct regions */ 124 /* By default, allow up to 256 distinct regions */
125 #define MAX_ACTIVE_REGIONS 256 125 #define MAX_ACTIVE_REGIONS 256
126 #endif 126 #endif
127 #endif 127 #endif
128 128
129 struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS]; 129 struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS];
130 int __initdata nr_nodemap_entries; 130 int __initdata nr_nodemap_entries;
131 unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; 131 unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
132 unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; 132 unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
133 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 133 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
134 unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES]; 134 unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
135 unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES]; 135 unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
136 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 136 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
137 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 137 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
138 138
139 #ifdef CONFIG_DEBUG_VM 139 #ifdef CONFIG_DEBUG_VM
140 static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 140 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
141 { 141 {
142 int ret = 0; 142 int ret = 0;
143 unsigned seq; 143 unsigned seq;
144 unsigned long pfn = page_to_pfn(page); 144 unsigned long pfn = page_to_pfn(page);
145 145
146 do { 146 do {
147 seq = zone_span_seqbegin(zone); 147 seq = zone_span_seqbegin(zone);
148 if (pfn >= zone->zone_start_pfn + zone->spanned_pages) 148 if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
149 ret = 1; 149 ret = 1;
150 else if (pfn < zone->zone_start_pfn) 150 else if (pfn < zone->zone_start_pfn)
151 ret = 1; 151 ret = 1;
152 } while (zone_span_seqretry(zone, seq)); 152 } while (zone_span_seqretry(zone, seq));
153 153
154 return ret; 154 return ret;
155 } 155 }
156 156
157 static int page_is_consistent(struct zone *zone, struct page *page) 157 static int page_is_consistent(struct zone *zone, struct page *page)
158 { 158 {
159 #ifdef CONFIG_HOLES_IN_ZONE 159 #ifdef CONFIG_HOLES_IN_ZONE
160 if (!pfn_valid(page_to_pfn(page))) 160 if (!pfn_valid(page_to_pfn(page)))
161 return 0; 161 return 0;
162 #endif 162 #endif
163 if (zone != page_zone(page)) 163 if (zone != page_zone(page))
164 return 0; 164 return 0;
165 165
166 return 1; 166 return 1;
167 } 167 }
168 /* 168 /*
169 * Temporary debugging check for pages not lying within a given zone. 169 * Temporary debugging check for pages not lying within a given zone.
170 */ 170 */
171 static int bad_range(struct zone *zone, struct page *page) 171 static int bad_range(struct zone *zone, struct page *page)
172 { 172 {
173 if (page_outside_zone_boundaries(zone, page)) 173 if (page_outside_zone_boundaries(zone, page))
174 return 1; 174 return 1;
175 if (!page_is_consistent(zone, page)) 175 if (!page_is_consistent(zone, page))
176 return 1; 176 return 1;
177 177
178 return 0; 178 return 0;
179 } 179 }
180 #else 180 #else
181 static inline int bad_range(struct zone *zone, struct page *page) 181 static inline int bad_range(struct zone *zone, struct page *page)
182 { 182 {
183 return 0; 183 return 0;
184 } 184 }
185 #endif 185 #endif
186 186
187 static void bad_page(struct page *page) 187 static void bad_page(struct page *page)
188 { 188 {
189 printk(KERN_EMERG "Bad page state in process '%s'\n" 189 printk(KERN_EMERG "Bad page state in process '%s'\n"
190 KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" 190 KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
191 KERN_EMERG "Trying to fix it up, but a reboot is needed\n" 191 KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
192 KERN_EMERG "Backtrace:\n", 192 KERN_EMERG "Backtrace:\n",
193 current->comm, page, (int)(2*sizeof(unsigned long)), 193 current->comm, page, (int)(2*sizeof(unsigned long)),
194 (unsigned long)page->flags, page->mapping, 194 (unsigned long)page->flags, page->mapping,
195 page_mapcount(page), page_count(page)); 195 page_mapcount(page), page_count(page));
196 dump_stack(); 196 dump_stack();
197 page->flags &= ~(1 << PG_lru | 197 page->flags &= ~(1 << PG_lru |
198 1 << PG_private | 198 1 << PG_private |
199 1 << PG_locked | 199 1 << PG_locked |
200 1 << PG_active | 200 1 << PG_active |
201 1 << PG_dirty | 201 1 << PG_dirty |
202 1 << PG_reclaim | 202 1 << PG_reclaim |
203 1 << PG_slab | 203 1 << PG_slab |
204 1 << PG_swapcache | 204 1 << PG_swapcache |
205 1 << PG_writeback | 205 1 << PG_writeback |
206 1 << PG_buddy ); 206 1 << PG_buddy );
207 set_page_count(page, 0); 207 set_page_count(page, 0);
208 reset_page_mapcount(page); 208 reset_page_mapcount(page);
209 page->mapping = NULL; 209 page->mapping = NULL;
210 add_taint(TAINT_BAD_PAGE); 210 add_taint(TAINT_BAD_PAGE);
211 } 211 }
212 212
213 /* 213 /*
214 * Higher-order pages are called "compound pages". They are structured thusly: 214 * Higher-order pages are called "compound pages". They are structured thusly:
215 * 215 *
216 * The first PAGE_SIZE page is called the "head page". 216 * The first PAGE_SIZE page is called the "head page".
217 * 217 *
218 * The remaining PAGE_SIZE pages are called "tail pages". 218 * The remaining PAGE_SIZE pages are called "tail pages".
219 * 219 *
220 * All pages have PG_compound set. All pages have their ->private pointing at 220 * All pages have PG_compound set. All pages have their ->private pointing at
221 * the head page (even the head page has this). 221 * the head page (even the head page has this).
222 * 222 *
223 * The first tail page's ->lru.next holds the address of the compound page's 223 * The first tail page's ->lru.next holds the address of the compound page's
224 * put_page() function. Its ->lru.prev holds the order of allocation. 224 * put_page() function. Its ->lru.prev holds the order of allocation.
225 * This usage means that zero-order pages may not be compound. 225 * This usage means that zero-order pages may not be compound.
226 */ 226 */
227 227
228 static void free_compound_page(struct page *page) 228 static void free_compound_page(struct page *page)
229 { 229 {
230 __free_pages_ok(page, (unsigned long)page[1].lru.prev); 230 __free_pages_ok(page, (unsigned long)page[1].lru.prev);
231 } 231 }
232 232
233 static void prep_compound_page(struct page *page, unsigned long order) 233 static void prep_compound_page(struct page *page, unsigned long order)
234 { 234 {
235 int i; 235 int i;
236 int nr_pages = 1 << order; 236 int nr_pages = 1 << order;
237 237
238 set_compound_page_dtor(page, free_compound_page); 238 set_compound_page_dtor(page, free_compound_page);
239 page[1].lru.prev = (void *)order; 239 page[1].lru.prev = (void *)order;
240 for (i = 0; i < nr_pages; i++) { 240 for (i = 0; i < nr_pages; i++) {
241 struct page *p = page + i; 241 struct page *p = page + i;
242 242
243 __SetPageCompound(p); 243 __SetPageCompound(p);
244 set_page_private(p, (unsigned long)page); 244 set_page_private(p, (unsigned long)page);
245 } 245 }
246 } 246 }
247 247
248 static void destroy_compound_page(struct page *page, unsigned long order) 248 static void destroy_compound_page(struct page *page, unsigned long order)
249 { 249 {
250 int i; 250 int i;
251 int nr_pages = 1 << order; 251 int nr_pages = 1 << order;
252 252
253 if (unlikely((unsigned long)page[1].lru.prev != order)) 253 if (unlikely((unsigned long)page[1].lru.prev != order))
254 bad_page(page); 254 bad_page(page);
255 255
256 for (i = 0; i < nr_pages; i++) { 256 for (i = 0; i < nr_pages; i++) {
257 struct page *p = page + i; 257 struct page *p = page + i;
258 258
259 if (unlikely(!PageCompound(p) | 259 if (unlikely(!PageCompound(p) |
260 (page_private(p) != (unsigned long)page))) 260 (page_private(p) != (unsigned long)page)))
261 bad_page(page); 261 bad_page(page);
262 __ClearPageCompound(p); 262 __ClearPageCompound(p);
263 } 263 }
264 } 264 }
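The bookkeeping described in the comment block above is easy to model outside the kernel: the allocation order lives in the first tail page's lru.prev, and every page's ->private points back at the head. A minimal userspace sketch of just that scheme (struct page_model and the function names are invented for the illustration and mirror only the fields this code touches):

#include <stdio.h>

/* Toy stand-in for struct page: only the fields the compound code uses. */
struct page_model {
	struct { void *next, *prev; } lru;
	unsigned long private;
};

static void prep_compound_model(struct page_model *page, unsigned long order)
{
	unsigned long i, nr = 1UL << order;

	page[1].lru.prev = (void *)order;		/* order in first tail page */
	for (i = 0; i < nr; i++)
		page[i].private = (unsigned long)page;	/* every page points at the head */
}

int main(void)
{
	struct page_model pages[4] = { { { NULL, NULL }, 0 } };

	prep_compound_model(pages, 2);		/* order-2 compound: 4 pages */
	printf("order read back at free time: %lu\n",
	       (unsigned long)pages[1].lru.prev);
	printf("tail 3 points at head: %s\n",
	       pages[3].private == (unsigned long)pages ? "yes" : "no");
	return 0;
}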
265 265
266 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 266 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
267 { 267 {
268 int i; 268 int i;
269 269
270 VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); 270 VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
271 /* 271 /*
272 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO 272 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
273 * and __GFP_HIGHMEM from hard or soft interrupt context. 273 * and __GFP_HIGHMEM from hard or soft interrupt context.
274 */ 274 */
275 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); 275 VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
276 for (i = 0; i < (1 << order); i++) 276 for (i = 0; i < (1 << order); i++)
277 clear_highpage(page + i); 277 clear_highpage(page + i);
278 } 278 }
279 279
280 /* 280 /*
281 * Functions for dealing with a page's order in the buddy system. 281 * Functions for dealing with a page's order in the buddy system.
282 * zone->lock is already acquired when we use these. 282 * zone->lock is already acquired when we use these.
283 * So, we don't need atomic page->flags operations here. 283 * So, we don't need atomic page->flags operations here.
284 */ 284 */
285 static inline unsigned long page_order(struct page *page) 285 static inline unsigned long page_order(struct page *page)
286 { 286 {
287 return page_private(page); 287 return page_private(page);
288 } 288 }
289 289
290 static inline void set_page_order(struct page *page, int order) 290 static inline void set_page_order(struct page *page, int order)
291 { 291 {
292 set_page_private(page, order); 292 set_page_private(page, order);
293 __SetPageBuddy(page); 293 __SetPageBuddy(page);
294 } 294 }
295 295
296 static inline void rmv_page_order(struct page *page) 296 static inline void rmv_page_order(struct page *page)
297 { 297 {
298 __ClearPageBuddy(page); 298 __ClearPageBuddy(page);
299 set_page_private(page, 0); 299 set_page_private(page, 0);
300 } 300 }
301 301
302 /* 302 /*
303 * Locate the struct page for both the matching buddy in our 303 * Locate the struct page for both the matching buddy in our
304 * pair (buddy1) and the combined order-(O+1) page they form (page). 304 * pair (buddy1) and the combined order-(O+1) page they form (page).
305 * 305 *
306 * 1) Any buddy B1 will have an order O twin B2 which satisfies 306 * 1) Any buddy B1 will have an order O twin B2 which satisfies
307 * the following equation: 307 * the following equation:
308 * B2 = B1 ^ (1 << O) 308 * B2 = B1 ^ (1 << O)
309 * For example, if the starting buddy (buddy2) is #8 its order 309 * For example, if the starting buddy (buddy2) is #8 its order
310 * 1 buddy is #10: 310 * 1 buddy is #10:
311 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 311 * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
312 * 312 *
313 * 2) Any buddy B will have an order O+1 parent P which 313 * 2) Any buddy B will have an order O+1 parent P which
314 * satisfies the following equation: 314 * satisfies the following equation:
315 * P = B & ~(1 << O) 315 * P = B & ~(1 << O)
316 * 316 *
317 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 317 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
318 */ 318 */
319 static inline struct page * 319 static inline struct page *
320 __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) 320 __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
321 { 321 {
322 unsigned long buddy_idx = page_idx ^ (1 << order); 322 unsigned long buddy_idx = page_idx ^ (1 << order);
323 323
324 return page + (buddy_idx - page_idx); 324 return page + (buddy_idx - page_idx);
325 } 325 }
326 326
327 static inline unsigned long 327 static inline unsigned long
328 __find_combined_index(unsigned long page_idx, unsigned int order) 328 __find_combined_index(unsigned long page_idx, unsigned int order)
329 { 329 {
330 return (page_idx & ~(1 << order)); 330 return (page_idx & ~(1 << order));
331 } 331 }
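The XOR/AND identities in the comment above are worth sanity-checking once with concrete numbers. A tiny standalone sketch, plain C with nothing kernel-specific, that walks the comment's example index 8 up through a few orders:

#include <stdio.h>

int main(void)
{
	unsigned long idx = 8;	/* the example index from the comment */
	unsigned int order;

	for (order = 0; order < 3; order++) {
		unsigned long buddy = idx ^ (1UL << order);	/* B2 = B1 ^ (1 << O) */
		unsigned long parent = idx & ~(1UL << order);	/* P  = B  & ~(1 << O) */

		printf("order %u: buddy of %lu is %lu, combined index %lu\n",
		       order, idx, buddy, parent);
		idx = parent;	/* pretend the merge succeeded and go up a level */
	}
	return 0;
}

At order 1 this prints buddy 10 for index 8, matching the worked example in the comment.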
332 332
333 /* 333 /*
334 * This function checks whether a page is free && is the buddy 334 * This function checks whether a page is free && is the buddy
335 * we can coalesce a page and its buddy if 335 * we can coalesce a page and its buddy if
336 * (a) the buddy is not in a hole && 336 * (a) the buddy is not in a hole &&
337 * (b) the buddy is in the buddy system && 337 * (b) the buddy is in the buddy system &&
338 * (c) a page and its buddy have the same order && 338 * (c) a page and its buddy have the same order &&
339 * (d) a page and its buddy are in the same zone. 339 * (d) a page and its buddy are in the same zone.
340 * 340 *
341 * For recording whether a page is in the buddy system, we use PG_buddy. 341 * For recording whether a page is in the buddy system, we use PG_buddy.
342 * Setting, clearing, and testing PG_buddy is serialized by zone->lock. 342 * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
343 * 343 *
344 * For recording page's order, we use page_private(page). 344 * For recording page's order, we use page_private(page).
345 */ 345 */
346 static inline int page_is_buddy(struct page *page, struct page *buddy, 346 static inline int page_is_buddy(struct page *page, struct page *buddy,
347 int order) 347 int order)
348 { 348 {
349 #ifdef CONFIG_HOLES_IN_ZONE 349 #ifdef CONFIG_HOLES_IN_ZONE
350 if (!pfn_valid(page_to_pfn(buddy))) 350 if (!pfn_valid(page_to_pfn(buddy)))
351 return 0; 351 return 0;
352 #endif 352 #endif
353 353
354 if (page_zone_id(page) != page_zone_id(buddy)) 354 if (page_zone_id(page) != page_zone_id(buddy))
355 return 0; 355 return 0;
356 356
357 if (PageBuddy(buddy) && page_order(buddy) == order) { 357 if (PageBuddy(buddy) && page_order(buddy) == order) {
358 BUG_ON(page_count(buddy) != 0); 358 BUG_ON(page_count(buddy) != 0);
359 return 1; 359 return 1;
360 } 360 }
361 return 0; 361 return 0;
362 } 362 }
363 363
364 /* 364 /*
365 * Freeing function for a buddy system allocator. 365 * Freeing function for a buddy system allocator.
366 * 366 *
367 * The concept of a buddy system is to maintain a direct-mapped table 367 * The concept of a buddy system is to maintain a direct-mapped table
368 * (containing bit values) for memory blocks of various "orders". 368 * (containing bit values) for memory blocks of various "orders".
369 * The bottom level table contains the map for the smallest allocatable 369 * The bottom level table contains the map for the smallest allocatable
370 * units of memory (here, pages), and each level above it describes 370 * units of memory (here, pages), and each level above it describes
371 * pairs of units from the levels below, hence, "buddies". 371 * pairs of units from the levels below, hence, "buddies".
372 * At a high level, all that happens here is marking the table entry 372 * At a high level, all that happens here is marking the table entry
373 * at the bottom level available, and propagating the changes upward 373 * at the bottom level available, and propagating the changes upward
374 * as necessary, plus some accounting needed to play nicely with other 374 * as necessary, plus some accounting needed to play nicely with other
375 * parts of the VM system. 375 * parts of the VM system.
376 * At each level, we keep a list of pages, which are heads of contiguous 376 * At each level, we keep a list of pages, which are heads of contiguous
377 * free pages of length (1 << order), marked with PG_buddy. The page's 377 * free pages of length (1 << order), marked with PG_buddy. The page's
378 * order is recorded in the page_private(page) field. 378 * order is recorded in the page_private(page) field.
379 * So when we are allocating or freeing one, we can derive the state of the 379 * So when we are allocating or freeing one, we can derive the state of the
380 * other. That is, if we allocate a small block, and both were 380 * other. That is, if we allocate a small block, and both were
381 * free, the remainder of the region must be split into blocks. 381 * free, the remainder of the region must be split into blocks.
382 * If a block is freed, and its buddy is also free, then this 382 * If a block is freed, and its buddy is also free, then this
383 * triggers coalescing into a block of larger size. 383 * triggers coalescing into a block of larger size.
384 * 384 *
385 * -- wli 385 * -- wli
386 */ 386 */
387 387
388 static inline void __free_one_page(struct page *page, 388 static inline void __free_one_page(struct page *page,
389 struct zone *zone, unsigned int order) 389 struct zone *zone, unsigned int order)
390 { 390 {
391 unsigned long page_idx; 391 unsigned long page_idx;
392 int order_size = 1 << order; 392 int order_size = 1 << order;
393 393
394 if (unlikely(PageCompound(page))) 394 if (unlikely(PageCompound(page)))
395 destroy_compound_page(page, order); 395 destroy_compound_page(page, order);
396 396
397 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 397 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
398 398
399 VM_BUG_ON(page_idx & (order_size - 1)); 399 VM_BUG_ON(page_idx & (order_size - 1));
400 VM_BUG_ON(bad_range(zone, page)); 400 VM_BUG_ON(bad_range(zone, page));
401 401
402 __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); 402 __mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
403 while (order < MAX_ORDER-1) { 403 while (order < MAX_ORDER-1) {
404 unsigned long combined_idx; 404 unsigned long combined_idx;
405 struct free_area *area; 405 struct free_area *area;
406 struct page *buddy; 406 struct page *buddy;
407 407
408 buddy = __page_find_buddy(page, page_idx, order); 408 buddy = __page_find_buddy(page, page_idx, order);
409 if (!page_is_buddy(page, buddy, order)) 409 if (!page_is_buddy(page, buddy, order))
410 break; /* Move the buddy up one level. */ 410 break; /* Move the buddy up one level. */
411 411
412 list_del(&buddy->lru); 412 list_del(&buddy->lru);
413 area = zone->free_area + order; 413 area = zone->free_area + order;
414 area->nr_free--; 414 area->nr_free--;
415 rmv_page_order(buddy); 415 rmv_page_order(buddy);
416 combined_idx = __find_combined_index(page_idx, order); 416 combined_idx = __find_combined_index(page_idx, order);
417 page = page + (combined_idx - page_idx); 417 page = page + (combined_idx - page_idx);
418 page_idx = combined_idx; 418 page_idx = combined_idx;
419 order++; 419 order++;
420 } 420 }
421 set_page_order(page, order); 421 set_page_order(page, order);
422 list_add(&page->lru, &zone->free_area[order].free_list); 422 list_add(&page->lru, &zone->free_area[order].free_list);
423 zone->free_area[order].nr_free++; 423 zone->free_area[order].nr_free++;
424 } 424 }
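To see the coalescing loop of __free_one_page() in isolation, here is a small userspace model under obviously simplified assumptions: a 16-page "zone", free blocks tracked in a per-order boolean table instead of list_heads, and local names (MODEL_MAX_ORDER, model_free_one_page) that exist only in this sketch.

#include <stdio.h>
#include <stdbool.h>

#define MODEL_MAX_ORDER 5
#define NPAGES 16

/* free_block[o][i] is true when a free block of order o starts at index i */
static bool free_block[MODEL_MAX_ORDER][NPAGES];

static void model_free_one_page(unsigned long idx)
{
	unsigned int order = 0;

	while (order < MODEL_MAX_ORDER - 1) {
		unsigned long buddy = idx ^ (1UL << order);

		if (!free_block[order][buddy])
			break;				/* buddy busy: stop merging */
		free_block[order][buddy] = false;	/* pull the buddy off its list */
		idx &= ~(1UL << order);			/* index of the combined block */
		order++;
	}
	free_block[order][idx] = true;	/* park the (possibly merged) block */
}

int main(void)
{
	unsigned long i;
	unsigned int o;

	for (i = 0; i < 4; i++)		/* free pages 0..3 one at a time */
		model_free_one_page(i);

	for (o = 0; o < MODEL_MAX_ORDER; o++)
		for (i = 0; i < NPAGES; i++)
			if (free_block[o][i])
				printf("free block: order %u at index %lu\n", o, i);
	return 0;
}

Freeing pages 0 through 3 one by one ends with a single order-2 block at index 0, which is exactly the cascade the while loop above performs.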
425 425
426 static inline int free_pages_check(struct page *page) 426 static inline int free_pages_check(struct page *page)
427 { 427 {
428 if (unlikely(page_mapcount(page) | 428 if (unlikely(page_mapcount(page) |
429 (page->mapping != NULL) | 429 (page->mapping != NULL) |
430 (page_count(page) != 0) | 430 (page_count(page) != 0) |
431 (page->flags & ( 431 (page->flags & (
432 1 << PG_lru | 432 1 << PG_lru |
433 1 << PG_private | 433 1 << PG_private |
434 1 << PG_locked | 434 1 << PG_locked |
435 1 << PG_active | 435 1 << PG_active |
436 1 << PG_reclaim | 436 1 << PG_reclaim |
437 1 << PG_slab | 437 1 << PG_slab |
438 1 << PG_swapcache | 438 1 << PG_swapcache |
439 1 << PG_writeback | 439 1 << PG_writeback |
440 1 << PG_reserved | 440 1 << PG_reserved |
441 1 << PG_buddy )))) 441 1 << PG_buddy ))))
442 bad_page(page); 442 bad_page(page);
443 if (PageDirty(page)) 443 if (PageDirty(page))
444 __ClearPageDirty(page); 444 __ClearPageDirty(page);
445 /* 445 /*
446 * For now, we report if PG_reserved was found set, but do not 446 * For now, we report if PG_reserved was found set, but do not
447 * clear it, and do not free the page. But we shall soon need 447 * clear it, and do not free the page. But we shall soon need
448 * to do more, for when the ZERO_PAGE count wraps negative. 448 * to do more, for when the ZERO_PAGE count wraps negative.
449 */ 449 */
450 return PageReserved(page); 450 return PageReserved(page);
451 } 451 }
452 452
453 /* 453 /*
454 * Frees a list of pages. 454 * Frees a list of pages.
455 * Assumes all pages on list are in same zone, and of same order. 455 * Assumes all pages on list are in same zone, and of same order.
456 * count is the number of pages to free. 456 * count is the number of pages to free.
457 * 457 *
458 * If the zone was previously in an "all pages pinned" state then look to 458 * If the zone was previously in an "all pages pinned" state then look to
459 * see if this freeing clears that state. 459 * see if this freeing clears that state.
460 * 460 *
461 * And clear the zone's pages_scanned counter, to hold off the "all pages are 461 * And clear the zone's pages_scanned counter, to hold off the "all pages are
462 * pinned" detection logic. 462 * pinned" detection logic.
463 */ 463 */
464 static void free_pages_bulk(struct zone *zone, int count, 464 static void free_pages_bulk(struct zone *zone, int count,
465 struct list_head *list, int order) 465 struct list_head *list, int order)
466 { 466 {
467 spin_lock(&zone->lock); 467 spin_lock(&zone->lock);
468 zone->all_unreclaimable = 0; 468 zone->all_unreclaimable = 0;
469 zone->pages_scanned = 0; 469 zone->pages_scanned = 0;
470 while (count--) { 470 while (count--) {
471 struct page *page; 471 struct page *page;
472 472
473 VM_BUG_ON(list_empty(list)); 473 VM_BUG_ON(list_empty(list));
474 page = list_entry(list->prev, struct page, lru); 474 page = list_entry(list->prev, struct page, lru);
475 /* have to delete it as __free_one_page manipulates the list */ 475 /* have to delete it as __free_one_page manipulates the list */
476 list_del(&page->lru); 476 list_del(&page->lru);
477 __free_one_page(page, zone, order); 477 __free_one_page(page, zone, order);
478 } 478 }
479 spin_unlock(&zone->lock); 479 spin_unlock(&zone->lock);
480 } 480 }
481 481
482 static void free_one_page(struct zone *zone, struct page *page, int order) 482 static void free_one_page(struct zone *zone, struct page *page, int order)
483 { 483 {
484 spin_lock(&zone->lock); 484 spin_lock(&zone->lock);
485 zone->all_unreclaimable = 0; 485 zone->all_unreclaimable = 0;
486 zone->pages_scanned = 0; 486 zone->pages_scanned = 0;
487 __free_one_page(page, zone, order); 487 __free_one_page(page, zone, order);
488 spin_unlock(&zone->lock); 488 spin_unlock(&zone->lock);
489 } 489 }
490 490
491 static void __free_pages_ok(struct page *page, unsigned int order) 491 static void __free_pages_ok(struct page *page, unsigned int order)
492 { 492 {
493 unsigned long flags; 493 unsigned long flags;
494 int i; 494 int i;
495 int reserved = 0; 495 int reserved = 0;
496 496
497 for (i = 0 ; i < (1 << order) ; ++i) 497 for (i = 0 ; i < (1 << order) ; ++i)
498 reserved += free_pages_check(page + i); 498 reserved += free_pages_check(page + i);
499 if (reserved) 499 if (reserved)
500 return; 500 return;
501 501
502 if (!PageHighMem(page)) 502 if (!PageHighMem(page))
503 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); 503 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
504 arch_free_page(page, order); 504 arch_free_page(page, order);
505 kernel_map_pages(page, 1 << order, 0); 505 kernel_map_pages(page, 1 << order, 0);
506 506
507 local_irq_save(flags); 507 local_irq_save(flags);
508 __count_vm_events(PGFREE, 1 << order); 508 __count_vm_events(PGFREE, 1 << order);
509 free_one_page(page_zone(page), page, order); 509 free_one_page(page_zone(page), page, order);
510 local_irq_restore(flags); 510 local_irq_restore(flags);
511 } 511 }
512 512
513 /* 513 /*
514 * permit the bootmem allocator to evade page validation on high-order frees 514 * permit the bootmem allocator to evade page validation on high-order frees
515 */ 515 */
516 void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) 516 void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
517 { 517 {
518 if (order == 0) { 518 if (order == 0) {
519 __ClearPageReserved(page); 519 __ClearPageReserved(page);
520 set_page_count(page, 0); 520 set_page_count(page, 0);
521 set_page_refcounted(page); 521 set_page_refcounted(page);
522 __free_page(page); 522 __free_page(page);
523 } else { 523 } else {
524 int loop; 524 int loop;
525 525
526 prefetchw(page); 526 prefetchw(page);
527 for (loop = 0; loop < BITS_PER_LONG; loop++) { 527 for (loop = 0; loop < BITS_PER_LONG; loop++) {
528 struct page *p = &page[loop]; 528 struct page *p = &page[loop];
529 529
530 if (loop + 1 < BITS_PER_LONG) 530 if (loop + 1 < BITS_PER_LONG)
531 prefetchw(p + 1); 531 prefetchw(p + 1);
532 __ClearPageReserved(p); 532 __ClearPageReserved(p);
533 set_page_count(p, 0); 533 set_page_count(p, 0);
534 } 534 }
535 535
536 set_page_refcounted(page); 536 set_page_refcounted(page);
537 __free_pages(page, order); 537 __free_pages(page, order);
538 } 538 }
539 } 539 }
540 540
541 541
542 /* 542 /*
543 * The order of subdivision here is critical for the IO subsystem. 543 * The order of subdivision here is critical for the IO subsystem.
544 * Please do not alter this order without good reasons and regression 544 * Please do not alter this order without good reasons and regression
545 * testing. Specifically, as large blocks of memory are subdivided, 545 * testing. Specifically, as large blocks of memory are subdivided,
546 * the order in which smaller blocks are delivered depends on the order 546 * the order in which smaller blocks are delivered depends on the order
547 * they're subdivided in this function. This is the primary factor 547 * they're subdivided in this function. This is the primary factor
548 * influencing the order in which pages are delivered to the IO 548 * influencing the order in which pages are delivered to the IO
549 * subsystem according to empirical testing, and this is also justified 549 * subsystem according to empirical testing, and this is also justified
550 * by considering the behavior of a buddy system containing a single 550 * by considering the behavior of a buddy system containing a single
551 * large block of memory acted on by a series of small allocations. 551 * large block of memory acted on by a series of small allocations.
552 * This behavior is a critical factor in sglist merging's success. 552 * This behavior is a critical factor in sglist merging's success.
553 * 553 *
554 * -- wli 554 * -- wli
555 */ 555 */
556 static inline void expand(struct zone *zone, struct page *page, 556 static inline void expand(struct zone *zone, struct page *page,
557 int low, int high, struct free_area *area) 557 int low, int high, struct free_area *area)
558 { 558 {
559 unsigned long size = 1 << high; 559 unsigned long size = 1 << high;
560 560
561 while (high > low) { 561 while (high > low) {
562 area--; 562 area--;
563 high--; 563 high--;
564 size >>= 1; 564 size >>= 1;
565 VM_BUG_ON(bad_range(zone, &page[size])); 565 VM_BUG_ON(bad_range(zone, &page[size]));
566 list_add(&page[size].lru, &area->free_list); 566 list_add(&page[size].lru, &area->free_list);
567 area->nr_free++; 567 area->nr_free++;
568 set_page_order(&page[size], high); 568 set_page_order(&page[size], high);
569 } 569 }
570 } 570 }
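expand() peels the upper half off the block at each order on the way down, which is the delivery order the comment before it cares about. A standalone sketch of just that splitting arithmetic (indices and sizes only, no free lists), assuming an order-3 block is being used to satisfy an order-0 request:

#include <stdio.h>

int main(void)
{
	unsigned int low = 0, high = 3;		/* want order 0 out of an order-3 block */
	unsigned long size = 1UL << high;
	unsigned long offset = 0;		/* start of the block being split */

	while (high > low) {
		high--;
		size >>= 1;
		/* the upper half goes back on the order-'high' free list */
		printf("put back: order %u block at offset %lu (%lu pages)\n",
		       high, offset + size, size);
	}
	printf("hand out: order %u page at offset %lu\n", low, offset);
	return 0;
}

The remainders come back at offsets 4, 2 and 1 with sizes 4, 2 and 1, mirroring the list_add(&page[size].lru, ...) calls above.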
571 571
572 /* 572 /*
573 * This page is about to be returned from the page allocator 573 * This page is about to be returned from the page allocator
574 */ 574 */
575 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 575 static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
576 { 576 {
577 if (unlikely(page_mapcount(page) | 577 if (unlikely(page_mapcount(page) |
578 (page->mapping != NULL) | 578 (page->mapping != NULL) |
579 (page_count(page) != 0) | 579 (page_count(page) != 0) |
580 (page->flags & ( 580 (page->flags & (
581 1 << PG_lru | 581 1 << PG_lru |
582 1 << PG_private | 582 1 << PG_private |
583 1 << PG_locked | 583 1 << PG_locked |
584 1 << PG_active | 584 1 << PG_active |
585 1 << PG_dirty | 585 1 << PG_dirty |
586 1 << PG_reclaim | 586 1 << PG_reclaim |
587 1 << PG_slab | 587 1 << PG_slab |
588 1 << PG_swapcache | 588 1 << PG_swapcache |
589 1 << PG_writeback | 589 1 << PG_writeback |
590 1 << PG_reserved | 590 1 << PG_reserved |
591 1 << PG_buddy )))) 591 1 << PG_buddy ))))
592 bad_page(page); 592 bad_page(page);
593 593
594 /* 594 /*
595 * For now, we report if PG_reserved was found set, but do not 595 * For now, we report if PG_reserved was found set, but do not
596 * clear it, and do not allocate the page: as a safety net. 596 * clear it, and do not allocate the page: as a safety net.
597 */ 597 */
598 if (PageReserved(page)) 598 if (PageReserved(page))
599 return 1; 599 return 1;
600 600
601 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 601 page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
602 1 << PG_referenced | 1 << PG_arch_1 | 602 1 << PG_referenced | 1 << PG_arch_1 |
603 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); 603 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
604 set_page_private(page, 0); 604 set_page_private(page, 0);
605 set_page_refcounted(page); 605 set_page_refcounted(page);
606 606
607 arch_alloc_page(page, order); 607 arch_alloc_page(page, order);
608 kernel_map_pages(page, 1 << order, 1); 608 kernel_map_pages(page, 1 << order, 1);
609 609
610 if (gfp_flags & __GFP_ZERO) 610 if (gfp_flags & __GFP_ZERO)
611 prep_zero_page(page, order, gfp_flags); 611 prep_zero_page(page, order, gfp_flags);
612 612
613 if (order && (gfp_flags & __GFP_COMP)) 613 if (order && (gfp_flags & __GFP_COMP))
614 prep_compound_page(page, order); 614 prep_compound_page(page, order);
615 615
616 return 0; 616 return 0;
617 } 617 }
618 618
619 /* 619 /*
620 * Do the hard work of removing an element from the buddy allocator. 620 * Do the hard work of removing an element from the buddy allocator.
621 * Call me with the zone->lock already held. 621 * Call me with the zone->lock already held.
622 */ 622 */
623 static struct page *__rmqueue(struct zone *zone, unsigned int order) 623 static struct page *__rmqueue(struct zone *zone, unsigned int order)
624 { 624 {
625 struct free_area * area; 625 struct free_area * area;
626 unsigned int current_order; 626 unsigned int current_order;
627 struct page *page; 627 struct page *page;
628 628
629 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 629 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
630 area = zone->free_area + current_order; 630 area = zone->free_area + current_order;
631 if (list_empty(&area->free_list)) 631 if (list_empty(&area->free_list))
632 continue; 632 continue;
633 633
634 page = list_entry(area->free_list.next, struct page, lru); 634 page = list_entry(area->free_list.next, struct page, lru);
635 list_del(&page->lru); 635 list_del(&page->lru);
636 rmv_page_order(page); 636 rmv_page_order(page);
637 area->nr_free--; 637 area->nr_free--;
638 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); 638 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
639 expand(zone, page, order, current_order, area); 639 expand(zone, page, order, current_order, area);
640 return page; 640 return page;
641 } 641 }
642 642
643 return NULL; 643 return NULL;
644 } 644 }
645 645
646 /* 646 /*
647 * Obtain a specified number of elements from the buddy allocator, all under 647 * Obtain a specified number of elements from the buddy allocator, all under
648 * a single hold of the lock, for efficiency. Add them to the supplied list. 648 * a single hold of the lock, for efficiency. Add them to the supplied list.
649 * Returns the number of new pages which were placed at *list. 649 * Returns the number of new pages which were placed at *list.
650 */ 650 */
651 static int rmqueue_bulk(struct zone *zone, unsigned int order, 651 static int rmqueue_bulk(struct zone *zone, unsigned int order,
652 unsigned long count, struct list_head *list) 652 unsigned long count, struct list_head *list)
653 { 653 {
654 int i; 654 int i;
655 655
656 spin_lock(&zone->lock); 656 spin_lock(&zone->lock);
657 for (i = 0; i < count; ++i) { 657 for (i = 0; i < count; ++i) {
658 struct page *page = __rmqueue(zone, order); 658 struct page *page = __rmqueue(zone, order);
659 if (unlikely(page == NULL)) 659 if (unlikely(page == NULL))
660 break; 660 break;
661 list_add_tail(&page->lru, list); 661 list_add_tail(&page->lru, list);
662 } 662 }
663 spin_unlock(&zone->lock); 663 spin_unlock(&zone->lock);
664 return i; 664 return i;
665 } 665 }
666 666
667 #if MAX_NUMNODES > 1 667 #if MAX_NUMNODES > 1
668 int nr_node_ids __read_mostly; 668 int nr_node_ids __read_mostly = MAX_NUMNODES;
669 EXPORT_SYMBOL(nr_node_ids); 669 EXPORT_SYMBOL(nr_node_ids);
670 670
671 /* 671 /*
672 * Figure out the number of possible node ids. 672 * Figure out the number of possible node ids.
673 */ 673 */
674 static void __init setup_nr_node_ids(void) 674 static void __init setup_nr_node_ids(void)
675 { 675 {
676 unsigned int node; 676 unsigned int node;
677 unsigned int highest = 0; 677 unsigned int highest = 0;
678 678
679 for_each_node_mask(node, node_possible_map) 679 for_each_node_mask(node, node_possible_map)
680 highest = node; 680 highest = node;
681 nr_node_ids = highest + 1; 681 nr_node_ids = highest + 1;
682 } 682 }
683 #else 683 #else
684 static void __init setup_nr_node_ids(void) {} 684 static void __init setup_nr_node_ids(void) {}
685 #endif 685 #endif
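This hunk is half the point of the patch: nr_node_ids now starts at MAX_NUMNODES instead of 0, so a per-node array sized from it before setup_nr_node_ids() runs is at worst oversized, never empty. A hedged sketch of what such an accidental early user could look like; the function and array below are hypothetical and not part of this patch:

/*
 * Hypothetical early-boot consumer, for illustration only: sizing a
 * per-node array from nr_node_ids before setup_nr_node_ids() has
 * trimmed it.  With the new MAX_NUMNODES default the allocation can
 * only be too large; with the old default of 0 it would be empty.
 */
static unsigned long *node_counters __initdata;

static void __init early_node_counters_init(void)
{
	node_counters = alloc_bootmem(nr_node_ids * sizeof(*node_counters));
}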
686 686
687 #ifdef CONFIG_NUMA 687 #ifdef CONFIG_NUMA
688 /* 688 /*
689 * Called from the slab reaper to drain pagesets on a particular node that 689 * Called from the slab reaper to drain pagesets on a particular node that
690 * belongs to the currently executing processor. 690 * belongs to the currently executing processor.
691 * Note that this function must be called with the thread pinned to 691 * Note that this function must be called with the thread pinned to
692 * a single processor. 692 * a single processor.
693 */ 693 */
694 void drain_node_pages(int nodeid) 694 void drain_node_pages(int nodeid)
695 { 695 {
696 int i; 696 int i;
697 enum zone_type z; 697 enum zone_type z;
698 unsigned long flags; 698 unsigned long flags;
699 699
700 for (z = 0; z < MAX_NR_ZONES; z++) { 700 for (z = 0; z < MAX_NR_ZONES; z++) {
701 struct zone *zone = NODE_DATA(nodeid)->node_zones + z; 701 struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
702 struct per_cpu_pageset *pset; 702 struct per_cpu_pageset *pset;
703 703
704 if (!populated_zone(zone)) 704 if (!populated_zone(zone))
705 continue; 705 continue;
706 706
707 pset = zone_pcp(zone, smp_processor_id()); 707 pset = zone_pcp(zone, smp_processor_id());
708 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 708 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
709 struct per_cpu_pages *pcp; 709 struct per_cpu_pages *pcp;
710 710
711 pcp = &pset->pcp[i]; 711 pcp = &pset->pcp[i];
712 if (pcp->count) { 712 if (pcp->count) {
713 int to_drain; 713 int to_drain;
714 714
715 local_irq_save(flags); 715 local_irq_save(flags);
716 if (pcp->count >= pcp->batch) 716 if (pcp->count >= pcp->batch)
717 to_drain = pcp->batch; 717 to_drain = pcp->batch;
718 else 718 else
719 to_drain = pcp->count; 719 to_drain = pcp->count;
720 free_pages_bulk(zone, to_drain, &pcp->list, 0); 720 free_pages_bulk(zone, to_drain, &pcp->list, 0);
721 pcp->count -= to_drain; 721 pcp->count -= to_drain;
722 local_irq_restore(flags); 722 local_irq_restore(flags);
723 } 723 }
724 } 724 }
725 } 725 }
726 } 726 }
727 #endif 727 #endif
728 728
729 static void __drain_pages(unsigned int cpu) 729 static void __drain_pages(unsigned int cpu)
730 { 730 {
731 unsigned long flags; 731 unsigned long flags;
732 struct zone *zone; 732 struct zone *zone;
733 int i; 733 int i;
734 734
735 for_each_zone(zone) { 735 for_each_zone(zone) {
736 struct per_cpu_pageset *pset; 736 struct per_cpu_pageset *pset;
737 737
738 if (!populated_zone(zone)) 738 if (!populated_zone(zone))
739 continue; 739 continue;
740 740
741 pset = zone_pcp(zone, cpu); 741 pset = zone_pcp(zone, cpu);
742 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 742 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
743 struct per_cpu_pages *pcp; 743 struct per_cpu_pages *pcp;
744 744
745 pcp = &pset->pcp[i]; 745 pcp = &pset->pcp[i];
746 local_irq_save(flags); 746 local_irq_save(flags);
747 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 747 free_pages_bulk(zone, pcp->count, &pcp->list, 0);
748 pcp->count = 0; 748 pcp->count = 0;
749 local_irq_restore(flags); 749 local_irq_restore(flags);
750 } 750 }
751 } 751 }
752 } 752 }
753 753
754 #ifdef CONFIG_PM 754 #ifdef CONFIG_PM
755 755
756 void mark_free_pages(struct zone *zone) 756 void mark_free_pages(struct zone *zone)
757 { 757 {
758 unsigned long pfn, max_zone_pfn; 758 unsigned long pfn, max_zone_pfn;
759 unsigned long flags; 759 unsigned long flags;
760 int order; 760 int order;
761 struct list_head *curr; 761 struct list_head *curr;
762 762
763 if (!zone->spanned_pages) 763 if (!zone->spanned_pages)
764 return; 764 return;
765 765
766 spin_lock_irqsave(&zone->lock, flags); 766 spin_lock_irqsave(&zone->lock, flags);
767 767
768 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; 768 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
769 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) 769 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
770 if (pfn_valid(pfn)) { 770 if (pfn_valid(pfn)) {
771 struct page *page = pfn_to_page(pfn); 771 struct page *page = pfn_to_page(pfn);
772 772
773 if (!PageNosave(page)) 773 if (!PageNosave(page))
774 ClearPageNosaveFree(page); 774 ClearPageNosaveFree(page);
775 } 775 }
776 776
777 for (order = MAX_ORDER - 1; order >= 0; --order) 777 for (order = MAX_ORDER - 1; order >= 0; --order)
778 list_for_each(curr, &zone->free_area[order].free_list) { 778 list_for_each(curr, &zone->free_area[order].free_list) {
779 unsigned long i; 779 unsigned long i;
780 780
781 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 781 pfn = page_to_pfn(list_entry(curr, struct page, lru));
782 for (i = 0; i < (1UL << order); i++) 782 for (i = 0; i < (1UL << order); i++)
783 SetPageNosaveFree(pfn_to_page(pfn + i)); 783 SetPageNosaveFree(pfn_to_page(pfn + i));
784 } 784 }
785 785
786 spin_unlock_irqrestore(&zone->lock, flags); 786 spin_unlock_irqrestore(&zone->lock, flags);
787 } 787 }
788 788
789 /* 789 /*
790 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 790 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
791 */ 791 */
792 void drain_local_pages(void) 792 void drain_local_pages(void)
793 { 793 {
794 unsigned long flags; 794 unsigned long flags;
795 795
796 local_irq_save(flags); 796 local_irq_save(flags);
797 __drain_pages(smp_processor_id()); 797 __drain_pages(smp_processor_id());
798 local_irq_restore(flags); 798 local_irq_restore(flags);
799 } 799 }
800 #endif /* CONFIG_PM */ 800 #endif /* CONFIG_PM */
801 801
802 /* 802 /*
803 * Free a 0-order page 803 * Free a 0-order page
804 */ 804 */
805 static void fastcall free_hot_cold_page(struct page *page, int cold) 805 static void fastcall free_hot_cold_page(struct page *page, int cold)
806 { 806 {
807 struct zone *zone = page_zone(page); 807 struct zone *zone = page_zone(page);
808 struct per_cpu_pages *pcp; 808 struct per_cpu_pages *pcp;
809 unsigned long flags; 809 unsigned long flags;
810 810
811 if (PageAnon(page)) 811 if (PageAnon(page))
812 page->mapping = NULL; 812 page->mapping = NULL;
813 if (free_pages_check(page)) 813 if (free_pages_check(page))
814 return; 814 return;
815 815
816 if (!PageHighMem(page)) 816 if (!PageHighMem(page))
817 debug_check_no_locks_freed(page_address(page), PAGE_SIZE); 817 debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
818 arch_free_page(page, 0); 818 arch_free_page(page, 0);
819 kernel_map_pages(page, 1, 0); 819 kernel_map_pages(page, 1, 0);
820 820
821 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 821 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
822 local_irq_save(flags); 822 local_irq_save(flags);
823 __count_vm_event(PGFREE); 823 __count_vm_event(PGFREE);
824 list_add(&page->lru, &pcp->list); 824 list_add(&page->lru, &pcp->list);
825 pcp->count++; 825 pcp->count++;
826 if (pcp->count >= pcp->high) { 826 if (pcp->count >= pcp->high) {
827 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 827 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
828 pcp->count -= pcp->batch; 828 pcp->count -= pcp->batch;
829 } 829 }
830 local_irq_restore(flags); 830 local_irq_restore(flags);
831 put_cpu(); 831 put_cpu();
832 } 832 }
833 833
834 void fastcall free_hot_page(struct page *page) 834 void fastcall free_hot_page(struct page *page)
835 { 835 {
836 free_hot_cold_page(page, 0); 836 free_hot_cold_page(page, 0);
837 } 837 }
838 838
839 void fastcall free_cold_page(struct page *page) 839 void fastcall free_cold_page(struct page *page)
840 { 840 {
841 free_hot_cold_page(page, 1); 841 free_hot_cold_page(page, 1);
842 } 842 }
843 843
844 /* 844 /*
845 * split_page takes a non-compound higher-order page, and splits it into 845 * split_page takes a non-compound higher-order page, and splits it into
846 * n (1<<order) sub-pages: page[0..n-1] 846 * n (1<<order) sub-pages: page[0..n-1]
847 * Each sub-page must be freed individually. 847 * Each sub-page must be freed individually.
848 * 848 *
849 * Note: this is probably too low level an operation for use in drivers. 849 * Note: this is probably too low level an operation for use in drivers.
850 * Please consult with lkml before using this in your driver. 850 * Please consult with lkml before using this in your driver.
851 */ 851 */
852 void split_page(struct page *page, unsigned int order) 852 void split_page(struct page *page, unsigned int order)
853 { 853 {
854 int i; 854 int i;
855 855
856 VM_BUG_ON(PageCompound(page)); 856 VM_BUG_ON(PageCompound(page));
857 VM_BUG_ON(!page_count(page)); 857 VM_BUG_ON(!page_count(page));
858 for (i = 1; i < (1 << order); i++) 858 for (i = 1; i < (1 << order); i++)
859 set_page_refcounted(page + i); 859 set_page_refcounted(page + i);
860 } 860 }
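For context, a sketch of how a caller might use split_page(); this fragment is not from the patch, and the GFP flags and error handling are illustrative only. The allocation must be non-compound (no __GFP_COMP), and after the split each page carries its own reference and can be freed independently:

	struct page *page = alloc_pages(GFP_KERNEL, 2);	/* order-2, non-compound */
	int i;

	if (page) {
		split_page(page, 2);	/* give each of the 4 pages its own refcount */
		for (i = 0; i < 4; i++)
			__free_page(page + i);	/* may be freed individually, in any order */
	}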
861 861
862 /* 862 /*
863 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But 863 * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
864 * we cheat by calling it from here, in the order > 0 path. Saves a branch 864 * we cheat by calling it from here, in the order > 0 path. Saves a branch
865 * or two. 865 * or two.
866 */ 866 */
867 static struct page *buffered_rmqueue(struct zonelist *zonelist, 867 static struct page *buffered_rmqueue(struct zonelist *zonelist,
868 struct zone *zone, int order, gfp_t gfp_flags) 868 struct zone *zone, int order, gfp_t gfp_flags)
869 { 869 {
870 unsigned long flags; 870 unsigned long flags;
871 struct page *page; 871 struct page *page;
872 int cold = !!(gfp_flags & __GFP_COLD); 872 int cold = !!(gfp_flags & __GFP_COLD);
873 int cpu; 873 int cpu;
874 874
875 again: 875 again:
876 cpu = get_cpu(); 876 cpu = get_cpu();
877 if (likely(order == 0)) { 877 if (likely(order == 0)) {
878 struct per_cpu_pages *pcp; 878 struct per_cpu_pages *pcp;
879 879
880 pcp = &zone_pcp(zone, cpu)->pcp[cold]; 880 pcp = &zone_pcp(zone, cpu)->pcp[cold];
881 local_irq_save(flags); 881 local_irq_save(flags);
882 if (!pcp->count) { 882 if (!pcp->count) {
883 pcp->count = rmqueue_bulk(zone, 0, 883 pcp->count = rmqueue_bulk(zone, 0,
884 pcp->batch, &pcp->list); 884 pcp->batch, &pcp->list);
885 if (unlikely(!pcp->count)) 885 if (unlikely(!pcp->count))
886 goto failed; 886 goto failed;
887 } 887 }
888 page = list_entry(pcp->list.next, struct page, lru); 888 page = list_entry(pcp->list.next, struct page, lru);
889 list_del(&page->lru); 889 list_del(&page->lru);
890 pcp->count--; 890 pcp->count--;
891 } else { 891 } else {
892 spin_lock_irqsave(&zone->lock, flags); 892 spin_lock_irqsave(&zone->lock, flags);
893 page = __rmqueue(zone, order); 893 page = __rmqueue(zone, order);
894 spin_unlock(&zone->lock); 894 spin_unlock(&zone->lock);
895 if (!page) 895 if (!page)
896 goto failed; 896 goto failed;
897 } 897 }
898 898
899 __count_zone_vm_events(PGALLOC, zone, 1 << order); 899 __count_zone_vm_events(PGALLOC, zone, 1 << order);
900 zone_statistics(zonelist, zone); 900 zone_statistics(zonelist, zone);
901 local_irq_restore(flags); 901 local_irq_restore(flags);
902 put_cpu(); 902 put_cpu();
903 903
904 VM_BUG_ON(bad_range(zone, page)); 904 VM_BUG_ON(bad_range(zone, page));
905 if (prep_new_page(page, order, gfp_flags)) 905 if (prep_new_page(page, order, gfp_flags))
906 goto again; 906 goto again;
907 return page; 907 return page;
908 908
909 failed: 909 failed:
910 local_irq_restore(flags); 910 local_irq_restore(flags);
911 put_cpu(); 911 put_cpu();
912 return NULL; 912 return NULL;
913 } 913 }
914 914
915 #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ 915 #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
916 #define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ 916 #define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */
917 #define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ 917 #define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */
918 #define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ 918 #define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */
919 #define ALLOC_HARDER 0x10 /* try to alloc harder */ 919 #define ALLOC_HARDER 0x10 /* try to alloc harder */
920 #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 920 #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
921 #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 921 #define ALLOC_CPUSET 0x40 /* check for correct cpuset */
922 922
923 #ifdef CONFIG_FAIL_PAGE_ALLOC 923 #ifdef CONFIG_FAIL_PAGE_ALLOC
924 924
925 static struct fail_page_alloc_attr { 925 static struct fail_page_alloc_attr {
926 struct fault_attr attr; 926 struct fault_attr attr;
927 927
928 u32 ignore_gfp_highmem; 928 u32 ignore_gfp_highmem;
929 u32 ignore_gfp_wait; 929 u32 ignore_gfp_wait;
930 930
931 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 931 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
932 932
933 struct dentry *ignore_gfp_highmem_file; 933 struct dentry *ignore_gfp_highmem_file;
934 struct dentry *ignore_gfp_wait_file; 934 struct dentry *ignore_gfp_wait_file;
935 935
936 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 936 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
937 937
938 } fail_page_alloc = { 938 } fail_page_alloc = {
939 .attr = FAULT_ATTR_INITIALIZER, 939 .attr = FAULT_ATTR_INITIALIZER,
940 .ignore_gfp_wait = 1, 940 .ignore_gfp_wait = 1,
941 .ignore_gfp_highmem = 1, 941 .ignore_gfp_highmem = 1,
942 }; 942 };
943 943
944 static int __init setup_fail_page_alloc(char *str) 944 static int __init setup_fail_page_alloc(char *str)
945 { 945 {
946 return setup_fault_attr(&fail_page_alloc.attr, str); 946 return setup_fault_attr(&fail_page_alloc.attr, str);
947 } 947 }
948 __setup("fail_page_alloc=", setup_fail_page_alloc); 948 __setup("fail_page_alloc=", setup_fail_page_alloc);
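The __setup hook above wires the fault-injection attributes to a boot parameter. Assuming it follows the generic fault_attr convention of interval,probability,space,times (an assumption, not something this diff shows), an invocation might look like fail_page_alloc=1,10,0,-1 to fail roughly 10% of eligible page allocations with no count limit, while the debugfs booleans created below (ignore-gfp-wait, ignore-gfp-highmem) narrow at runtime which allocations are eligible to fail.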
949 949
950 static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 950 static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
951 { 951 {
952 if (gfp_mask & __GFP_NOFAIL) 952 if (gfp_mask & __GFP_NOFAIL)
953 return 0; 953 return 0;
954 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 954 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
955 return 0; 955 return 0;
956 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 956 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
957 return 0; 957 return 0;
958 958
959 return should_fail(&fail_page_alloc.attr, 1 << order); 959 return should_fail(&fail_page_alloc.attr, 1 << order);
960 } 960 }
961 961
962 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS 962 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
963 963
964 static int __init fail_page_alloc_debugfs(void) 964 static int __init fail_page_alloc_debugfs(void)
965 { 965 {
966 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 966 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
967 struct dentry *dir; 967 struct dentry *dir;
968 int err; 968 int err;
969 969
970 err = init_fault_attr_dentries(&fail_page_alloc.attr, 970 err = init_fault_attr_dentries(&fail_page_alloc.attr,
971 "fail_page_alloc"); 971 "fail_page_alloc");
972 if (err) 972 if (err)
973 return err; 973 return err;
974 dir = fail_page_alloc.attr.dentries.dir; 974 dir = fail_page_alloc.attr.dentries.dir;
975 975
976 fail_page_alloc.ignore_gfp_wait_file = 976 fail_page_alloc.ignore_gfp_wait_file =
977 debugfs_create_bool("ignore-gfp-wait", mode, dir, 977 debugfs_create_bool("ignore-gfp-wait", mode, dir,
978 &fail_page_alloc.ignore_gfp_wait); 978 &fail_page_alloc.ignore_gfp_wait);
979 979
980 fail_page_alloc.ignore_gfp_highmem_file = 980 fail_page_alloc.ignore_gfp_highmem_file =
981 debugfs_create_bool("ignore-gfp-highmem", mode, dir, 981 debugfs_create_bool("ignore-gfp-highmem", mode, dir,
982 &fail_page_alloc.ignore_gfp_highmem); 982 &fail_page_alloc.ignore_gfp_highmem);
983 983
984 if (!fail_page_alloc.ignore_gfp_wait_file || 984 if (!fail_page_alloc.ignore_gfp_wait_file ||
985 !fail_page_alloc.ignore_gfp_highmem_file) { 985 !fail_page_alloc.ignore_gfp_highmem_file) {
986 err = -ENOMEM; 986 err = -ENOMEM;
987 debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); 987 debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
988 debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); 988 debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
989 cleanup_fault_attr_dentries(&fail_page_alloc.attr); 989 cleanup_fault_attr_dentries(&fail_page_alloc.attr);
990 } 990 }
991 991
992 return err; 992 return err;
993 } 993 }
994 994
995 late_initcall(fail_page_alloc_debugfs); 995 late_initcall(fail_page_alloc_debugfs);
996 996
997 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ 997 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
998 998
999 #else /* CONFIG_FAIL_PAGE_ALLOC */ 999 #else /* CONFIG_FAIL_PAGE_ALLOC */
1000 1000
1001 static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1001 static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1002 { 1002 {
1003 return 0; 1003 return 0;
1004 } 1004 }
1005 1005
1006 #endif /* CONFIG_FAIL_PAGE_ALLOC */ 1006 #endif /* CONFIG_FAIL_PAGE_ALLOC */
1007 1007
1008 /* 1008 /*
1009 * Return 1 if free pages are above 'mark'. This takes into account the order 1009 * Return 1 if free pages are above 'mark'. This takes into account the order
1010 * of the allocation. 1010 * of the allocation.
1011 */ 1011 */
1012 int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1012 int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1013 int classzone_idx, int alloc_flags) 1013 int classzone_idx, int alloc_flags)
1014 { 1014 {
1015 /* free_pages may go negative - that's OK */ 1015 /* free_pages may go negative - that's OK */
1016 long min = mark; 1016 long min = mark;
1017 long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1; 1017 long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
1018 int o; 1018 int o;
1019 1019
1020 if (alloc_flags & ALLOC_HIGH) 1020 if (alloc_flags & ALLOC_HIGH)
1021 min -= min / 2; 1021 min -= min / 2;
1022 if (alloc_flags & ALLOC_HARDER) 1022 if (alloc_flags & ALLOC_HARDER)
1023 min -= min / 4; 1023 min -= min / 4;
1024 1024
1025 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 1025 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
1026 return 0; 1026 return 0;
1027 for (o = 0; o < order; o++) { 1027 for (o = 0; o < order; o++) {
1028 /* At the next order, this order's pages become unavailable */ 1028 /* At the next order, this order's pages become unavailable */
1029 free_pages -= z->free_area[o].nr_free << o; 1029 free_pages -= z->free_area[o].nr_free << o;
1030 1030
1031 /* Require fewer higher order pages to be free */ 1031 /* Require fewer higher order pages to be free */
1032 min >>= 1; 1032 min >>= 1;
1033 1033
1034 if (free_pages <= min) 1034 if (free_pages <= min)
1035 return 0; 1035 return 0;
1036 } 1036 }
1037 return 1; 1037 return 1;
1038 } 1038 }
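The watermark check above is plain integer arithmetic, so a worked example makes the ALLOC_HIGH boost easier to see. The sketch below mirrors the same logic in userspace; the free-page count, mark, lowmem_reserve and per-order nr_free values are all invented for illustration:

#include <stdio.h>

#define MODEL_MAX_ORDER 4

/* Mirror of the zone_watermark_ok() arithmetic with invented inputs. */
static int model_watermark_ok(long free_pages, int order, long mark,
			      long lowmem_reserve, const long *nr_free,
			      int alloc_high, int alloc_harder)
{
	long min = mark;
	int o;

	free_pages -= (1L << order) - 1;	/* pages this request would take */
	if (alloc_high)
		min -= min / 2;
	if (alloc_harder)
		min -= min / 4;

	if (free_pages <= min + lowmem_reserve)
		return 0;
	for (o = 0; o < order; o++) {
		free_pages -= nr_free[o] << o;	/* lower orders can't serve this request */
		min >>= 1;			/* but demand proportionally less */
		if (free_pages <= min)
			return 0;
	}
	return 1;
}

int main(void)
{
	const long nr_free[MODEL_MAX_ORDER] = { 300, 20, 10, 2 };

	printf("order-2, ALLOC_HIGH request: %s\n",
	       model_watermark_ok(420, 2, 256, 32, nr_free, 1, 0) ? "ok" : "fail");
	printf("order-2, normal request:     %s\n",
	       model_watermark_ok(420, 2, 256, 32, nr_free, 0, 0) ? "ok" : "fail");
	return 0;
}

With these made-up numbers the halved watermark lets the ALLOC_HIGH request through while the normal one fails once the order-0 pages are discounted.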
1039 1039
1040 #ifdef CONFIG_NUMA 1040 #ifdef CONFIG_NUMA
1041 /* 1041 /*
1042 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to 1042 * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
1043 * skip over zones that are not allowed by the cpuset, or that have 1043 * skip over zones that are not allowed by the cpuset, or that have
1044 * been recently (in the last second) found to be nearly full. See further 1044 * been recently (in the last second) found to be nearly full. See further
1045 * comments in mmzone.h. Reduces cache footprint of zonelist scans 1045 * comments in mmzone.h. Reduces cache footprint of zonelist scans
1046 * that have to skip over a lot of full or unallowed zones. 1046 * that have to skip over a lot of full or unallowed zones.
1047 * 1047 *
1048 * If the zonelist cache is present in the passed in zonelist, then 1048 * If the zonelist cache is present in the passed in zonelist, then
1049 * returns a pointer to the allowed node mask (either the current 1049 * returns a pointer to the allowed node mask (either the current
1050 * task's mems_allowed, or node_online_map.) 1050 * task's mems_allowed, or node_online_map.)
1051 * 1051 *
1052 * If the zonelist cache is not available for this zonelist, does 1052 * If the zonelist cache is not available for this zonelist, does
1053 * nothing and returns NULL. 1053 * nothing and returns NULL.
1054 * 1054 *
1055 * If the fullzones BITMAP in the zonelist cache is stale (more than 1055 * If the fullzones BITMAP in the zonelist cache is stale (more than
1056 * a second since last zap'd) then we zap it out (clear its bits.) 1056 * a second since last zap'd) then we zap it out (clear its bits.)
1057 * 1057 *
1058 * We hold off even calling zlc_setup, until after we've checked the 1058 * We hold off even calling zlc_setup, until after we've checked the
1059 * first zone in the zonelist, on the theory that most allocations will 1059 * first zone in the zonelist, on the theory that most allocations will
1060 * be satisfied from that first zone, so best to examine that zone as 1060 * be satisfied from that first zone, so best to examine that zone as
1061 * quickly as we can. 1061 * quickly as we can.
1062 */ 1062 */
1063 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1063 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1064 { 1064 {
1065 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1065 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1066 nodemask_t *allowednodes; /* zonelist_cache approximation */ 1066 nodemask_t *allowednodes; /* zonelist_cache approximation */
1067 1067
1068 zlc = zonelist->zlcache_ptr; 1068 zlc = zonelist->zlcache_ptr;
1069 if (!zlc) 1069 if (!zlc)
1070 return NULL; 1070 return NULL;
1071 1071
1072 if (jiffies - zlc->last_full_zap > 1 * HZ) { 1072 if (jiffies - zlc->last_full_zap > 1 * HZ) {
1073 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1073 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1074 zlc->last_full_zap = jiffies; 1074 zlc->last_full_zap = jiffies;
1075 } 1075 }
1076 1076
1077 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1077 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1078 &cpuset_current_mems_allowed : 1078 &cpuset_current_mems_allowed :
1079 &node_online_map; 1079 &node_online_map;
1080 return allowednodes; 1080 return allowednodes;
1081 } 1081 }
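
The setup above relies on zapping the fullzones bitmap once it is more than a second stale. A minimal userspace sketch of that expiry pattern, using wall-clock seconds in place of jiffies (the names here are illustrative, not kernel API):

    #include <stdio.h>
    #include <string.h>
    #include <time.h>

    #define NZONES 64

    struct zl_cache {
            unsigned char full[NZONES]; /* stand-in for the fullzones bitmap */
            time_t last_full_zap;       /* stand-in for zlc->last_full_zap */
    };

    /* Clear the "full" marks if they are more than one second old. */
    static void maybe_zap(struct zl_cache *zlc)
    {
            time_t now = time(NULL);

            if (now - zlc->last_full_zap > 1) {
                    memset(zlc->full, 0, sizeof(zlc->full));
                    zlc->last_full_zap = now;
            }
    }

    int main(void)
    {
            struct zl_cache zlc = { .last_full_zap = 0 };

            zlc.full[3] = 1;        /* pretend zone 3 was found full long ago */
            maybe_zap(&zlc);        /* stale: the mark is cleared */
            printf("zone 3 full after zap: %d\n", zlc.full[3]);
            return 0;
    }
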
1082 1082
1083 /* 1083 /*
1084 * Given 'z' scanning a zonelist, run a couple of quick checks to see 1084 * Given 'z' scanning a zonelist, run a couple of quick checks to see
1085 * if it is worth looking at further for free memory: 1085 * if it is worth looking at further for free memory:
1086 * 1) Check that the zone isn't thought to be full (doesn't have its 1086 * 1) Check that the zone isn't thought to be full (doesn't have its
1087 * bit set in the zonelist_cache fullzones BITMAP). 1087 * bit set in the zonelist_cache fullzones BITMAP).
1088 * 2) Check that the zone's node (obtained from the zonelist_cache 1088 * 2) Check that the zone's node (obtained from the zonelist_cache
1089 * z_to_n[] mapping) is allowed in the passed in allowednodes mask. 1089 * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
1090 * Return true (non-zero) if zone is worth looking at further, or 1090 * Return true (non-zero) if zone is worth looking at further, or
1091 * else return false (zero) if it is not. 1091 * else return false (zero) if it is not.
1092 * 1092 *
1093 * This check -ignores- the distinction between various watermarks, 1093 * This check -ignores- the distinction between various watermarks,
1094 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is 1094 * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
1095 * found to be full for any variation of these watermarks, it will 1095 * found to be full for any variation of these watermarks, it will
1096 * be considered full for up to one second by all requests, unless 1096 * be considered full for up to one second by all requests, unless
1097 * we are so low on memory on all allowed nodes that we are forced 1097 * we are so low on memory on all allowed nodes that we are forced
1098 * into the second scan of the zonelist. 1098 * into the second scan of the zonelist.
1099 * 1099 *
1100 * In the second scan we ignore this zonelist cache and exactly 1100 * In the second scan we ignore this zonelist cache and exactly
1101 * apply the watermarks to all zones, even if it is slower to do so. 1101 * apply the watermarks to all zones, even if it is slower to do so.
1102 * We are low on memory in the second scan, and should leave no stone 1102 * We are low on memory in the second scan, and should leave no stone
1103 * unturned looking for a free page. 1103 * unturned looking for a free page.
1104 */ 1104 */
1105 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, 1105 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
1106 nodemask_t *allowednodes) 1106 nodemask_t *allowednodes)
1107 { 1107 {
1108 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1108 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1109 int i; /* index of *z in zonelist zones */ 1109 int i; /* index of *z in zonelist zones */
1110 int n; /* node that zone *z is on */ 1110 int n; /* node that zone *z is on */
1111 1111
1112 zlc = zonelist->zlcache_ptr; 1112 zlc = zonelist->zlcache_ptr;
1113 if (!zlc) 1113 if (!zlc)
1114 return 1; 1114 return 1;
1115 1115
1116 i = z - zonelist->zones; 1116 i = z - zonelist->zones;
1117 n = zlc->z_to_n[i]; 1117 n = zlc->z_to_n[i];
1118 1118
1119 /* This zone is worth trying if it is allowed but not full */ 1119 /* This zone is worth trying if it is allowed but not full */
1120 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones); 1120 return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
1121 } 1121 }
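
The check above turns the scan cursor into an index by pointer subtraction and then consults the allowed-nodes mask and the fullzones bitmap. A userspace sketch of the same index-and-bitmap arithmetic, with plain arrays standing in for struct zone and the bitmaps:

    #include <stdio.h>

    #define NZONES 8

    int main(void)
    {
            int zones[NZONES];             /* stand-ins for struct zone objects */
            int *zonelist[NZONES];         /* array of zone pointers, as in struct zonelist */
            unsigned long fullzones = 0;   /* bit i set => zone i considered full */

            for (int k = 0; k < NZONES; k++)
                    zonelist[k] = &zones[k];

            int **z = &zonelist[5];        /* scan cursor somewhere in the list */
            long i = z - zonelist;         /* pointer difference gives the index */

            fullzones |= 1UL << 5;         /* mark that zone as full */
            int worth_trying = !(fullzones & (1UL << i));
            printf("index=%ld worth_trying=%d\n", i, worth_trying);
            return 0;
    }
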
1122 1122
1123 /* 1123 /*
1124 * Given 'z' scanning a zonelist, set the corresponding bit in 1124 * Given 'z' scanning a zonelist, set the corresponding bit in
1125 * zlc->fullzones, so that subsequent attempts to allocate a page 1125 * zlc->fullzones, so that subsequent attempts to allocate a page
1126 * from that zone don't waste time re-examining it. 1126 * from that zone don't waste time re-examining it.
1127 */ 1127 */
1128 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) 1128 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1129 { 1129 {
1130 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1130 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1131 int i; /* index of *z in zonelist zones */ 1131 int i; /* index of *z in zonelist zones */
1132 1132
1133 zlc = zonelist->zlcache_ptr; 1133 zlc = zonelist->zlcache_ptr;
1134 if (!zlc) 1134 if (!zlc)
1135 return; 1135 return;
1136 1136
1137 i = z - zonelist->zones; 1137 i = z - zonelist->zones;
1138 1138
1139 set_bit(i, zlc->fullzones); 1139 set_bit(i, zlc->fullzones);
1140 } 1140 }
1141 1141
1142 #else /* CONFIG_NUMA */ 1142 #else /* CONFIG_NUMA */
1143 1143
1144 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) 1144 static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1145 { 1145 {
1146 return NULL; 1146 return NULL;
1147 } 1147 }
1148 1148
1149 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, 1149 static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
1150 nodemask_t *allowednodes) 1150 nodemask_t *allowednodes)
1151 { 1151 {
1152 return 1; 1152 return 1;
1153 } 1153 }
1154 1154
1155 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) 1155 static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1156 { 1156 {
1157 } 1157 }
1158 #endif /* CONFIG_NUMA */ 1158 #endif /* CONFIG_NUMA */
1159 1159
1160 /* 1160 /*
1161 * get_page_from_freelist goes through the zonelist trying to allocate 1161 * get_page_from_freelist goes through the zonelist trying to allocate
1162 * a page. 1162 * a page.
1163 */ 1163 */
1164 static struct page * 1164 static struct page *
1165 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, 1165 get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
1166 struct zonelist *zonelist, int alloc_flags) 1166 struct zonelist *zonelist, int alloc_flags)
1167 { 1167 {
1168 struct zone **z; 1168 struct zone **z;
1169 struct page *page = NULL; 1169 struct page *page = NULL;
1170 int classzone_idx = zone_idx(zonelist->zones[0]); 1170 int classzone_idx = zone_idx(zonelist->zones[0]);
1171 struct zone *zone; 1171 struct zone *zone;
1172 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1172 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1173 int zlc_active = 0; /* set if using zonelist_cache */ 1173 int zlc_active = 0; /* set if using zonelist_cache */
1174 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1174 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1175 1175
1176 zonelist_scan: 1176 zonelist_scan:
1177 /* 1177 /*
1178 * Scan zonelist, looking for a zone with enough free. 1178 * Scan zonelist, looking for a zone with enough free.
1179 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1179 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1180 */ 1180 */
1181 z = zonelist->zones; 1181 z = zonelist->zones;
1182 1182
1183 do { 1183 do {
1184 if (NUMA_BUILD && zlc_active && 1184 if (NUMA_BUILD && zlc_active &&
1185 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1185 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1186 continue; 1186 continue;
1187 zone = *z; 1187 zone = *z;
1188 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && 1188 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
1189 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) 1189 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
1190 break; 1190 break;
1191 if ((alloc_flags & ALLOC_CPUSET) && 1191 if ((alloc_flags & ALLOC_CPUSET) &&
1192 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1192 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1193 goto try_next_zone; 1193 goto try_next_zone;
1194 1194
1195 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1195 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1196 unsigned long mark; 1196 unsigned long mark;
1197 if (alloc_flags & ALLOC_WMARK_MIN) 1197 if (alloc_flags & ALLOC_WMARK_MIN)
1198 mark = zone->pages_min; 1198 mark = zone->pages_min;
1199 else if (alloc_flags & ALLOC_WMARK_LOW) 1199 else if (alloc_flags & ALLOC_WMARK_LOW)
1200 mark = zone->pages_low; 1200 mark = zone->pages_low;
1201 else 1201 else
1202 mark = zone->pages_high; 1202 mark = zone->pages_high;
1203 if (!zone_watermark_ok(zone, order, mark, 1203 if (!zone_watermark_ok(zone, order, mark,
1204 classzone_idx, alloc_flags)) { 1204 classzone_idx, alloc_flags)) {
1205 if (!zone_reclaim_mode || 1205 if (!zone_reclaim_mode ||
1206 !zone_reclaim(zone, gfp_mask, order)) 1206 !zone_reclaim(zone, gfp_mask, order))
1207 goto this_zone_full; 1207 goto this_zone_full;
1208 } 1208 }
1209 } 1209 }
1210 1210
1211 page = buffered_rmqueue(zonelist, zone, order, gfp_mask); 1211 page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
1212 if (page) 1212 if (page)
1213 break; 1213 break;
1214 this_zone_full: 1214 this_zone_full:
1215 if (NUMA_BUILD) 1215 if (NUMA_BUILD)
1216 zlc_mark_zone_full(zonelist, z); 1216 zlc_mark_zone_full(zonelist, z);
1217 try_next_zone: 1217 try_next_zone:
1218 if (NUMA_BUILD && !did_zlc_setup) { 1218 if (NUMA_BUILD && !did_zlc_setup) {
1219 /* we do zlc_setup after the first zone is tried */ 1219 /* we do zlc_setup after the first zone is tried */
1220 allowednodes = zlc_setup(zonelist, alloc_flags); 1220 allowednodes = zlc_setup(zonelist, alloc_flags);
1221 zlc_active = 1; 1221 zlc_active = 1;
1222 did_zlc_setup = 1; 1222 did_zlc_setup = 1;
1223 } 1223 }
1224 } while (*(++z) != NULL); 1224 } while (*(++z) != NULL);
1225 1225
1226 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1226 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
1227 /* Disable zlc cache for second zonelist scan */ 1227 /* Disable zlc cache for second zonelist scan */
1228 zlc_active = 0; 1228 zlc_active = 0;
1229 goto zonelist_scan; 1229 goto zonelist_scan;
1230 } 1230 }
1231 return page; 1231 return page;
1232 } 1232 }
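
In short, get_page_from_freelist() makes two passes on NUMA builds: the first trusts the zonelist cache and skips zones believed full, and only if that yields nothing is zlc_active cleared and the scan repeated with exact checks. A compressed userspace sketch of that two-pass structure, with the actual allocation faked:

    #include <stdio.h>

    #define NZONES 4

    /* Pretend every zone is out of memory except the last one,
     * but the cache wrongly believes the last one is full too. */
    static int try_alloc(int zone) { return zone == 3; }

    int main(void)
    {
            int cached_full[NZONES] = { 0, 1, 1, 1 }; /* stale cache */
            int use_cache = 1;
            int got = -1;

    scan:
            for (int z = 0; z < NZONES; z++) {
                    if (use_cache && cached_full[z])
                            continue;          /* first pass: trust the cache */
                    if (try_alloc(z)) {
                            got = z;
                            break;
                    }
                    cached_full[z] = 1;        /* remember this zone as full */
            }
            if (got < 0 && use_cache) {
                    use_cache = 0;             /* second pass: ignore the cache */
                    goto scan;
            }

            printf("allocated from zone %d\n", got);
            return 0;
    }
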
1233 1233
1234 /* 1234 /*
1235 * This is the 'heart' of the zoned buddy allocator. 1235 * This is the 'heart' of the zoned buddy allocator.
1236 */ 1236 */
1237 struct page * fastcall 1237 struct page * fastcall
1238 __alloc_pages(gfp_t gfp_mask, unsigned int order, 1238 __alloc_pages(gfp_t gfp_mask, unsigned int order,
1239 struct zonelist *zonelist) 1239 struct zonelist *zonelist)
1240 { 1240 {
1241 const gfp_t wait = gfp_mask & __GFP_WAIT; 1241 const gfp_t wait = gfp_mask & __GFP_WAIT;
1242 struct zone **z; 1242 struct zone **z;
1243 struct page *page; 1243 struct page *page;
1244 struct reclaim_state reclaim_state; 1244 struct reclaim_state reclaim_state;
1245 struct task_struct *p = current; 1245 struct task_struct *p = current;
1246 int do_retry; 1246 int do_retry;
1247 int alloc_flags; 1247 int alloc_flags;
1248 int did_some_progress; 1248 int did_some_progress;
1249 1249
1250 might_sleep_if(wait); 1250 might_sleep_if(wait);
1251 1251
1252 if (should_fail_alloc_page(gfp_mask, order)) 1252 if (should_fail_alloc_page(gfp_mask, order))
1253 return NULL; 1253 return NULL;
1254 1254
1255 restart: 1255 restart:
1256 z = zonelist->zones; /* the list of zones suitable for gfp_mask */ 1256 z = zonelist->zones; /* the list of zones suitable for gfp_mask */
1257 1257
1258 if (unlikely(*z == NULL)) { 1258 if (unlikely(*z == NULL)) {
1259 /* Should this ever happen?? */ 1259 /* Should this ever happen?? */
1260 return NULL; 1260 return NULL;
1261 } 1261 }
1262 1262
1263 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, 1263 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
1264 zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); 1264 zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
1265 if (page) 1265 if (page)
1266 goto got_pg; 1266 goto got_pg;
1267 1267
1268 /* 1268 /*
1269 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 1269 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
1270 * __GFP_NOWARN set) should not cause reclaim since the subsystem 1270 * __GFP_NOWARN set) should not cause reclaim since the subsystem
1271 * (e.g. slab) using GFP_THISNODE may choose to trigger reclaim 1271 * (e.g. slab) using GFP_THISNODE may choose to trigger reclaim
1272 * using a larger set of nodes after it has established that the 1272 * using a larger set of nodes after it has established that the
1273 * allowed per node queues are empty and that nodes are 1273 * allowed per node queues are empty and that nodes are
1274 * over allocated. 1274 * over allocated.
1275 */ 1275 */
1276 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 1276 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1277 goto nopage; 1277 goto nopage;
1278 1278
1279 for (z = zonelist->zones; *z; z++) 1279 for (z = zonelist->zones; *z; z++)
1280 wakeup_kswapd(*z, order); 1280 wakeup_kswapd(*z, order);
1281 1281
1282 /* 1282 /*
1283 * OK, we're below the kswapd watermark and have kicked background 1283 * OK, we're below the kswapd watermark and have kicked background
1284 * reclaim. Now things get more complex, so set up alloc_flags according 1284 * reclaim. Now things get more complex, so set up alloc_flags according
1285 * to how we want to proceed. 1285 * to how we want to proceed.
1286 * 1286 *
1287 * The caller may dip into page reserves a bit more if the caller 1287 * The caller may dip into page reserves a bit more if the caller
1288 * cannot run direct reclaim, or if the caller has realtime scheduling 1288 * cannot run direct reclaim, or if the caller has realtime scheduling
1289 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 1289 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1290 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 1290 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1291 */ 1291 */
1292 alloc_flags = ALLOC_WMARK_MIN; 1292 alloc_flags = ALLOC_WMARK_MIN;
1293 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) 1293 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
1294 alloc_flags |= ALLOC_HARDER; 1294 alloc_flags |= ALLOC_HARDER;
1295 if (gfp_mask & __GFP_HIGH) 1295 if (gfp_mask & __GFP_HIGH)
1296 alloc_flags |= ALLOC_HIGH; 1296 alloc_flags |= ALLOC_HIGH;
1297 if (wait) 1297 if (wait)
1298 alloc_flags |= ALLOC_CPUSET; 1298 alloc_flags |= ALLOC_CPUSET;
1299 1299
1300 /* 1300 /*
1301 * Go through the zonelist again. Let __GFP_HIGH and allocations 1301 * Go through the zonelist again. Let __GFP_HIGH and allocations
1302 * coming from realtime tasks go deeper into reserves. 1302 * coming from realtime tasks go deeper into reserves.
1303 * 1303 *
1304 * This is the last chance, in general, before the goto nopage. 1304 * This is the last chance, in general, before the goto nopage.
1305 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 1305 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1306 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1306 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1307 */ 1307 */
1308 page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags); 1308 page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
1309 if (page) 1309 if (page)
1310 goto got_pg; 1310 goto got_pg;
1311 1311
1312 /* This allocation should allow future memory freeing. */ 1312 /* This allocation should allow future memory freeing. */
1313 1313
1314 rebalance: 1314 rebalance:
1315 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 1315 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
1316 && !in_interrupt()) { 1316 && !in_interrupt()) {
1317 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 1317 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
1318 nofail_alloc: 1318 nofail_alloc:
1319 /* go through the zonelist yet again, ignoring mins */ 1319 /* go through the zonelist yet again, ignoring mins */
1320 page = get_page_from_freelist(gfp_mask, order, 1320 page = get_page_from_freelist(gfp_mask, order,
1321 zonelist, ALLOC_NO_WATERMARKS); 1321 zonelist, ALLOC_NO_WATERMARKS);
1322 if (page) 1322 if (page)
1323 goto got_pg; 1323 goto got_pg;
1324 if (gfp_mask & __GFP_NOFAIL) { 1324 if (gfp_mask & __GFP_NOFAIL) {
1325 congestion_wait(WRITE, HZ/50); 1325 congestion_wait(WRITE, HZ/50);
1326 goto nofail_alloc; 1326 goto nofail_alloc;
1327 } 1327 }
1328 } 1328 }
1329 goto nopage; 1329 goto nopage;
1330 } 1330 }
1331 1331
1332 /* Atomic allocations - we can't balance anything */ 1332 /* Atomic allocations - we can't balance anything */
1333 if (!wait) 1333 if (!wait)
1334 goto nopage; 1334 goto nopage;
1335 1335
1336 cond_resched(); 1336 cond_resched();
1337 1337
1338 /* We now go into synchronous reclaim */ 1338 /* We now go into synchronous reclaim */
1339 cpuset_memory_pressure_bump(); 1339 cpuset_memory_pressure_bump();
1340 p->flags |= PF_MEMALLOC; 1340 p->flags |= PF_MEMALLOC;
1341 reclaim_state.reclaimed_slab = 0; 1341 reclaim_state.reclaimed_slab = 0;
1342 p->reclaim_state = &reclaim_state; 1342 p->reclaim_state = &reclaim_state;
1343 1343
1344 did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask); 1344 did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
1345 1345
1346 p->reclaim_state = NULL; 1346 p->reclaim_state = NULL;
1347 p->flags &= ~PF_MEMALLOC; 1347 p->flags &= ~PF_MEMALLOC;
1348 1348
1349 cond_resched(); 1349 cond_resched();
1350 1350
1351 if (likely(did_some_progress)) { 1351 if (likely(did_some_progress)) {
1352 page = get_page_from_freelist(gfp_mask, order, 1352 page = get_page_from_freelist(gfp_mask, order,
1353 zonelist, alloc_flags); 1353 zonelist, alloc_flags);
1354 if (page) 1354 if (page)
1355 goto got_pg; 1355 goto got_pg;
1356 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 1356 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1357 /* 1357 /*
1358 * Go through the zonelist yet one more time, keep 1358 * Go through the zonelist yet one more time, keep
1359 * very high watermark here, this is only to catch 1359 * very high watermark here, this is only to catch
1360 * a parallel oom killing, we must fail if we're still 1360 * a parallel oom killing, we must fail if we're still
1361 * under heavy pressure. 1361 * under heavy pressure.
1362 */ 1362 */
1363 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, 1363 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
1364 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); 1364 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1365 if (page) 1365 if (page)
1366 goto got_pg; 1366 goto got_pg;
1367 1367
1368 out_of_memory(zonelist, gfp_mask, order); 1368 out_of_memory(zonelist, gfp_mask, order);
1369 goto restart; 1369 goto restart;
1370 } 1370 }
1371 1371
1372 /* 1372 /*
1373 * Don't let big-order allocations loop unless the caller explicitly 1373 * Don't let big-order allocations loop unless the caller explicitly
1374 * requests that. Wait for some write requests to complete then retry. 1374 * requests that. Wait for some write requests to complete then retry.
1375 * 1375 *
1376 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order 1376 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
1377 * <= 3, but that may not be true in other implementations. 1377 * <= 3, but that may not be true in other implementations.
1378 */ 1378 */
1379 do_retry = 0; 1379 do_retry = 0;
1380 if (!(gfp_mask & __GFP_NORETRY)) { 1380 if (!(gfp_mask & __GFP_NORETRY)) {
1381 if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) 1381 if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
1382 do_retry = 1; 1382 do_retry = 1;
1383 if (gfp_mask & __GFP_NOFAIL) 1383 if (gfp_mask & __GFP_NOFAIL)
1384 do_retry = 1; 1384 do_retry = 1;
1385 } 1385 }
1386 if (do_retry) { 1386 if (do_retry) {
1387 congestion_wait(WRITE, HZ/50); 1387 congestion_wait(WRITE, HZ/50);
1388 goto rebalance; 1388 goto rebalance;
1389 } 1389 }
1390 1390
1391 nopage: 1391 nopage:
1392 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 1392 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
1393 printk(KERN_WARNING "%s: page allocation failure." 1393 printk(KERN_WARNING "%s: page allocation failure."
1394 " order:%d, mode:0x%x\n", 1394 " order:%d, mode:0x%x\n",
1395 p->comm, order, gfp_mask); 1395 p->comm, order, gfp_mask);
1396 dump_stack(); 1396 dump_stack();
1397 show_mem(); 1397 show_mem();
1398 } 1398 }
1399 got_pg: 1399 got_pg:
1400 return page; 1400 return page;
1401 } 1401 }
1402 1402
1403 EXPORT_SYMBOL(__alloc_pages); 1403 EXPORT_SYMBOL(__alloc_pages);
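
The alloc_flags selection in the middle of __alloc_pages() can be read as a small flag-mapping table: callers that cannot sleep or are realtime get ALLOC_HARDER, __GFP_HIGH adds ALLOC_HIGH, and only sleeping callers honour cpusets. The sketch below mirrors that mapping in userspace; the flag values are illustrative and are not the kernel's real gfp or ALLOC_* bit layout:

    #include <stdio.h>

    #define GFP_WAIT     0x1
    #define GFP_HIGH     0x2

    #define AF_WMARK_MIN 0x01
    #define AF_HARDER    0x02
    #define AF_HIGH      0x04
    #define AF_CPUSET    0x08

    static int pick_alloc_flags(int gfp, int rt_task)
    {
            int flags = AF_WMARK_MIN;

            if (rt_task || !(gfp & GFP_WAIT))
                    flags |= AF_HARDER;     /* atomic / realtime: dig deeper */
            if (gfp & GFP_HIGH)
                    flags |= AF_HIGH;
            if (gfp & GFP_WAIT)
                    flags |= AF_CPUSET;     /* only sleeping allocs obey cpusets here */
            return flags;
    }

    int main(void)
    {
            printf("GFP_ATOMIC-like : %#x\n", pick_alloc_flags(GFP_HIGH, 0));
            printf("GFP_KERNEL-like : %#x\n", pick_alloc_flags(GFP_WAIT, 0));
            return 0;
    }
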
1404 1404
1405 /* 1405 /*
1406 * Common helper functions. 1406 * Common helper functions.
1407 */ 1407 */
1408 fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) 1408 fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
1409 { 1409 {
1410 struct page * page; 1410 struct page * page;
1411 page = alloc_pages(gfp_mask, order); 1411 page = alloc_pages(gfp_mask, order);
1412 if (!page) 1412 if (!page)
1413 return 0; 1413 return 0;
1414 return (unsigned long) page_address(page); 1414 return (unsigned long) page_address(page);
1415 } 1415 }
1416 1416
1417 EXPORT_SYMBOL(__get_free_pages); 1417 EXPORT_SYMBOL(__get_free_pages);
1418 1418
1419 fastcall unsigned long get_zeroed_page(gfp_t gfp_mask) 1419 fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
1420 { 1420 {
1421 struct page * page; 1421 struct page * page;
1422 1422
1423 /* 1423 /*
1424 * get_zeroed_page() returns a 32-bit address, which cannot represent 1424 * get_zeroed_page() returns a 32-bit address, which cannot represent
1425 * a highmem page 1425 * a highmem page
1426 */ 1426 */
1427 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); 1427 VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
1428 1428
1429 page = alloc_pages(gfp_mask | __GFP_ZERO, 0); 1429 page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
1430 if (page) 1430 if (page)
1431 return (unsigned long) page_address(page); 1431 return (unsigned long) page_address(page);
1432 return 0; 1432 return 0;
1433 } 1433 }
1434 1434
1435 EXPORT_SYMBOL(get_zeroed_page); 1435 EXPORT_SYMBOL(get_zeroed_page);
1436 1436
1437 void __pagevec_free(struct pagevec *pvec) 1437 void __pagevec_free(struct pagevec *pvec)
1438 { 1438 {
1439 int i = pagevec_count(pvec); 1439 int i = pagevec_count(pvec);
1440 1440
1441 while (--i >= 0) 1441 while (--i >= 0)
1442 free_hot_cold_page(pvec->pages[i], pvec->cold); 1442 free_hot_cold_page(pvec->pages[i], pvec->cold);
1443 } 1443 }
1444 1444
1445 fastcall void __free_pages(struct page *page, unsigned int order) 1445 fastcall void __free_pages(struct page *page, unsigned int order)
1446 { 1446 {
1447 if (put_page_testzero(page)) { 1447 if (put_page_testzero(page)) {
1448 if (order == 0) 1448 if (order == 0)
1449 free_hot_page(page); 1449 free_hot_page(page);
1450 else 1450 else
1451 __free_pages_ok(page, order); 1451 __free_pages_ok(page, order);
1452 } 1452 }
1453 } 1453 }
1454 1454
1455 EXPORT_SYMBOL(__free_pages); 1455 EXPORT_SYMBOL(__free_pages);
1456 1456
1457 fastcall void free_pages(unsigned long addr, unsigned int order) 1457 fastcall void free_pages(unsigned long addr, unsigned int order)
1458 { 1458 {
1459 if (addr != 0) { 1459 if (addr != 0) {
1460 VM_BUG_ON(!virt_addr_valid((void *)addr)); 1460 VM_BUG_ON(!virt_addr_valid((void *)addr));
1461 __free_pages(virt_to_page((void *)addr), order); 1461 __free_pages(virt_to_page((void *)addr), order);
1462 } 1462 }
1463 } 1463 }
1464 1464
1465 EXPORT_SYMBOL(free_pages); 1465 EXPORT_SYMBOL(free_pages);
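
A minimal in-kernel usage sketch of the helpers above, pairing __get_free_pages() with free_pages() at the same order (a fragment for illustration, not a standalone program):

    unsigned long buf;

    buf = __get_free_pages(GFP_KERNEL, 2);  /* four contiguous pages, may sleep */
    if (!buf)
            return -ENOMEM;
    /* ... use the 16 KB (with 4 KB pages) buffer at (void *)buf ... */
    free_pages(buf, 2);                     /* the order must match the allocation */
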
1466 1466
1467 static unsigned int nr_free_zone_pages(int offset) 1467 static unsigned int nr_free_zone_pages(int offset)
1468 { 1468 {
1469 /* Just pick one node, since fallback list is circular */ 1469 /* Just pick one node, since fallback list is circular */
1470 pg_data_t *pgdat = NODE_DATA(numa_node_id()); 1470 pg_data_t *pgdat = NODE_DATA(numa_node_id());
1471 unsigned int sum = 0; 1471 unsigned int sum = 0;
1472 1472
1473 struct zonelist *zonelist = pgdat->node_zonelists + offset; 1473 struct zonelist *zonelist = pgdat->node_zonelists + offset;
1474 struct zone **zonep = zonelist->zones; 1474 struct zone **zonep = zonelist->zones;
1475 struct zone *zone; 1475 struct zone *zone;
1476 1476
1477 for (zone = *zonep++; zone; zone = *zonep++) { 1477 for (zone = *zonep++; zone; zone = *zonep++) {
1478 unsigned long size = zone->present_pages; 1478 unsigned long size = zone->present_pages;
1479 unsigned long high = zone->pages_high; 1479 unsigned long high = zone->pages_high;
1480 if (size > high) 1480 if (size > high)
1481 sum += size - high; 1481 sum += size - high;
1482 } 1482 }
1483 1483
1484 return sum; 1484 return sum;
1485 } 1485 }
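
nr_free_zone_pages() only counts, per zone, the pages above the high watermark. A standalone arithmetic sketch with made-up zone sizes:

    #include <stdio.h>

    int main(void)
    {
            /* Hypothetical zones: { present_pages, pages_high }, in pages. */
            unsigned long zones[][2] = {
                    { 4096,   32 },   /* small DMA-like zone   */
                    { 225280, 1536 }, /* larger Normal-like zone */
            };
            unsigned long sum = 0;

            for (unsigned i = 0; i < sizeof(zones) / sizeof(zones[0]); i++) {
                    unsigned long size = zones[i][0], high = zones[i][1];
                    if (size > high)
                            sum += size - high; /* same accumulation as above */
            }
            printf("allocatable estimate: %lu pages\n", sum); /* 4064 + 223744 = 227808 */
            return 0;
    }
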
1486 1486
1487 /* 1487 /*
1488 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL 1488 * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
1489 */ 1489 */
1490 unsigned int nr_free_buffer_pages(void) 1490 unsigned int nr_free_buffer_pages(void)
1491 { 1491 {
1492 return nr_free_zone_pages(gfp_zone(GFP_USER)); 1492 return nr_free_zone_pages(gfp_zone(GFP_USER));
1493 } 1493 }
1494 1494
1495 /* 1495 /*
1496 * Amount of free RAM allocatable within all zones 1496 * Amount of free RAM allocatable within all zones
1497 */ 1497 */
1498 unsigned int nr_free_pagecache_pages(void) 1498 unsigned int nr_free_pagecache_pages(void)
1499 { 1499 {
1500 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER)); 1500 return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
1501 } 1501 }
1502 1502
1503 static inline void show_node(struct zone *zone) 1503 static inline void show_node(struct zone *zone)
1504 { 1504 {
1505 if (NUMA_BUILD) 1505 if (NUMA_BUILD)
1506 printk("Node %d ", zone_to_nid(zone)); 1506 printk("Node %d ", zone_to_nid(zone));
1507 } 1507 }
1508 1508
1509 void si_meminfo(struct sysinfo *val) 1509 void si_meminfo(struct sysinfo *val)
1510 { 1510 {
1511 val->totalram = totalram_pages; 1511 val->totalram = totalram_pages;
1512 val->sharedram = 0; 1512 val->sharedram = 0;
1513 val->freeram = global_page_state(NR_FREE_PAGES); 1513 val->freeram = global_page_state(NR_FREE_PAGES);
1514 val->bufferram = nr_blockdev_pages(); 1514 val->bufferram = nr_blockdev_pages();
1515 val->totalhigh = totalhigh_pages; 1515 val->totalhigh = totalhigh_pages;
1516 val->freehigh = nr_free_highpages(); 1516 val->freehigh = nr_free_highpages();
1517 val->mem_unit = PAGE_SIZE; 1517 val->mem_unit = PAGE_SIZE;
1518 } 1518 }
1519 1519
1520 EXPORT_SYMBOL(si_meminfo); 1520 EXPORT_SYMBOL(si_meminfo);
1521 1521
1522 #ifdef CONFIG_NUMA 1522 #ifdef CONFIG_NUMA
1523 void si_meminfo_node(struct sysinfo *val, int nid) 1523 void si_meminfo_node(struct sysinfo *val, int nid)
1524 { 1524 {
1525 pg_data_t *pgdat = NODE_DATA(nid); 1525 pg_data_t *pgdat = NODE_DATA(nid);
1526 1526
1527 val->totalram = pgdat->node_present_pages; 1527 val->totalram = pgdat->node_present_pages;
1528 val->freeram = node_page_state(nid, NR_FREE_PAGES); 1528 val->freeram = node_page_state(nid, NR_FREE_PAGES);
1529 #ifdef CONFIG_HIGHMEM 1529 #ifdef CONFIG_HIGHMEM
1530 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages; 1530 val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
1531 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], 1531 val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
1532 NR_FREE_PAGES); 1532 NR_FREE_PAGES);
1533 #else 1533 #else
1534 val->totalhigh = 0; 1534 val->totalhigh = 0;
1535 val->freehigh = 0; 1535 val->freehigh = 0;
1536 #endif 1536 #endif
1537 val->mem_unit = PAGE_SIZE; 1537 val->mem_unit = PAGE_SIZE;
1538 } 1538 }
1539 #endif 1539 #endif
1540 1540
1541 #define K(x) ((x) << (PAGE_SHIFT-10)) 1541 #define K(x) ((x) << (PAGE_SHIFT-10))
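
K() converts a page count to kilobytes; with 4 KB pages PAGE_SHIFT is 12, so the shift by PAGE_SHIFT-10 is a multiply by four. A quick standalone check (PAGE_SHIFT hard-coded here as an assumption):

    #include <stdio.h>

    #define PAGE_SHIFT 12                      /* assuming 4 KB pages */
    #define K(x) ((x) << (PAGE_SHIFT - 10))    /* pages -> kilobytes */

    int main(void)
    {
            printf("%lu pages = %lu kB\n", 1024UL, K(1024UL)); /* 1024 pages = 4096 kB */
            return 0;
    }
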
1542 1542
1543 /* 1543 /*
1544 * Show free area list (used inside shift_scroll-lock stuff) 1544 * Show free area list (used inside shift_scroll-lock stuff)
1545 * We also calculate the percentage fragmentation. We do this by counting the 1545 * We also calculate the percentage fragmentation. We do this by counting the
1546 * memory on each free list with the exception of the first item on the list. 1546 * memory on each free list with the exception of the first item on the list.
1547 */ 1547 */
1548 void show_free_areas(void) 1548 void show_free_areas(void)
1549 { 1549 {
1550 int cpu; 1550 int cpu;
1551 struct zone *zone; 1551 struct zone *zone;
1552 1552
1553 for_each_zone(zone) { 1553 for_each_zone(zone) {
1554 if (!populated_zone(zone)) 1554 if (!populated_zone(zone))
1555 continue; 1555 continue;
1556 1556
1557 show_node(zone); 1557 show_node(zone);
1558 printk("%s per-cpu:\n", zone->name); 1558 printk("%s per-cpu:\n", zone->name);
1559 1559
1560 for_each_online_cpu(cpu) { 1560 for_each_online_cpu(cpu) {
1561 struct per_cpu_pageset *pageset; 1561 struct per_cpu_pageset *pageset;
1562 1562
1563 pageset = zone_pcp(zone, cpu); 1563 pageset = zone_pcp(zone, cpu);
1564 1564
1565 printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d " 1565 printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d "
1566 "Cold: hi:%5d, btch:%4d usd:%4d\n", 1566 "Cold: hi:%5d, btch:%4d usd:%4d\n",
1567 cpu, pageset->pcp[0].high, 1567 cpu, pageset->pcp[0].high,
1568 pageset->pcp[0].batch, pageset->pcp[0].count, 1568 pageset->pcp[0].batch, pageset->pcp[0].count,
1569 pageset->pcp[1].high, pageset->pcp[1].batch, 1569 pageset->pcp[1].high, pageset->pcp[1].batch,
1570 pageset->pcp[1].count); 1570 pageset->pcp[1].count);
1571 } 1571 }
1572 } 1572 }
1573 1573
1574 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n" 1574 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n"
1575 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", 1575 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
1576 global_page_state(NR_ACTIVE), 1576 global_page_state(NR_ACTIVE),
1577 global_page_state(NR_INACTIVE), 1577 global_page_state(NR_INACTIVE),
1578 global_page_state(NR_FILE_DIRTY), 1578 global_page_state(NR_FILE_DIRTY),
1579 global_page_state(NR_WRITEBACK), 1579 global_page_state(NR_WRITEBACK),
1580 global_page_state(NR_UNSTABLE_NFS), 1580 global_page_state(NR_UNSTABLE_NFS),
1581 global_page_state(NR_FREE_PAGES), 1581 global_page_state(NR_FREE_PAGES),
1582 global_page_state(NR_SLAB_RECLAIMABLE) + 1582 global_page_state(NR_SLAB_RECLAIMABLE) +
1583 global_page_state(NR_SLAB_UNRECLAIMABLE), 1583 global_page_state(NR_SLAB_UNRECLAIMABLE),
1584 global_page_state(NR_FILE_MAPPED), 1584 global_page_state(NR_FILE_MAPPED),
1585 global_page_state(NR_PAGETABLE), 1585 global_page_state(NR_PAGETABLE),
1586 global_page_state(NR_BOUNCE)); 1586 global_page_state(NR_BOUNCE));
1587 1587
1588 for_each_zone(zone) { 1588 for_each_zone(zone) {
1589 int i; 1589 int i;
1590 1590
1591 if (!populated_zone(zone)) 1591 if (!populated_zone(zone))
1592 continue; 1592 continue;
1593 1593
1594 show_node(zone); 1594 show_node(zone);
1595 printk("%s" 1595 printk("%s"
1596 " free:%lukB" 1596 " free:%lukB"
1597 " min:%lukB" 1597 " min:%lukB"
1598 " low:%lukB" 1598 " low:%lukB"
1599 " high:%lukB" 1599 " high:%lukB"
1600 " active:%lukB" 1600 " active:%lukB"
1601 " inactive:%lukB" 1601 " inactive:%lukB"
1602 " present:%lukB" 1602 " present:%lukB"
1603 " pages_scanned:%lu" 1603 " pages_scanned:%lu"
1604 " all_unreclaimable? %s" 1604 " all_unreclaimable? %s"
1605 "\n", 1605 "\n",
1606 zone->name, 1606 zone->name,
1607 K(zone_page_state(zone, NR_FREE_PAGES)), 1607 K(zone_page_state(zone, NR_FREE_PAGES)),
1608 K(zone->pages_min), 1608 K(zone->pages_min),
1609 K(zone->pages_low), 1609 K(zone->pages_low),
1610 K(zone->pages_high), 1610 K(zone->pages_high),
1611 K(zone_page_state(zone, NR_ACTIVE)), 1611 K(zone_page_state(zone, NR_ACTIVE)),
1612 K(zone_page_state(zone, NR_INACTIVE)), 1612 K(zone_page_state(zone, NR_INACTIVE)),
1613 K(zone->present_pages), 1613 K(zone->present_pages),
1614 zone->pages_scanned, 1614 zone->pages_scanned,
1615 (zone->all_unreclaimable ? "yes" : "no") 1615 (zone->all_unreclaimable ? "yes" : "no")
1616 ); 1616 );
1617 printk("lowmem_reserve[]:"); 1617 printk("lowmem_reserve[]:");
1618 for (i = 0; i < MAX_NR_ZONES; i++) 1618 for (i = 0; i < MAX_NR_ZONES; i++)
1619 printk(" %lu", zone->lowmem_reserve[i]); 1619 printk(" %lu", zone->lowmem_reserve[i]);
1620 printk("\n"); 1620 printk("\n");
1621 } 1621 }
1622 1622
1623 for_each_zone(zone) { 1623 for_each_zone(zone) {
1624 unsigned long nr[MAX_ORDER], flags, order, total = 0; 1624 unsigned long nr[MAX_ORDER], flags, order, total = 0;
1625 1625
1626 if (!populated_zone(zone)) 1626 if (!populated_zone(zone))
1627 continue; 1627 continue;
1628 1628
1629 show_node(zone); 1629 show_node(zone);
1630 printk("%s: ", zone->name); 1630 printk("%s: ", zone->name);
1631 1631
1632 spin_lock_irqsave(&zone->lock, flags); 1632 spin_lock_irqsave(&zone->lock, flags);
1633 for (order = 0; order < MAX_ORDER; order++) { 1633 for (order = 0; order < MAX_ORDER; order++) {
1634 nr[order] = zone->free_area[order].nr_free; 1634 nr[order] = zone->free_area[order].nr_free;
1635 total += nr[order] << order; 1635 total += nr[order] << order;
1636 } 1636 }
1637 spin_unlock_irqrestore(&zone->lock, flags); 1637 spin_unlock_irqrestore(&zone->lock, flags);
1638 for (order = 0; order < MAX_ORDER; order++) 1638 for (order = 0; order < MAX_ORDER; order++)
1639 printk("%lu*%lukB ", nr[order], K(1UL) << order); 1639 printk("%lu*%lukB ", nr[order], K(1UL) << order);
1640 printk("= %lukB\n", K(total)); 1640 printk("= %lukB\n", K(total));
1641 } 1641 }
1642 1642
1643 show_swap_cache_info(); 1643 show_swap_cache_info();
1644 } 1644 }
1645 1645
1646 /* 1646 /*
1647 * Builds allocation fallback zone lists. 1647 * Builds allocation fallback zone lists.
1648 * 1648 *
1649 * Add all populated zones of a node to the zonelist. 1649 * Add all populated zones of a node to the zonelist.
1650 */ 1650 */
1651 static int __meminit build_zonelists_node(pg_data_t *pgdat, 1651 static int __meminit build_zonelists_node(pg_data_t *pgdat,
1652 struct zonelist *zonelist, int nr_zones, enum zone_type zone_type) 1652 struct zonelist *zonelist, int nr_zones, enum zone_type zone_type)
1653 { 1653 {
1654 struct zone *zone; 1654 struct zone *zone;
1655 1655
1656 BUG_ON(zone_type >= MAX_NR_ZONES); 1656 BUG_ON(zone_type >= MAX_NR_ZONES);
1657 zone_type++; 1657 zone_type++;
1658 1658
1659 do { 1659 do {
1660 zone_type--; 1660 zone_type--;
1661 zone = pgdat->node_zones + zone_type; 1661 zone = pgdat->node_zones + zone_type;
1662 if (populated_zone(zone)) { 1662 if (populated_zone(zone)) {
1663 zonelist->zones[nr_zones++] = zone; 1663 zonelist->zones[nr_zones++] = zone;
1664 check_highest_zone(zone_type); 1664 check_highest_zone(zone_type);
1665 } 1665 }
1666 1666
1667 } while (zone_type); 1667 } while (zone_type);
1668 return nr_zones; 1668 return nr_zones;
1669 } 1669 }
1670 1670
1671 #ifdef CONFIG_NUMA 1671 #ifdef CONFIG_NUMA
1672 #define MAX_NODE_LOAD (num_online_nodes()) 1672 #define MAX_NODE_LOAD (num_online_nodes())
1673 static int __meminitdata node_load[MAX_NUMNODES]; 1673 static int __meminitdata node_load[MAX_NUMNODES];
1674 /** 1674 /**
1675 * find_next_best_node - find the next node that should appear in a given node's fallback list 1675 * find_next_best_node - find the next node that should appear in a given node's fallback list
1676 * @node: node whose fallback list we're appending 1676 * @node: node whose fallback list we're appending
1677 * @used_node_mask: nodemask_t of already used nodes 1677 * @used_node_mask: nodemask_t of already used nodes
1678 * 1678 *
1679 * We use a number of factors to determine which is the next node that should 1679 * We use a number of factors to determine which is the next node that should
1680 * appear on a given node's fallback list. The node should not have appeared 1680 * appear on a given node's fallback list. The node should not have appeared
1681 * already in @node's fallback list, and it should be the next closest node 1681 * already in @node's fallback list, and it should be the next closest node
1682 * according to the distance array (which contains arbitrary distance values 1682 * according to the distance array (which contains arbitrary distance values
1683 * from each node to each node in the system), and should also prefer nodes 1683 * from each node to each node in the system), and should also prefer nodes
1684 * with no CPUs, since presumably they'll have very little allocation pressure 1684 * with no CPUs, since presumably they'll have very little allocation pressure
1685 * on them otherwise. 1685 * on them otherwise.
1686 * It returns -1 if no node is found. 1686 * It returns -1 if no node is found.
1687 */ 1687 */
1688 static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) 1688 static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
1689 { 1689 {
1690 int n, val; 1690 int n, val;
1691 int min_val = INT_MAX; 1691 int min_val = INT_MAX;
1692 int best_node = -1; 1692 int best_node = -1;
1693 1693
1694 /* Use the local node if we haven't already */ 1694 /* Use the local node if we haven't already */
1695 if (!node_isset(node, *used_node_mask)) { 1695 if (!node_isset(node, *used_node_mask)) {
1696 node_set(node, *used_node_mask); 1696 node_set(node, *used_node_mask);
1697 return node; 1697 return node;
1698 } 1698 }
1699 1699
1700 for_each_online_node(n) { 1700 for_each_online_node(n) {
1701 cpumask_t tmp; 1701 cpumask_t tmp;
1702 1702
1703 /* Don't want a node to appear more than once */ 1703 /* Don't want a node to appear more than once */
1704 if (node_isset(n, *used_node_mask)) 1704 if (node_isset(n, *used_node_mask))
1705 continue; 1705 continue;
1706 1706
1707 /* Use the distance array to find the distance */ 1707 /* Use the distance array to find the distance */
1708 val = node_distance(node, n); 1708 val = node_distance(node, n);
1709 1709
1710 /* Penalize nodes under us ("prefer the next node") */ 1710 /* Penalize nodes under us ("prefer the next node") */
1711 val += (n < node); 1711 val += (n < node);
1712 1712
1713 /* Give preference to headless and unused nodes */ 1713 /* Give preference to headless and unused nodes */
1714 tmp = node_to_cpumask(n); 1714 tmp = node_to_cpumask(n);
1715 if (!cpus_empty(tmp)) 1715 if (!cpus_empty(tmp))
1716 val += PENALTY_FOR_NODE_WITH_CPUS; 1716 val += PENALTY_FOR_NODE_WITH_CPUS;
1717 1717
1718 /* Slight preference for less loaded node */ 1718 /* Slight preference for less loaded node */
1719 val *= (MAX_NODE_LOAD*MAX_NUMNODES); 1719 val *= (MAX_NODE_LOAD*MAX_NUMNODES);
1720 val += node_load[n]; 1720 val += node_load[n];
1721 1721
1722 if (val < min_val) { 1722 if (val < min_val) {
1723 min_val = val; 1723 min_val = val;
1724 best_node = n; 1724 best_node = n;
1725 } 1725 }
1726 } 1726 }
1727 1727
1728 if (best_node >= 0) 1728 if (best_node >= 0)
1729 node_set(best_node, *used_node_mask); 1729 node_set(best_node, *used_node_mask);
1730 1730
1731 return best_node; 1731 return best_node;
1732 } 1732 }
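
The scoring above boils down to: distance dominates, headless nodes and "next" nodes get a small bonus, and node_load only breaks ties because the distance term is scaled up first. A standalone sketch with made-up distances (the penalty constant is illustrative, not the kernel's value):

    #include <stdio.h>
    #include <limits.h>

    #define MAX_NUMNODES 4
    #define PENALTY_FOR_NODE_WITH_CPUS 1    /* illustrative value */

    int main(void)
    {
            /* Hypothetical distances from node 0 to each node. */
            int distance[MAX_NUMNODES]  = { 10, 20, 20, 40 };
            int has_cpus[MAX_NUMNODES]  = { 1, 1, 0, 1 };
            int node_load[MAX_NUMNODES] = { 0, 0, 0, 0 };
            int nr_online = MAX_NUMNODES;
            int node = 0, best = -1, min_val = INT_MAX;

            for (int n = 1; n < MAX_NUMNODES; n++) {    /* node 0 already used */
                    int val = distance[n];
                    val += (n < node);                  /* prefer the next node */
                    if (has_cpus[n])
                            val += PENALTY_FOR_NODE_WITH_CPUS; /* prefer headless nodes */
                    val *= nr_online * MAX_NUMNODES;    /* distance dominates ...   */
                    val += node_load[n];                /* ... load only breaks ties */
                    if (val < min_val) {
                            min_val = val;
                            best = n;
                    }
            }
            printf("best fallback node: %d\n", best);   /* node 2: same distance as 1, no CPUs */
            return 0;
    }
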
1733 1733
1734 static void __meminit build_zonelists(pg_data_t *pgdat) 1734 static void __meminit build_zonelists(pg_data_t *pgdat)
1735 { 1735 {
1736 int j, node, local_node; 1736 int j, node, local_node;
1737 enum zone_type i; 1737 enum zone_type i;
1738 int prev_node, load; 1738 int prev_node, load;
1739 struct zonelist *zonelist; 1739 struct zonelist *zonelist;
1740 nodemask_t used_mask; 1740 nodemask_t used_mask;
1741 1741
1742 /* initialize zonelists */ 1742 /* initialize zonelists */
1743 for (i = 0; i < MAX_NR_ZONES; i++) { 1743 for (i = 0; i < MAX_NR_ZONES; i++) {
1744 zonelist = pgdat->node_zonelists + i; 1744 zonelist = pgdat->node_zonelists + i;
1745 zonelist->zones[0] = NULL; 1745 zonelist->zones[0] = NULL;
1746 } 1746 }
1747 1747
1748 /* NUMA-aware ordering of nodes */ 1748 /* NUMA-aware ordering of nodes */
1749 local_node = pgdat->node_id; 1749 local_node = pgdat->node_id;
1750 load = num_online_nodes(); 1750 load = num_online_nodes();
1751 prev_node = local_node; 1751 prev_node = local_node;
1752 nodes_clear(used_mask); 1752 nodes_clear(used_mask);
1753 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 1753 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
1754 int distance = node_distance(local_node, node); 1754 int distance = node_distance(local_node, node);
1755 1755
1756 /* 1756 /*
1757 * If another node is sufficiently far away then it is better 1757 * If another node is sufficiently far away then it is better
1758 * to reclaim pages in a zone before going off node. 1758 * to reclaim pages in a zone before going off node.
1759 */ 1759 */
1760 if (distance > RECLAIM_DISTANCE) 1760 if (distance > RECLAIM_DISTANCE)
1761 zone_reclaim_mode = 1; 1761 zone_reclaim_mode = 1;
1762 1762
1763 /* 1763 /*
1764 * We don't want to pressure a particular node. 1764 * We don't want to pressure a particular node.
1765 * So we add a penalty to the first node in the same 1765 * So we add a penalty to the first node in the same
1766 * distance group to make the selection round-robin. 1766 * distance group to make the selection round-robin.
1767 */ 1767 */
1768 1768
1769 if (distance != node_distance(local_node, prev_node)) 1769 if (distance != node_distance(local_node, prev_node))
1770 node_load[node] += load; 1770 node_load[node] += load;
1771 prev_node = node; 1771 prev_node = node;
1772 load--; 1772 load--;
1773 for (i = 0; i < MAX_NR_ZONES; i++) { 1773 for (i = 0; i < MAX_NR_ZONES; i++) {
1774 zonelist = pgdat->node_zonelists + i; 1774 zonelist = pgdat->node_zonelists + i;
1775 for (j = 0; zonelist->zones[j] != NULL; j++); 1775 for (j = 0; zonelist->zones[j] != NULL; j++);
1776 1776
1777 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 1777 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1778 zonelist->zones[j] = NULL; 1778 zonelist->zones[j] = NULL;
1779 } 1779 }
1780 } 1780 }
1781 } 1781 }
1782 1782
1783 /* Construct the zonelist performance cache - see further mmzone.h */ 1783 /* Construct the zonelist performance cache - see further mmzone.h */
1784 static void __meminit build_zonelist_cache(pg_data_t *pgdat) 1784 static void __meminit build_zonelist_cache(pg_data_t *pgdat)
1785 { 1785 {
1786 int i; 1786 int i;
1787 1787
1788 for (i = 0; i < MAX_NR_ZONES; i++) { 1788 for (i = 0; i < MAX_NR_ZONES; i++) {
1789 struct zonelist *zonelist; 1789 struct zonelist *zonelist;
1790 struct zonelist_cache *zlc; 1790 struct zonelist_cache *zlc;
1791 struct zone **z; 1791 struct zone **z;
1792 1792
1793 zonelist = pgdat->node_zonelists + i; 1793 zonelist = pgdat->node_zonelists + i;
1794 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 1794 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
1795 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1795 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1796 for (z = zonelist->zones; *z; z++) 1796 for (z = zonelist->zones; *z; z++)
1797 zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); 1797 zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
1798 } 1798 }
1799 } 1799 }
1800 1800
1801 #else /* CONFIG_NUMA */ 1801 #else /* CONFIG_NUMA */
1802 1802
1803 static void __meminit build_zonelists(pg_data_t *pgdat) 1803 static void __meminit build_zonelists(pg_data_t *pgdat)
1804 { 1804 {
1805 int node, local_node; 1805 int node, local_node;
1806 enum zone_type i,j; 1806 enum zone_type i,j;
1807 1807
1808 local_node = pgdat->node_id; 1808 local_node = pgdat->node_id;
1809 for (i = 0; i < MAX_NR_ZONES; i++) { 1809 for (i = 0; i < MAX_NR_ZONES; i++) {
1810 struct zonelist *zonelist; 1810 struct zonelist *zonelist;
1811 1811
1812 zonelist = pgdat->node_zonelists + i; 1812 zonelist = pgdat->node_zonelists + i;
1813 1813
1814 j = build_zonelists_node(pgdat, zonelist, 0, i); 1814 j = build_zonelists_node(pgdat, zonelist, 0, i);
1815 /* 1815 /*
1816 * Now we build the zonelist so that it contains the zones 1816 * Now we build the zonelist so that it contains the zones
1817 * of all the other nodes. 1817 * of all the other nodes.
1818 * We don't want to pressure a particular node, so when 1818 * We don't want to pressure a particular node, so when
1819 * building the zones for node N, we make sure that the 1819 * building the zones for node N, we make sure that the
1820 * zones coming right after the local ones are those from 1820 * zones coming right after the local ones are those from
1821 * node N+1 (wrapping around past the last node). 1821 * node N+1 (wrapping around past the last node).
1822 */ 1822 */
1823 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 1823 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
1824 if (!node_online(node)) 1824 if (!node_online(node))
1825 continue; 1825 continue;
1826 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 1826 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1827 } 1827 }
1828 for (node = 0; node < local_node; node++) { 1828 for (node = 0; node < local_node; node++) {
1829 if (!node_online(node)) 1829 if (!node_online(node))
1830 continue; 1830 continue;
1831 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 1831 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1832 } 1832 }
1833 1833
1834 zonelist->zones[j] = NULL; 1834 zonelist->zones[j] = NULL;
1835 } 1835 }
1836 } 1836 }
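
The two loops above simply rotate the node order so that node N's fallback list continues at N+1 and wraps around. A standalone sketch of the visiting order:

    #include <stdio.h>

    #define MAX_NUMNODES 4

    int main(void)
    {
            int local_node = 2;

            /* Same visiting order as the loops above: local node first (implicit),
             * then local_node+1 .. MAX_NUMNODES-1, then 0 .. local_node-1. */
            printf("fallback order from node %d:", local_node);
            for (int node = local_node + 1; node < MAX_NUMNODES; node++)
                    printf(" %d", node);
            for (int node = 0; node < local_node; node++)
                    printf(" %d", node);
            printf("\n");                   /* prints: 3 0 1 */
            return 0;
    }
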
1837 1837
1838 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 1838 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
1839 static void __meminit build_zonelist_cache(pg_data_t *pgdat) 1839 static void __meminit build_zonelist_cache(pg_data_t *pgdat)
1840 { 1840 {
1841 int i; 1841 int i;
1842 1842
1843 for (i = 0; i < MAX_NR_ZONES; i++) 1843 for (i = 0; i < MAX_NR_ZONES; i++)
1844 pgdat->node_zonelists[i].zlcache_ptr = NULL; 1844 pgdat->node_zonelists[i].zlcache_ptr = NULL;
1845 } 1845 }
1846 1846
1847 #endif /* CONFIG_NUMA */ 1847 #endif /* CONFIG_NUMA */
1848 1848
1849 /* The return value is an int just to satisfy stop_machine_run() */ 1849 /* The return value is an int just to satisfy stop_machine_run() */
1850 static int __meminit __build_all_zonelists(void *dummy) 1850 static int __meminit __build_all_zonelists(void *dummy)
1851 { 1851 {
1852 int nid; 1852 int nid;
1853 1853
1854 for_each_online_node(nid) { 1854 for_each_online_node(nid) {
1855 build_zonelists(NODE_DATA(nid)); 1855 build_zonelists(NODE_DATA(nid));
1856 build_zonelist_cache(NODE_DATA(nid)); 1856 build_zonelist_cache(NODE_DATA(nid));
1857 } 1857 }
1858 return 0; 1858 return 0;
1859 } 1859 }
1860 1860
1861 void __meminit build_all_zonelists(void) 1861 void __meminit build_all_zonelists(void)
1862 { 1862 {
1863 if (system_state == SYSTEM_BOOTING) { 1863 if (system_state == SYSTEM_BOOTING) {
1864 __build_all_zonelists(NULL); 1864 __build_all_zonelists(NULL);
1865 cpuset_init_current_mems_allowed(); 1865 cpuset_init_current_mems_allowed();
1866 } else { 1866 } else {
1867 /* we have to stop all cpus to guarantee there is no user 1867 /* we have to stop all cpus to guarantee there is no user
1868 of zonelist */ 1868 of zonelist */
1869 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); 1869 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
1870 /* cpuset refresh routine should be here */ 1870 /* cpuset refresh routine should be here */
1871 } 1871 }
1872 vm_total_pages = nr_free_pagecache_pages(); 1872 vm_total_pages = nr_free_pagecache_pages();
1873 printk("Built %i zonelists. Total pages: %ld\n", 1873 printk("Built %i zonelists. Total pages: %ld\n",
1874 num_online_nodes(), vm_total_pages); 1874 num_online_nodes(), vm_total_pages);
1875 } 1875 }
1876 1876
1877 /* 1877 /*
1878 * Helper functions to size the waitqueue hash table. 1878 * Helper functions to size the waitqueue hash table.
1879 * Essentially these want to choose hash table sizes sufficiently 1879 * Essentially these want to choose hash table sizes sufficiently
1880 * large so that collisions trying to wait on pages are rare. 1880 * large so that collisions trying to wait on pages are rare.
1881 * But in fact, the number of active page waitqueues on typical 1881 * But in fact, the number of active page waitqueues on typical
1882 * systems is ridiculously low, less than 200. So this is even 1882 * systems is ridiculously low, less than 200. So this is even
1883 * conservative, even though it seems large. 1883 * conservative, even though it seems large.
1884 * 1884 *
1885 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to 1885 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
1886 * waitqueues, i.e. the size of the waitq table given the number of pages. 1886 * waitqueues, i.e. the size of the waitq table given the number of pages.
1887 */ 1887 */
1888 #define PAGES_PER_WAITQUEUE 256 1888 #define PAGES_PER_WAITQUEUE 256
1889 1889
1890 #ifndef CONFIG_MEMORY_HOTPLUG 1890 #ifndef CONFIG_MEMORY_HOTPLUG
1891 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 1891 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1892 { 1892 {
1893 unsigned long size = 1; 1893 unsigned long size = 1;
1894 1894
1895 pages /= PAGES_PER_WAITQUEUE; 1895 pages /= PAGES_PER_WAITQUEUE;
1896 1896
1897 while (size < pages) 1897 while (size < pages)
1898 size <<= 1; 1898 size <<= 1;
1899 1899
1900 /* 1900 /*
1901 * Once we have dozens or even hundreds of threads sleeping 1901 * Once we have dozens or even hundreds of threads sleeping
1902 * on IO we've got bigger problems than wait queue collision. 1902 * on IO we've got bigger problems than wait queue collision.
1903 * Limit the size of the wait table to a reasonable size. 1903 * Limit the size of the wait table to a reasonable size.
1904 */ 1904 */
1905 size = min(size, 4096UL); 1905 size = min(size, 4096UL);
1906 1906
1907 return max(size, 4UL); 1907 return max(size, 4UL);
1908 } 1908 }
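
A standalone restatement of the sizing above, with min()/max() spelled out as plain comparisons, showing the resulting table size for a couple of zone sizes (page counts in the comments assume 4 KB pages):

    #include <stdio.h>

    #define PAGES_PER_WAITQUEUE 256

    static unsigned long wait_table_hash_nr_entries(unsigned long pages)
    {
            unsigned long size = 1;

            pages /= PAGES_PER_WAITQUEUE;
            while (size < pages)
                    size <<= 1;                 /* round up to a power of two */
            if (size > 4096UL)
                    size = 4096UL;              /* cap the table size */
            return size < 4UL ? 4UL : size;     /* and keep a sane minimum */
    }

    int main(void)
    {
            /* A 1 GB zone has 262144 pages. */
            printf("entries for 262144 pages:  %lu\n",
                   wait_table_hash_nr_entries(262144UL));   /* 1024 */
            /* A 16 GB zone hits the 4096-entry cap. */
            printf("entries for 4194304 pages: %lu\n",
                   wait_table_hash_nr_entries(4194304UL));  /* 4096 */
            return 0;
    }
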
1909 #else 1909 #else
1910 /* 1910 /*
1911 * A zone's size might be changed by hot-add, so it is not possible to determine 1911 * A zone's size might be changed by hot-add, so it is not possible to determine
1912 * a suitable size for its wait_table. So we use the maximum size now. 1912 * a suitable size for its wait_table. So we use the maximum size now.
1913 * 1913 *
1914 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: 1914 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
1915 * 1915 *
1916 * i386 (preemption config) : 4096 x 16 = 64Kbyte. 1916 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
1917 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. 1917 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
1918 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. 1918 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
1919 * 1919 *
1920 * The maximum entries are prepared when a zone's memory is (512K + 256) pages 1920 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
1921 * or more by the traditional way. (See above). It equals: 1921 * or more by the traditional way. (See above). It equals:
1922 * 1922 *
1923 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. 1923 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
1924 * ia64(16K page size) : = ( 8G + 4M)byte. 1924 * ia64(16K page size) : = ( 8G + 4M)byte.
1925 * powerpc (64K page size) : = (32G +16M)byte. 1925 * powerpc (64K page size) : = (32G +16M)byte.
1926 */ 1926 */
1927 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) 1927 static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1928 { 1928 {
1929 return 4096UL; 1929 return 4096UL;
1930 } 1930 }
1931 #endif 1931 #endif
1932 1932
1933 /* 1933 /*
1934 * This is an integer logarithm so that shifts can be used later 1934 * This is an integer logarithm so that shifts can be used later
1935 * to extract the more random high bits from the multiplicative 1935 * to extract the more random high bits from the multiplicative
1936 * hash function before the remainder is taken. 1936 * hash function before the remainder is taken.
1937 */ 1937 */
1938 static inline unsigned long wait_table_bits(unsigned long size) 1938 static inline unsigned long wait_table_bits(unsigned long size)
1939 { 1939 {
1940 return ffz(~size); 1940 return ffz(~size);
1941 } 1941 }
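
For the power-of-two sizes returned above, ffz(~size) is just the index of the single set bit, i.e. log2(size). A portable stand-in that makes the result concrete:

    #include <stdio.h>

    /* Loop-based equivalent of ffz(~size) for power-of-two sizes. */
    static unsigned long wait_table_bits(unsigned long size)
    {
            unsigned long bits = 0;

            while (size > 1) {
                    size >>= 1;
                    bits++;
            }
            return bits;
    }

    int main(void)
    {
            printf("bits for a 4096-entry table: %lu\n", wait_table_bits(4096UL)); /* 12 */
            return 0;
    }
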
1942 1942
1943 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 1943 #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
1944 1944
1945 /* 1945 /*
1946 * Initially all pages are reserved - free ones are freed 1946 * Initially all pages are reserved - free ones are freed
1947 * up by free_all_bootmem() once the early boot process is 1947 * up by free_all_bootmem() once the early boot process is
1948 * done. Non-atomic initialization, single-pass. 1948 * done. Non-atomic initialization, single-pass.
1949 */ 1949 */
1950 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 1950 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1951 unsigned long start_pfn, enum memmap_context context) 1951 unsigned long start_pfn, enum memmap_context context)
1952 { 1952 {
1953 struct page *page; 1953 struct page *page;
1954 unsigned long end_pfn = start_pfn + size; 1954 unsigned long end_pfn = start_pfn + size;
1955 unsigned long pfn; 1955 unsigned long pfn;
1956 1956
1957 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 1957 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1958 /* 1958 /*
1959 * There can be holes in boot-time mem_map[]s 1959 * There can be holes in boot-time mem_map[]s
1960 * handed to this function. They do not 1960 * handed to this function. They do not
1961 * exist on hotplugged memory. 1961 * exist on hotplugged memory.
1962 */ 1962 */
1963 if (context == MEMMAP_EARLY) { 1963 if (context == MEMMAP_EARLY) {
1964 if (!early_pfn_valid(pfn)) 1964 if (!early_pfn_valid(pfn))
1965 continue; 1965 continue;
1966 if (!early_pfn_in_nid(pfn, nid)) 1966 if (!early_pfn_in_nid(pfn, nid))
1967 continue; 1967 continue;
1968 } 1968 }
1969 page = pfn_to_page(pfn); 1969 page = pfn_to_page(pfn);
1970 set_page_links(page, zone, nid, pfn); 1970 set_page_links(page, zone, nid, pfn);
1971 init_page_count(page); 1971 init_page_count(page);
1972 reset_page_mapcount(page); 1972 reset_page_mapcount(page);
1973 SetPageReserved(page); 1973 SetPageReserved(page);
1974 INIT_LIST_HEAD(&page->lru); 1974 INIT_LIST_HEAD(&page->lru);
1975 #ifdef WANT_PAGE_VIRTUAL 1975 #ifdef WANT_PAGE_VIRTUAL
1976 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 1976 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
1977 if (!is_highmem_idx(zone)) 1977 if (!is_highmem_idx(zone))
1978 set_page_address(page, __va(pfn << PAGE_SHIFT)); 1978 set_page_address(page, __va(pfn << PAGE_SHIFT));
1979 #endif 1979 #endif
1980 } 1980 }
1981 } 1981 }
1982 1982
1983 void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, 1983 void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
1984 unsigned long size) 1984 unsigned long size)
1985 { 1985 {
1986 int order; 1986 int order;
1987 for (order = 0; order < MAX_ORDER ; order++) { 1987 for (order = 0; order < MAX_ORDER ; order++) {
1988 INIT_LIST_HEAD(&zone->free_area[order].free_list); 1988 INIT_LIST_HEAD(&zone->free_area[order].free_list);
1989 zone->free_area[order].nr_free = 0; 1989 zone->free_area[order].nr_free = 0;
1990 } 1990 }
1991 } 1991 }
1992 1992
1993 #ifndef __HAVE_ARCH_MEMMAP_INIT 1993 #ifndef __HAVE_ARCH_MEMMAP_INIT
1994 #define memmap_init(size, nid, zone, start_pfn) \ 1994 #define memmap_init(size, nid, zone, start_pfn) \
1995 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 1995 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
1996 #endif 1996 #endif
1997 1997
1998 static int __cpuinit zone_batchsize(struct zone *zone) 1998 static int __cpuinit zone_batchsize(struct zone *zone)
1999 { 1999 {
2000 int batch; 2000 int batch;
2001 2001
2002 /* 2002 /*
2003 * The per-cpu-pages pools are set to around 1000th of the 2003 * The per-cpu-pages pools are set to around 1000th of the
2004 * size of the zone. But no more than 1/2 of a meg. 2004 * size of the zone. But no more than 1/2 of a meg.
2005 * 2005 *
2006 * OK, so we don't know how big the cache is. So guess. 2006 * OK, so we don't know how big the cache is. So guess.
2007 */ 2007 */
2008 batch = zone->present_pages / 1024; 2008 batch = zone->present_pages / 1024;
2009 if (batch * PAGE_SIZE > 512 * 1024) 2009 if (batch * PAGE_SIZE > 512 * 1024)
2010 batch = (512 * 1024) / PAGE_SIZE; 2010 batch = (512 * 1024) / PAGE_SIZE;
2011 batch /= 4; /* We effectively *= 4 below */ 2011 batch /= 4; /* We effectively *= 4 below */
2012 if (batch < 1) 2012 if (batch < 1)
2013 batch = 1; 2013 batch = 1;
2014 2014
2015 /* 2015 /*
2016 * Clamp the batch to a 2^n - 1 value. Having a power 2016 * Clamp the batch to a 2^n - 1 value. Having a power
2017 * of 2 value was found to be more likely to have 2017 * of 2 value was found to be more likely to have
2018 * suboptimal cache aliasing properties in some cases. 2018 * suboptimal cache aliasing properties in some cases.
2019 * 2019 *
2020 * For example if 2 tasks are alternately allocating 2020 * For example if 2 tasks are alternately allocating
2021 * batches of pages, one task can end up with a lot 2021 * batches of pages, one task can end up with a lot
2022 * of pages of one half of the possible page colors 2022 * of pages of one half of the possible page colors
2023 * and the other with pages of the other colors. 2023 * and the other with pages of the other colors.
2024 */ 2024 */
2025 batch = (1 << (fls(batch + batch/2)-1)) - 1; 2025 batch = (1 << (fls(batch + batch/2)-1)) - 1;
2026 2026
2027 return batch; 2027 return batch;
2028 } 2028 }
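
A quick way to see what this heuristic yields: the standalone sketch below (not part of the patch) replays the same arithmetic in userspace. PAGE_SIZE is assumed to be 4096 here and fls_u() is a stand-in for the kernel's fls().

#include <stdio.h>

static int fls_u(unsigned long x)       /* userspace stand-in for kernel fls() */
{
        int bit = 0;
        while (x) {
                bit++;
                x >>= 1;
        }
        return bit;
}

static int batch_for(unsigned long present_pages, unsigned long page_size)
{
        int batch = present_pages / 1024;               /* ~1/1000th of the zone */
        if ((unsigned long)batch * page_size > 512 * 1024)
                batch = (512 * 1024) / page_size;       /* cap at 512KB worth of pages */
        batch /= 4;                                     /* effectively *= 4 in the pcp limits */
        if (batch < 1)
                batch = 1;
        return (1 << (fls_u(batch + batch / 2) - 1)) - 1; /* clamp to 2^n - 1 */
}

int main(void)
{
        printf("128MiB zone: batch %d\n", batch_for(32768, 4096));   /* 7 */
        printf("1GiB zone:   batch %d\n", batch_for(262144, 4096));  /* 31 */
        printf("4GiB zone:   batch %d\n", batch_for(1048576, 4096)); /* 31 */
        return 0;
}

Because of the 512KB cap, every zone of 512MiB or more ends up with the same value, batch = 31 (for 4KiB pages), which is what the per-cpu list sizing below builds on.
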
2029 2029
2030 inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 2030 inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2031 { 2031 {
2032 struct per_cpu_pages *pcp; 2032 struct per_cpu_pages *pcp;
2033 2033
2034 memset(p, 0, sizeof(*p)); 2034 memset(p, 0, sizeof(*p));
2035 2035
2036 pcp = &p->pcp[0]; /* hot */ 2036 pcp = &p->pcp[0]; /* hot */
2037 pcp->count = 0; 2037 pcp->count = 0;
2038 pcp->high = 6 * batch; 2038 pcp->high = 6 * batch;
2039 pcp->batch = max(1UL, 1 * batch); 2039 pcp->batch = max(1UL, 1 * batch);
2040 INIT_LIST_HEAD(&pcp->list); 2040 INIT_LIST_HEAD(&pcp->list);
2041 2041
2042 pcp = &p->pcp[1]; /* cold */ 2042 pcp = &p->pcp[1]; /* cold */
2043 pcp->count = 0; 2043 pcp->count = 0;
2044 pcp->high = 2 * batch; 2044 pcp->high = 2 * batch;
2045 pcp->batch = max(1UL, batch/2); 2045 pcp->batch = max(1UL, batch/2);
2046 INIT_LIST_HEAD(&pcp->list); 2046 INIT_LIST_HEAD(&pcp->list);
2047 } 2047 }
2048 2048
2049 /* 2049 /*
2050 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist 2050 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
2051 * to the value high for the pageset p. 2051 * to the value high for the pageset p.
2052 */ 2052 */
2053 2053
2054 static void setup_pagelist_highmark(struct per_cpu_pageset *p, 2054 static void setup_pagelist_highmark(struct per_cpu_pageset *p,
2055 unsigned long high) 2055 unsigned long high)
2056 { 2056 {
2057 struct per_cpu_pages *pcp; 2057 struct per_cpu_pages *pcp;
2058 2058
2059 pcp = &p->pcp[0]; /* hot list */ 2059 pcp = &p->pcp[0]; /* hot list */
2060 pcp->high = high; 2060 pcp->high = high;
2061 pcp->batch = max(1UL, high/4); 2061 pcp->batch = max(1UL, high/4);
2062 if ((high/4) > (PAGE_SHIFT * 8)) 2062 if ((high/4) > (PAGE_SHIFT * 8))
2063 pcp->batch = PAGE_SHIFT * 8; 2063 pcp->batch = PAGE_SHIFT * 8;
2064 } 2064 }
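
To make the effect of the two routines above concrete, the sketch below plugs in a hypothetical batch of 31 (the value derived earlier for a large zone) and a hypothetical percpu_pagelist_fraction of 8 for a 1GiB zone; PAGE_SHIFT is assumed to be 12.

#include <stdio.h>

#define PAGE_SHIFT 12   /* assumed: 4KiB pages */

int main(void)
{
        unsigned long batch = 31;               /* from zone_batchsize() above */
        unsigned long present_pages = 262144;   /* hypothetical 1GiB zone */
        unsigned long fraction = 8;             /* hypothetical sysctl value */
        unsigned long high;

        /* setup_pageset(): hot list drains at 6*batch, cold list at 2*batch */
        printf("hot:  high=%lu batch=%lu\n", 6 * batch, batch);      /* 186, 31 */
        printf("cold: high=%lu batch=%lu\n", 2 * batch, batch / 2);  /* 62, 15 */

        /* setup_pagelist_highmark(): override driven by percpu_pagelist_fraction */
        high = present_pages / fraction;
        printf("hot override: high=%lu batch=%lu\n", high,
               (high / 4) > (unsigned long)(PAGE_SHIFT * 8) ?
                        (unsigned long)(PAGE_SHIFT * 8) : high / 4); /* 32768, 96 */
        return 0;
}

The max(1UL, ...) guards in setup_pageset() only matter for the tiny-zone case, where zone_batchsize() can come out as 0.
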
2065 2065
2066 2066
2067 #ifdef CONFIG_NUMA 2067 #ifdef CONFIG_NUMA
2068 /* 2068 /*
2069 * Boot pageset table. One per cpu which is going to be used for all 2069 * Boot pageset table. One per cpu which is going to be used for all
2070 * zones and all nodes. The parameters will be set in such a way 2070 * zones and all nodes. The parameters will be set in such a way
2071 * that an item put on a list will immediately be handed over to 2071 * that an item put on a list will immediately be handed over to
2072 * the buddy list. This is safe since pageset manipulation is done 2072 * the buddy list. This is safe since pageset manipulation is done
2073 * with interrupts disabled. 2073 * with interrupts disabled.
2074 * 2074 *
2075 * Some NUMA counter updates may also be caught by the boot pagesets. 2075 * Some NUMA counter updates may also be caught by the boot pagesets.
2076 * 2076 *
2077 * The boot_pagesets must be kept even after bootup is complete for 2077 * The boot_pagesets must be kept even after bootup is complete for
2078 * unused processors and/or zones. They do play a role for bootstrapping 2078 * unused processors and/or zones. They do play a role for bootstrapping
2079 * hotplugged processors. 2079 * hotplugged processors.
2080 * 2080 *
2081 * zoneinfo_show() and maybe other functions do 2081 * zoneinfo_show() and maybe other functions do
2082 * not check if the processor is online before following the pageset pointer. 2082 * not check if the processor is online before following the pageset pointer.
2083 * Other parts of the kernel may not check if the zone is available. 2083 * Other parts of the kernel may not check if the zone is available.
2084 */ 2084 */
2085 static struct per_cpu_pageset boot_pageset[NR_CPUS]; 2085 static struct per_cpu_pageset boot_pageset[NR_CPUS];
2086 2086
2087 /* 2087 /*
2088 * Dynamically allocate memory for the 2088 * Dynamically allocate memory for the
2089 * per cpu pageset array in struct zone. 2089 * per cpu pageset array in struct zone.
2090 */ 2090 */
2091 static int __cpuinit process_zones(int cpu) 2091 static int __cpuinit process_zones(int cpu)
2092 { 2092 {
2093 struct zone *zone, *dzone; 2093 struct zone *zone, *dzone;
2094 2094
2095 for_each_zone(zone) { 2095 for_each_zone(zone) {
2096 2096
2097 if (!populated_zone(zone)) 2097 if (!populated_zone(zone))
2098 continue; 2098 continue;
2099 2099
2100 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 2100 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
2101 GFP_KERNEL, cpu_to_node(cpu)); 2101 GFP_KERNEL, cpu_to_node(cpu));
2102 if (!zone_pcp(zone, cpu)) 2102 if (!zone_pcp(zone, cpu))
2103 goto bad; 2103 goto bad;
2104 2104
2105 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); 2105 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
2106 2106
2107 if (percpu_pagelist_fraction) 2107 if (percpu_pagelist_fraction)
2108 setup_pagelist_highmark(zone_pcp(zone, cpu), 2108 setup_pagelist_highmark(zone_pcp(zone, cpu),
2109 (zone->present_pages / percpu_pagelist_fraction)); 2109 (zone->present_pages / percpu_pagelist_fraction));
2110 } 2110 }
2111 2111
2112 return 0; 2112 return 0;
2113 bad: 2113 bad:
2114 for_each_zone(dzone) { 2114 for_each_zone(dzone) {
2115 if (dzone == zone) 2115 if (dzone == zone)
2116 break; 2116 break;
2117 kfree(zone_pcp(dzone, cpu)); 2117 kfree(zone_pcp(dzone, cpu));
2118 zone_pcp(dzone, cpu) = NULL; 2118 zone_pcp(dzone, cpu) = NULL;
2119 } 2119 }
2120 return -ENOMEM; 2120 return -ENOMEM;
2121 } 2121 }
2122 2122
2123 static inline void free_zone_pagesets(int cpu) 2123 static inline void free_zone_pagesets(int cpu)
2124 { 2124 {
2125 struct zone *zone; 2125 struct zone *zone;
2126 2126
2127 for_each_zone(zone) { 2127 for_each_zone(zone) {
2128 struct per_cpu_pageset *pset = zone_pcp(zone, cpu); 2128 struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
2129 2129
2130 /* Free per_cpu_pageset if it is slab allocated */ 2130 /* Free per_cpu_pageset if it is slab allocated */
2131 if (pset != &boot_pageset[cpu]) 2131 if (pset != &boot_pageset[cpu])
2132 kfree(pset); 2132 kfree(pset);
2133 zone_pcp(zone, cpu) = NULL; 2133 zone_pcp(zone, cpu) = NULL;
2134 } 2134 }
2135 } 2135 }
2136 2136
2137 static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, 2137 static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
2138 unsigned long action, 2138 unsigned long action,
2139 void *hcpu) 2139 void *hcpu)
2140 { 2140 {
2141 int cpu = (long)hcpu; 2141 int cpu = (long)hcpu;
2142 int ret = NOTIFY_OK; 2142 int ret = NOTIFY_OK;
2143 2143
2144 switch (action) { 2144 switch (action) {
2145 case CPU_UP_PREPARE: 2145 case CPU_UP_PREPARE:
2146 if (process_zones(cpu)) 2146 if (process_zones(cpu))
2147 ret = NOTIFY_BAD; 2147 ret = NOTIFY_BAD;
2148 break; 2148 break;
2149 case CPU_UP_CANCELED: 2149 case CPU_UP_CANCELED:
2150 case CPU_DEAD: 2150 case CPU_DEAD:
2151 free_zone_pagesets(cpu); 2151 free_zone_pagesets(cpu);
2152 break; 2152 break;
2153 default: 2153 default:
2154 break; 2154 break;
2155 } 2155 }
2156 return ret; 2156 return ret;
2157 } 2157 }
2158 2158
2159 static struct notifier_block __cpuinitdata pageset_notifier = 2159 static struct notifier_block __cpuinitdata pageset_notifier =
2160 { &pageset_cpuup_callback, NULL, 0 }; 2160 { &pageset_cpuup_callback, NULL, 0 };
2161 2161
2162 void __init setup_per_cpu_pageset(void) 2162 void __init setup_per_cpu_pageset(void)
2163 { 2163 {
2164 int err; 2164 int err;
2165 2165
2166 /* Initialize per_cpu_pageset for cpu 0. 2166 /* Initialize per_cpu_pageset for cpu 0.
2167 * A cpuup callback will do this for every cpu 2167 * A cpuup callback will do this for every cpu
2168 * as it comes online 2168 * as it comes online
2169 */ 2169 */
2170 err = process_zones(smp_processor_id()); 2170 err = process_zones(smp_processor_id());
2171 BUG_ON(err); 2171 BUG_ON(err);
2172 register_cpu_notifier(&pageset_notifier); 2172 register_cpu_notifier(&pageset_notifier);
2173 } 2173 }
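
The hotplug protocol above (allocate on CPU_UP_PREPARE, free on CPU_UP_CANCELED or CPU_DEAD) can be exercised on its own. Everything below is a userspace stand-in: the notifier constants, mock_process_zones() and the fail_alloc knob are invented for illustration and are not the kernel's definitions.

#include <stdio.h>

/* Stand-in notifier constants (values are illustrative, not the kernel's) */
enum { NOTIFY_OK, NOTIFY_BAD };
enum { CPU_UP_PREPARE, CPU_UP_CANCELED, CPU_ONLINE, CPU_DEAD };

static int allocated[4];        /* one "pageset" flag per mock CPU */
static int fail_alloc;          /* force the allocation to fail */

static int mock_process_zones(int cpu)
{
        if (fail_alloc)
                return -1;      /* the kmalloc_node() in process_zones() failed */
        allocated[cpu] = 1;
        return 0;
}

static void mock_free_zone_pagesets(int cpu)
{
        allocated[cpu] = 0;
}

/* Mirrors the switch in pageset_cpuup_callback() above */
static int callback(unsigned long action, int cpu)
{
        switch (action) {
        case CPU_UP_PREPARE:
                return mock_process_zones(cpu) ? NOTIFY_BAD : NOTIFY_OK;
        case CPU_UP_CANCELED:
        case CPU_DEAD:
                mock_free_zone_pagesets(cpu);
                return NOTIFY_OK;
        default:
                return NOTIFY_OK;
        }
}

int main(void)
{
        callback(CPU_UP_PREPARE, 1);            /* cpu 1 comes up: allocate */
        printf("cpu1 allocated=%d\n", allocated[1]);
        callback(CPU_DEAD, 1);                  /* cpu 1 goes away: free */
        printf("cpu1 allocated=%d\n", allocated[1]);
        fail_alloc = 1;                         /* allocation failure cancels the bring-up */
        printf("up-prepare -> %s\n",
               callback(CPU_UP_PREPARE, 2) == NOTIFY_BAD ? "NOTIFY_BAD" : "NOTIFY_OK");
        return 0;
}
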
2174 2174
2175 #endif 2175 #endif
2176 2176
2177 static __meminit 2177 static __meminit
2178 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 2178 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
2179 { 2179 {
2180 int i; 2180 int i;
2181 struct pglist_data *pgdat = zone->zone_pgdat; 2181 struct pglist_data *pgdat = zone->zone_pgdat;
2182 size_t alloc_size; 2182 size_t alloc_size;
2183 2183
2184 /* 2184 /*
2185 * The per-page waitqueue mechanism uses hashed waitqueues 2185 * The per-page waitqueue mechanism uses hashed waitqueues
2186 * per zone. 2186 * per zone.
2187 */ 2187 */
2188 zone->wait_table_hash_nr_entries = 2188 zone->wait_table_hash_nr_entries =
2189 wait_table_hash_nr_entries(zone_size_pages); 2189 wait_table_hash_nr_entries(zone_size_pages);
2190 zone->wait_table_bits = 2190 zone->wait_table_bits =
2191 wait_table_bits(zone->wait_table_hash_nr_entries); 2191 wait_table_bits(zone->wait_table_hash_nr_entries);
2192 alloc_size = zone->wait_table_hash_nr_entries 2192 alloc_size = zone->wait_table_hash_nr_entries
2193 * sizeof(wait_queue_head_t); 2193 * sizeof(wait_queue_head_t);
2194 2194
2195 if (system_state == SYSTEM_BOOTING) { 2195 if (system_state == SYSTEM_BOOTING) {
2196 zone->wait_table = (wait_queue_head_t *) 2196 zone->wait_table = (wait_queue_head_t *)
2197 alloc_bootmem_node(pgdat, alloc_size); 2197 alloc_bootmem_node(pgdat, alloc_size);
2198 } else { 2198 } else {
2199 /* 2199 /*
2200 * This case means that a zone whose size was 0 gets new memory 2200 * This case means that a zone whose size was 0 gets new memory
2201 * via memory hot-add. 2201 * via memory hot-add.
2202 * But it may be the case that a new node was hot-added. In 2202 * But it may be the case that a new node was hot-added. In
2203 * this case vmalloc() will not be able to use this new node's 2203 * this case vmalloc() will not be able to use this new node's
2204 * memory - this wait_table must be initialized to use this new 2204 * memory - this wait_table must be initialized to use this new
2205 * node itself as well. 2205 * node itself as well.
2206 * To use this new node's memory, further consideration will be 2206 * To use this new node's memory, further consideration will be
2207 * necessary. 2207 * necessary.
2208 */ 2208 */
2209 zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size); 2209 zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
2210 } 2210 }
2211 if (!zone->wait_table) 2211 if (!zone->wait_table)
2212 return -ENOMEM; 2212 return -ENOMEM;
2213 2213
2214 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) 2214 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
2215 init_waitqueue_head(zone->wait_table + i); 2215 init_waitqueue_head(zone->wait_table + i);
2216 2216
2217 return 0; 2217 return 0;
2218 } 2218 }
2219 2219
2220 static __meminit void zone_pcp_init(struct zone *zone) 2220 static __meminit void zone_pcp_init(struct zone *zone)
2221 { 2221 {
2222 int cpu; 2222 int cpu;
2223 unsigned long batch = zone_batchsize(zone); 2223 unsigned long batch = zone_batchsize(zone);
2224 2224
2225 for (cpu = 0; cpu < NR_CPUS; cpu++) { 2225 for (cpu = 0; cpu < NR_CPUS; cpu++) {
2226 #ifdef CONFIG_NUMA 2226 #ifdef CONFIG_NUMA
2227 /* Early boot. Slab allocator not functional yet */ 2227 /* Early boot. Slab allocator not functional yet */
2228 zone_pcp(zone, cpu) = &boot_pageset[cpu]; 2228 zone_pcp(zone, cpu) = &boot_pageset[cpu];
2229 setup_pageset(&boot_pageset[cpu],0); 2229 setup_pageset(&boot_pageset[cpu],0);
2230 #else 2230 #else
2231 setup_pageset(zone_pcp(zone,cpu), batch); 2231 setup_pageset(zone_pcp(zone,cpu), batch);
2232 #endif 2232 #endif
2233 } 2233 }
2234 if (zone->present_pages) 2234 if (zone->present_pages)
2235 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 2235 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
2236 zone->name, zone->present_pages, batch); 2236 zone->name, zone->present_pages, batch);
2237 } 2237 }
2238 2238
2239 __meminit int init_currently_empty_zone(struct zone *zone, 2239 __meminit int init_currently_empty_zone(struct zone *zone,
2240 unsigned long zone_start_pfn, 2240 unsigned long zone_start_pfn,
2241 unsigned long size, 2241 unsigned long size,
2242 enum memmap_context context) 2242 enum memmap_context context)
2243 { 2243 {
2244 struct pglist_data *pgdat = zone->zone_pgdat; 2244 struct pglist_data *pgdat = zone->zone_pgdat;
2245 int ret; 2245 int ret;
2246 ret = zone_wait_table_init(zone, size); 2246 ret = zone_wait_table_init(zone, size);
2247 if (ret) 2247 if (ret)
2248 return ret; 2248 return ret;
2249 pgdat->nr_zones = zone_idx(zone) + 1; 2249 pgdat->nr_zones = zone_idx(zone) + 1;
2250 2250
2251 zone->zone_start_pfn = zone_start_pfn; 2251 zone->zone_start_pfn = zone_start_pfn;
2252 2252
2253 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); 2253 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
2254 2254
2255 zone_init_free_lists(pgdat, zone, zone->spanned_pages); 2255 zone_init_free_lists(pgdat, zone, zone->spanned_pages);
2256 2256
2257 return 0; 2257 return 0;
2258 } 2258 }
2259 2259
2260 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 2260 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2261 /* 2261 /*
2262 * Basic iterator support. Return the first range of PFNs for a node 2262 * Basic iterator support. Return the first range of PFNs for a node
2263 * Note: nid == MAX_NUMNODES returns first region regardless of node 2263 * Note: nid == MAX_NUMNODES returns first region regardless of node
2264 */ 2264 */
2265 static int __init first_active_region_index_in_nid(int nid) 2265 static int __init first_active_region_index_in_nid(int nid)
2266 { 2266 {
2267 int i; 2267 int i;
2268 2268
2269 for (i = 0; i < nr_nodemap_entries; i++) 2269 for (i = 0; i < nr_nodemap_entries; i++)
2270 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) 2270 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
2271 return i; 2271 return i;
2272 2272
2273 return -1; 2273 return -1;
2274 } 2274 }
2275 2275
2276 /* 2276 /*
2277 * Basic iterator support. Return the next active range of PFNs for a node 2277 * Basic iterator support. Return the next active range of PFNs for a node
2278 * Note: nid == MAX_NUMNODES returns next region regardless of node 2278 * Note: nid == MAX_NUMNODES returns next region regardless of node
2279 */ 2279 */
2280 static int __init next_active_region_index_in_nid(int index, int nid) 2280 static int __init next_active_region_index_in_nid(int index, int nid)
2281 { 2281 {
2282 for (index = index + 1; index < nr_nodemap_entries; index++) 2282 for (index = index + 1; index < nr_nodemap_entries; index++)
2283 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) 2283 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
2284 return index; 2284 return index;
2285 2285
2286 return -1; 2286 return -1;
2287 } 2287 }
2288 2288
2289 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID 2289 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
2290 /* 2290 /*
2291 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 2291 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
2292 * Architectures may implement their own version but if add_active_range() 2292 * Architectures may implement their own version but if add_active_range()
2293 * was used and there are no special requirements, this is a convenient 2293 * was used and there are no special requirements, this is a convenient
2294 * alternative 2294 * alternative
2295 */ 2295 */
2296 int __init early_pfn_to_nid(unsigned long pfn) 2296 int __init early_pfn_to_nid(unsigned long pfn)
2297 { 2297 {
2298 int i; 2298 int i;
2299 2299
2300 for (i = 0; i < nr_nodemap_entries; i++) { 2300 for (i = 0; i < nr_nodemap_entries; i++) {
2301 unsigned long start_pfn = early_node_map[i].start_pfn; 2301 unsigned long start_pfn = early_node_map[i].start_pfn;
2302 unsigned long end_pfn = early_node_map[i].end_pfn; 2302 unsigned long end_pfn = early_node_map[i].end_pfn;
2303 2303
2304 if (start_pfn <= pfn && pfn < end_pfn) 2304 if (start_pfn <= pfn && pfn < end_pfn)
2305 return early_node_map[i].nid; 2305 return early_node_map[i].nid;
2306 } 2306 }
2307 2307
2308 return 0; 2308 return 0;
2309 } 2309 }
2310 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ 2310 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
2311 2311
2312 /* Basic iterator support to walk early_node_map[] */ 2312 /* Basic iterator support to walk early_node_map[] */
2313 #define for_each_active_range_index_in_nid(i, nid) \ 2313 #define for_each_active_range_index_in_nid(i, nid) \
2314 for (i = first_active_region_index_in_nid(nid); i != -1; \ 2314 for (i = first_active_region_index_in_nid(nid); i != -1; \
2315 i = next_active_region_index_in_nid(i, nid)) 2315 i = next_active_region_index_in_nid(i, nid))
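
To see how the two lookups and the iterator macro cooperate, here is a self-contained userspace mock. The early_node_map contents, the MAX_NUMNODES value and the simplified struct are invented for the example; the iteration logic itself mirrors the code above.

#include <stdio.h>

#define MAX_NUMNODES 4          /* stand-in for the kernel constant */

struct node_active_region {
        unsigned long start_pfn;
        unsigned long end_pfn;
        int nid;
};

/* Hypothetical, already sorted map: node 0 has two ranges, node 1 has one */
static struct node_active_region early_node_map[] = {
        { .start_pfn = 0x000, .end_pfn = 0x100, .nid = 0 },
        { .start_pfn = 0x200, .end_pfn = 0x280, .nid = 0 },
        { .start_pfn = 0x300, .end_pfn = 0x400, .nid = 1 },
};
static int nr_nodemap_entries = 3;

static int first_active_region_index_in_nid(int nid)
{
        int i;
        for (i = 0; i < nr_nodemap_entries; i++)
                if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
                        return i;
        return -1;
}

static int next_active_region_index_in_nid(int index, int nid)
{
        for (index = index + 1; index < nr_nodemap_entries; index++)
                if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
                        return index;
        return -1;
}

#define for_each_active_range_index_in_nid(i, nid) \
        for (i = first_active_region_index_in_nid(nid); i != -1; \
                i = next_active_region_index_in_nid(i, nid))

int main(void)
{
        int i;

        /* Walk only node 0: prints its two ranges */
        for_each_active_range_index_in_nid(i, 0)
                printf("nid 0: [%#lx, %#lx)\n",
                       early_node_map[i].start_pfn, early_node_map[i].end_pfn);

        /* nid == MAX_NUMNODES walks every registered range */
        for_each_active_range_index_in_nid(i, MAX_NUMNODES)
                printf("all:   [%#lx, %#lx)\n",
                       early_node_map[i].start_pfn, early_node_map[i].end_pfn);
        return 0;
}
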
2316 2316
2317 /** 2317 /**
2318 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range 2318 * free_bootmem_with_active_regions - Call free_bootmem_node for each active range
2319 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. 2319 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
2320 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node 2320 * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
2321 * 2321 *
2322 * If an architecture guarantees that all ranges registered with 2322 * If an architecture guarantees that all ranges registered with
2323 * add_active_ranges() contain no holes and may be freed, this 2323 * add_active_ranges() contain no holes and may be freed, this
2324 * function may be used instead of calling free_bootmem() manually. 2324 * function may be used instead of calling free_bootmem() manually.
2325 */ 2325 */
2326 void __init free_bootmem_with_active_regions(int nid, 2326 void __init free_bootmem_with_active_regions(int nid,
2327 unsigned long max_low_pfn) 2327 unsigned long max_low_pfn)
2328 { 2328 {
2329 int i; 2329 int i;
2330 2330
2331 for_each_active_range_index_in_nid(i, nid) { 2331 for_each_active_range_index_in_nid(i, nid) {
2332 unsigned long size_pages = 0; 2332 unsigned long size_pages = 0;
2333 unsigned long end_pfn = early_node_map[i].end_pfn; 2333 unsigned long end_pfn = early_node_map[i].end_pfn;
2334 2334
2335 if (early_node_map[i].start_pfn >= max_low_pfn) 2335 if (early_node_map[i].start_pfn >= max_low_pfn)
2336 continue; 2336 continue;
2337 2337
2338 if (end_pfn > max_low_pfn) 2338 if (end_pfn > max_low_pfn)
2339 end_pfn = max_low_pfn; 2339 end_pfn = max_low_pfn;
2340 2340
2341 size_pages = end_pfn - early_node_map[i].start_pfn; 2341 size_pages = end_pfn - early_node_map[i].start_pfn;
2342 free_bootmem_node(NODE_DATA(early_node_map[i].nid), 2342 free_bootmem_node(NODE_DATA(early_node_map[i].nid),
2343 PFN_PHYS(early_node_map[i].start_pfn), 2343 PFN_PHYS(early_node_map[i].start_pfn),
2344 size_pages << PAGE_SHIFT); 2344 size_pages << PAGE_SHIFT);
2345 } 2345 }
2346 } 2346 }
2347 2347
2348 /** 2348 /**
2349 * sparse_memory_present_with_active_regions - Call memory_present for each active range 2349 * sparse_memory_present_with_active_regions - Call memory_present for each active range
2350 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. 2350 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
2351 * 2351 *
2352 * If an architecture guarantees that all ranges registered with 2352 * If an architecture guarantees that all ranges registered with
2353 * add_active_ranges() contain no holes and may be freed, this 2353 * add_active_ranges() contain no holes and may be freed, this
2354 * function may be used instead of calling memory_present() manually. 2354 * function may be used instead of calling memory_present() manually.
2355 */ 2355 */
2356 void __init sparse_memory_present_with_active_regions(int nid) 2356 void __init sparse_memory_present_with_active_regions(int nid)
2357 { 2357 {
2358 int i; 2358 int i;
2359 2359
2360 for_each_active_range_index_in_nid(i, nid) 2360 for_each_active_range_index_in_nid(i, nid)
2361 memory_present(early_node_map[i].nid, 2361 memory_present(early_node_map[i].nid,
2362 early_node_map[i].start_pfn, 2362 early_node_map[i].start_pfn,
2363 early_node_map[i].end_pfn); 2363 early_node_map[i].end_pfn);
2364 } 2364 }
2365 2365
2366 /** 2366 /**
2367 * push_node_boundaries - Push node boundaries to at least the requested boundary 2367 * push_node_boundaries - Push node boundaries to at least the requested boundary
2368 * @nid: The nid of the node to push the boundary for 2368 * @nid: The nid of the node to push the boundary for
2369 * @start_pfn: The start pfn of the node 2369 * @start_pfn: The start pfn of the node
2370 * @end_pfn: The end pfn of the node 2370 * @end_pfn: The end pfn of the node
2371 * 2371 *
2372 * In reserve-based hot-add, a mem_map is allocated at boot that stays unused until 2372 * In reserve-based hot-add, a mem_map is allocated at boot that stays unused until
2373 * hotadd time. Specifically, on x86_64, SRAT will report ranges that can potentially 2373 * hotadd time. Specifically, on x86_64, SRAT will report ranges that can potentially
2374 * be hotplugged even though no physical memory exists there yet. This function allows 2374 * be hotplugged even though no physical memory exists there yet. This function allows
2375 * an arch to push out the node boundaries so that a mem_map covering those ranges is 2375 * an arch to push out the node boundaries so that a mem_map covering those ranges is
2376 * allocated and can be used later. 2376 * allocated and can be used later.
2377 */ 2377 */
2378 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 2378 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2379 void __init push_node_boundaries(unsigned int nid, 2379 void __init push_node_boundaries(unsigned int nid,
2380 unsigned long start_pfn, unsigned long end_pfn) 2380 unsigned long start_pfn, unsigned long end_pfn)
2381 { 2381 {
2382 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", 2382 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n",
2383 nid, start_pfn, end_pfn); 2383 nid, start_pfn, end_pfn);
2384 2384
2385 /* Initialise the boundary for this node if necessary */ 2385 /* Initialise the boundary for this node if necessary */
2386 if (node_boundary_end_pfn[nid] == 0) 2386 if (node_boundary_end_pfn[nid] == 0)
2387 node_boundary_start_pfn[nid] = -1UL; 2387 node_boundary_start_pfn[nid] = -1UL;
2388 2388
2389 /* Update the boundaries */ 2389 /* Update the boundaries */
2390 if (node_boundary_start_pfn[nid] > start_pfn) 2390 if (node_boundary_start_pfn[nid] > start_pfn)
2391 node_boundary_start_pfn[nid] = start_pfn; 2391 node_boundary_start_pfn[nid] = start_pfn;
2392 if (node_boundary_end_pfn[nid] < end_pfn) 2392 if (node_boundary_end_pfn[nid] < end_pfn)
2393 node_boundary_end_pfn[nid] = end_pfn; 2393 node_boundary_end_pfn[nid] = end_pfn;
2394 } 2394 }
2395 2395
2396 /* If necessary, push the node boundary out for reserve hotadd */ 2396 /* If necessary, push the node boundary out for reserve hotadd */
2397 static void __init account_node_boundary(unsigned int nid, 2397 static void __init account_node_boundary(unsigned int nid,
2398 unsigned long *start_pfn, unsigned long *end_pfn) 2398 unsigned long *start_pfn, unsigned long *end_pfn)
2399 { 2399 {
2400 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", 2400 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
2401 nid, *start_pfn, *end_pfn); 2401 nid, *start_pfn, *end_pfn);
2402 2402
2403 /* Return if boundary information has not been provided */ 2403 /* Return if boundary information has not been provided */
2404 if (node_boundary_end_pfn[nid] == 0) 2404 if (node_boundary_end_pfn[nid] == 0)
2405 return; 2405 return;
2406 2406
2407 /* Check the boundaries and update if necessary */ 2407 /* Check the boundaries and update if necessary */
2408 if (node_boundary_start_pfn[nid] < *start_pfn) 2408 if (node_boundary_start_pfn[nid] < *start_pfn)
2409 *start_pfn = node_boundary_start_pfn[nid]; 2409 *start_pfn = node_boundary_start_pfn[nid];
2410 if (node_boundary_end_pfn[nid] > *end_pfn) 2410 if (node_boundary_end_pfn[nid] > *end_pfn)
2411 *end_pfn = node_boundary_end_pfn[nid]; 2411 *end_pfn = node_boundary_end_pfn[nid];
2412 } 2412 }
2413 #else 2413 #else
2414 void __init push_node_boundaries(unsigned int nid, 2414 void __init push_node_boundaries(unsigned int nid,
2415 unsigned long start_pfn, unsigned long end_pfn) {} 2415 unsigned long start_pfn, unsigned long end_pfn) {}
2416 2416
2417 static void __init account_node_boundary(unsigned int nid, 2417 static void __init account_node_boundary(unsigned int nid,
2418 unsigned long *start_pfn, unsigned long *end_pfn) {} 2418 unsigned long *start_pfn, unsigned long *end_pfn) {}
2419 #endif 2419 #endif
2420 2420
2421 2421
2422 /** 2422 /**
2423 * get_pfn_range_for_nid - Return the start and end page frames for a node 2423 * get_pfn_range_for_nid - Return the start and end page frames for a node
2424 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. 2424 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
2425 * @start_pfn: Passed by reference. On return, it will have the node start_pfn. 2425 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
2426 * @end_pfn: Passed by reference. On return, it will have the node end_pfn. 2426 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
2427 * 2427 *
2428 * It returns the start and end page frame of a node based on information 2428 * It returns the start and end page frame of a node based on information
2429 * provided by an arch calling add_active_range(). If called for a node 2429 * provided by an arch calling add_active_range(). If called for a node
2430 * with no available memory, a warning is printed and the start and end 2430 * with no available memory, a warning is printed and the start and end
2431 * PFNs will be 0. 2431 * PFNs will be 0.
2432 */ 2432 */
2433 void __init get_pfn_range_for_nid(unsigned int nid, 2433 void __init get_pfn_range_for_nid(unsigned int nid,
2434 unsigned long *start_pfn, unsigned long *end_pfn) 2434 unsigned long *start_pfn, unsigned long *end_pfn)
2435 { 2435 {
2436 int i; 2436 int i;
2437 *start_pfn = -1UL; 2437 *start_pfn = -1UL;
2438 *end_pfn = 0; 2438 *end_pfn = 0;
2439 2439
2440 for_each_active_range_index_in_nid(i, nid) { 2440 for_each_active_range_index_in_nid(i, nid) {
2441 *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); 2441 *start_pfn = min(*start_pfn, early_node_map[i].start_pfn);
2442 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); 2442 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
2443 } 2443 }
2444 2444
2445 if (*start_pfn == -1UL) { 2445 if (*start_pfn == -1UL) {
2446 printk(KERN_WARNING "Node %u active with no memory\n", nid); 2446 printk(KERN_WARNING "Node %u active with no memory\n", nid);
2447 *start_pfn = 0; 2447 *start_pfn = 0;
2448 } 2448 }
2449 2449
2450 /* Push the node boundaries out if requested */ 2450 /* Push the node boundaries out if requested */
2451 account_node_boundary(nid, start_pfn, end_pfn); 2451 account_node_boundary(nid, start_pfn, end_pfn);
2452 } 2452 }
2453 2453
2454 /* 2454 /*
2455 * Return the number of pages a zone spans in a node, including holes 2455 * Return the number of pages a zone spans in a node, including holes
2456 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() 2456 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
2457 */ 2457 */
2458 unsigned long __init zone_spanned_pages_in_node(int nid, 2458 unsigned long __init zone_spanned_pages_in_node(int nid,
2459 unsigned long zone_type, 2459 unsigned long zone_type,
2460 unsigned long *ignored) 2460 unsigned long *ignored)
2461 { 2461 {
2462 unsigned long node_start_pfn, node_end_pfn; 2462 unsigned long node_start_pfn, node_end_pfn;
2463 unsigned long zone_start_pfn, zone_end_pfn; 2463 unsigned long zone_start_pfn, zone_end_pfn;
2464 2464
2465 /* Get the start and end of the node and zone */ 2465 /* Get the start and end of the node and zone */
2466 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 2466 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
2467 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; 2467 zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
2468 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; 2468 zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
2469 2469
2470 /* Check that this node has pages within the zone's required range */ 2470 /* Check that this node has pages within the zone's required range */
2471 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) 2471 if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn)
2472 return 0; 2472 return 0;
2473 2473
2474 /* Move the zone boundaries inside the node if necessary */ 2474 /* Move the zone boundaries inside the node if necessary */
2475 zone_end_pfn = min(zone_end_pfn, node_end_pfn); 2475 zone_end_pfn = min(zone_end_pfn, node_end_pfn);
2476 zone_start_pfn = max(zone_start_pfn, node_start_pfn); 2476 zone_start_pfn = max(zone_start_pfn, node_start_pfn);
2477 2477
2478 /* Return the spanned pages */ 2478 /* Return the spanned pages */
2479 return zone_end_pfn - zone_start_pfn; 2479 return zone_end_pfn - zone_start_pfn;
2480 } 2480 }
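
The clamping above is ordinary interval intersection. A small sketch with invented PFN ranges (and open-coded min/max helpers) shows both the overlapping and the non-overlapping case.

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }

/* Same logic as zone_spanned_pages_in_node(), with the node and zone
 * limits passed in directly instead of being looked up. */
static unsigned long spanned(unsigned long node_start, unsigned long node_end,
                             unsigned long zone_start, unsigned long zone_end)
{
        if (zone_end < node_start || zone_start > node_end)
                return 0;                       /* zone and node do not overlap */
        zone_end = min_ul(zone_end, node_end);  /* pull the zone inside the node */
        zone_start = max_ul(zone_start, node_start);
        return zone_end - zone_start;
}

int main(void)
{
        /* Hypothetical node spanning PFNs [1024, 262144) */
        printf("%lu\n", spanned(1024, 262144, 0, 4096));          /* low zone: 3072 */
        printf("%lu\n", spanned(1024, 262144, 524288, 1048576));  /* disjoint zone: 0 */
        return 0;
}
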
2481 2481
2482 /* 2482 /*
2483 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 2483 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
2484 * then all holes in the requested range will be accounted for. 2484 * then all holes in the requested range will be accounted for.
2485 */ 2485 */
2486 unsigned long __init __absent_pages_in_range(int nid, 2486 unsigned long __init __absent_pages_in_range(int nid,
2487 unsigned long range_start_pfn, 2487 unsigned long range_start_pfn,
2488 unsigned long range_end_pfn) 2488 unsigned long range_end_pfn)
2489 { 2489 {
2490 int i = 0; 2490 int i = 0;
2491 unsigned long prev_end_pfn = 0, hole_pages = 0; 2491 unsigned long prev_end_pfn = 0, hole_pages = 0;
2492 unsigned long start_pfn; 2492 unsigned long start_pfn;
2493 2493
2494 /* Find the end_pfn of the first active range of pfns in the node */ 2494 /* Find the end_pfn of the first active range of pfns in the node */
2495 i = first_active_region_index_in_nid(nid); 2495 i = first_active_region_index_in_nid(nid);
2496 if (i == -1) 2496 if (i == -1)
2497 return 0; 2497 return 0;
2498 2498
2499 /* Account for ranges before physical memory on this node */ 2499 /* Account for ranges before physical memory on this node */
2500 if (early_node_map[i].start_pfn > range_start_pfn) 2500 if (early_node_map[i].start_pfn > range_start_pfn)
2501 hole_pages = early_node_map[i].start_pfn - range_start_pfn; 2501 hole_pages = early_node_map[i].start_pfn - range_start_pfn;
2502 2502
2503 prev_end_pfn = early_node_map[i].start_pfn; 2503 prev_end_pfn = early_node_map[i].start_pfn;
2504 2504
2505 /* Find all holes for the zone within the node */ 2505 /* Find all holes for the zone within the node */
2506 for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { 2506 for (; i != -1; i = next_active_region_index_in_nid(i, nid)) {
2507 2507
2508 /* No need to continue if prev_end_pfn is outside the zone */ 2508 /* No need to continue if prev_end_pfn is outside the zone */
2509 if (prev_end_pfn >= range_end_pfn) 2509 if (prev_end_pfn >= range_end_pfn)
2510 break; 2510 break;
2511 2511
2512 /* Make sure the end of the zone is not within the hole */ 2512 /* Make sure the end of the zone is not within the hole */
2513 start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); 2513 start_pfn = min(early_node_map[i].start_pfn, range_end_pfn);
2514 prev_end_pfn = max(prev_end_pfn, range_start_pfn); 2514 prev_end_pfn = max(prev_end_pfn, range_start_pfn);
2515 2515
2516 /* Update the hole size count and move on */ 2516 /* Update the hole size count and move on */
2517 if (start_pfn > range_start_pfn) { 2517 if (start_pfn > range_start_pfn) {
2518 BUG_ON(prev_end_pfn > start_pfn); 2518 BUG_ON(prev_end_pfn > start_pfn);
2519 hole_pages += start_pfn - prev_end_pfn; 2519 hole_pages += start_pfn - prev_end_pfn;
2520 } 2520 }
2521 prev_end_pfn = early_node_map[i].end_pfn; 2521 prev_end_pfn = early_node_map[i].end_pfn;
2522 } 2522 }
2523 2523
2524 /* Account for ranges past physical memory on this node */ 2524 /* Account for ranges past physical memory on this node */
2525 if (range_end_pfn > prev_end_pfn) 2525 if (range_end_pfn > prev_end_pfn)
2526 hole_pages += range_end_pfn - 2526 hole_pages += range_end_pfn -
2527 max(range_start_pfn, prev_end_pfn); 2527 max(range_start_pfn, prev_end_pfn);
2528 2528
2529 return hole_pages; 2529 return hole_pages;
2530 } 2530 }
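
Rather than re-walking the hole-by-hole loop above, the sketch below computes the same quantity the simple way: the length of the query range minus the pages covered by each active range. The ranges are invented and assumed sorted and non-overlapping, as early_node_map[] is.

#include <stdio.h>

struct range { unsigned long start, end; };

/* Simplified equivalent of __absent_pages_in_range(), not the kernel's loop */
static unsigned long absent(struct range *active, int n,
                            unsigned long start, unsigned long end)
{
        unsigned long present = 0;
        int i;

        for (i = 0; i < n; i++) {
                /* clip this active range to the query window */
                unsigned long s = active[i].start > start ? active[i].start : start;
                unsigned long e = active[i].end < end ? active[i].end : end;
                if (s < e)
                        present += e - s;
        }
        return (end - start) - present;
}

int main(void)
{
        /* Node memory at [1000, 2000) and [3000, 3500) */
        struct range active[] = { { 1000, 2000 }, { 3000, 3500 } };

        /* Holes in [0, 4000): 1000 before, 1000 between, 500 after = 2500 */
        printf("%lu\n", absent(active, 2, 0, 4000));
        return 0;
}
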
2531 2531
2532 /** 2532 /**
2533 * absent_pages_in_range - Return number of page frames in holes within a range 2533 * absent_pages_in_range - Return number of page frames in holes within a range
2534 * @start_pfn: The start PFN to start searching for holes 2534 * @start_pfn: The start PFN to start searching for holes
2535 * @end_pfn: The end PFN to stop searching for holes 2535 * @end_pfn: The end PFN to stop searching for holes
2536 * 2536 *
2537 * It returns the number of page frames in memory holes within a range. 2537 * It returns the number of page frames in memory holes within a range.
2538 */ 2538 */
2539 unsigned long __init absent_pages_in_range(unsigned long start_pfn, 2539 unsigned long __init absent_pages_in_range(unsigned long start_pfn,
2540 unsigned long end_pfn) 2540 unsigned long end_pfn)
2541 { 2541 {
2542 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn); 2542 return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
2543 } 2543 }
2544 2544
2545 /* Return the number of page frames in holes in a zone on a node */ 2545 /* Return the number of page frames in holes in a zone on a node */
2546 unsigned long __init zone_absent_pages_in_node(int nid, 2546 unsigned long __init zone_absent_pages_in_node(int nid,
2547 unsigned long zone_type, 2547 unsigned long zone_type,
2548 unsigned long *ignored) 2548 unsigned long *ignored)
2549 { 2549 {
2550 unsigned long node_start_pfn, node_end_pfn; 2550 unsigned long node_start_pfn, node_end_pfn;
2551 unsigned long zone_start_pfn, zone_end_pfn; 2551 unsigned long zone_start_pfn, zone_end_pfn;
2552 2552
2553 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); 2553 get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
2554 zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], 2554 zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
2555 node_start_pfn); 2555 node_start_pfn);
2556 zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], 2556 zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
2557 node_end_pfn); 2557 node_end_pfn);
2558 2558
2559 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); 2559 return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
2560 } 2560 }
2561 2561
2562 #else 2562 #else
2563 static inline unsigned long zone_spanned_pages_in_node(int nid, 2563 static inline unsigned long zone_spanned_pages_in_node(int nid,
2564 unsigned long zone_type, 2564 unsigned long zone_type,
2565 unsigned long *zones_size) 2565 unsigned long *zones_size)
2566 { 2566 {
2567 return zones_size[zone_type]; 2567 return zones_size[zone_type];
2568 } 2568 }
2569 2569
2570 static inline unsigned long zone_absent_pages_in_node(int nid, 2570 static inline unsigned long zone_absent_pages_in_node(int nid,
2571 unsigned long zone_type, 2571 unsigned long zone_type,
2572 unsigned long *zholes_size) 2572 unsigned long *zholes_size)
2573 { 2573 {
2574 if (!zholes_size) 2574 if (!zholes_size)
2575 return 0; 2575 return 0;
2576 2576
2577 return zholes_size[zone_type]; 2577 return zholes_size[zone_type];
2578 } 2578 }
2579 2579
2580 #endif 2580 #endif
2581 2581
2582 static void __init calculate_node_totalpages(struct pglist_data *pgdat, 2582 static void __init calculate_node_totalpages(struct pglist_data *pgdat,
2583 unsigned long *zones_size, unsigned long *zholes_size) 2583 unsigned long *zones_size, unsigned long *zholes_size)
2584 { 2584 {
2585 unsigned long realtotalpages, totalpages = 0; 2585 unsigned long realtotalpages, totalpages = 0;
2586 enum zone_type i; 2586 enum zone_type i;
2587 2587
2588 for (i = 0; i < MAX_NR_ZONES; i++) 2588 for (i = 0; i < MAX_NR_ZONES; i++)
2589 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, 2589 totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
2590 zones_size); 2590 zones_size);
2591 pgdat->node_spanned_pages = totalpages; 2591 pgdat->node_spanned_pages = totalpages;
2592 2592
2593 realtotalpages = totalpages; 2593 realtotalpages = totalpages;
2594 for (i = 0; i < MAX_NR_ZONES; i++) 2594 for (i = 0; i < MAX_NR_ZONES; i++)
2595 realtotalpages -= 2595 realtotalpages -=
2596 zone_absent_pages_in_node(pgdat->node_id, i, 2596 zone_absent_pages_in_node(pgdat->node_id, i,
2597 zholes_size); 2597 zholes_size);
2598 pgdat->node_present_pages = realtotalpages; 2598 pgdat->node_present_pages = realtotalpages;
2599 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, 2599 printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
2600 realtotalpages); 2600 realtotalpages);
2601 } 2601 }
2602 2602
2603 /* 2603 /*
2604 * Set up the zone data structures: 2604 * Set up the zone data structures:
2605 * - mark all pages reserved 2605 * - mark all pages reserved
2606 * - mark all memory queues empty 2606 * - mark all memory queues empty
2607 * - clear the memory bitmaps 2607 * - clear the memory bitmaps
2608 */ 2608 */
2609 static void __meminit free_area_init_core(struct pglist_data *pgdat, 2609 static void __meminit free_area_init_core(struct pglist_data *pgdat,
2610 unsigned long *zones_size, unsigned long *zholes_size) 2610 unsigned long *zones_size, unsigned long *zholes_size)
2611 { 2611 {
2612 enum zone_type j; 2612 enum zone_type j;
2613 int nid = pgdat->node_id; 2613 int nid = pgdat->node_id;
2614 unsigned long zone_start_pfn = pgdat->node_start_pfn; 2614 unsigned long zone_start_pfn = pgdat->node_start_pfn;
2615 int ret; 2615 int ret;
2616 2616
2617 pgdat_resize_init(pgdat); 2617 pgdat_resize_init(pgdat);
2618 pgdat->nr_zones = 0; 2618 pgdat->nr_zones = 0;
2619 init_waitqueue_head(&pgdat->kswapd_wait); 2619 init_waitqueue_head(&pgdat->kswapd_wait);
2620 pgdat->kswapd_max_order = 0; 2620 pgdat->kswapd_max_order = 0;
2621 2621
2622 for (j = 0; j < MAX_NR_ZONES; j++) { 2622 for (j = 0; j < MAX_NR_ZONES; j++) {
2623 struct zone *zone = pgdat->node_zones + j; 2623 struct zone *zone = pgdat->node_zones + j;
2624 unsigned long size, realsize, memmap_pages; 2624 unsigned long size, realsize, memmap_pages;
2625 2625
2626 size = zone_spanned_pages_in_node(nid, j, zones_size); 2626 size = zone_spanned_pages_in_node(nid, j, zones_size);
2627 realsize = size - zone_absent_pages_in_node(nid, j, 2627 realsize = size - zone_absent_pages_in_node(nid, j,
2628 zholes_size); 2628 zholes_size);
2629 2629
2630 /* 2630 /*
2631 * Adjust realsize so that it accounts for how much memory 2631 * Adjust realsize so that it accounts for how much memory
2632 * is used by this zone for memmap. This affects the watermark 2632 * is used by this zone for memmap. This affects the watermark
2633 * and per-cpu initialisations 2633 * and per-cpu initialisations
2634 */ 2634 */
2635 memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT; 2635 memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT;
2636 if (realsize >= memmap_pages) { 2636 if (realsize >= memmap_pages) {
2637 realsize -= memmap_pages; 2637 realsize -= memmap_pages;
2638 printk(KERN_DEBUG 2638 printk(KERN_DEBUG
2639 " %s zone: %lu pages used for memmap\n", 2639 " %s zone: %lu pages used for memmap\n",
2640 zone_names[j], memmap_pages); 2640 zone_names[j], memmap_pages);
2641 } else 2641 } else
2642 printk(KERN_WARNING 2642 printk(KERN_WARNING
2643 " %s zone: %lu pages exceeds realsize %lu\n", 2643 " %s zone: %lu pages exceeds realsize %lu\n",
2644 zone_names[j], memmap_pages, realsize); 2644 zone_names[j], memmap_pages, realsize);
2645 2645
2646 /* Account for reserved pages */ 2646 /* Account for reserved pages */
2647 if (j == 0 && realsize > dma_reserve) { 2647 if (j == 0 && realsize > dma_reserve) {
2648 realsize -= dma_reserve; 2648 realsize -= dma_reserve;
2649 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 2649 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
2650 zone_names[0], dma_reserve); 2650 zone_names[0], dma_reserve);
2651 } 2651 }
2652 2652
2653 if (!is_highmem_idx(j)) 2653 if (!is_highmem_idx(j))
2654 nr_kernel_pages += realsize; 2654 nr_kernel_pages += realsize;
2655 nr_all_pages += realsize; 2655 nr_all_pages += realsize;
2656 2656
2657 zone->spanned_pages = size; 2657 zone->spanned_pages = size;
2658 zone->present_pages = realsize; 2658 zone->present_pages = realsize;
2659 #ifdef CONFIG_NUMA 2659 #ifdef CONFIG_NUMA
2660 zone->node = nid; 2660 zone->node = nid;
2661 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 2661 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
2662 / 100; 2662 / 100;
2663 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; 2663 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
2664 #endif 2664 #endif
2665 zone->name = zone_names[j]; 2665 zone->name = zone_names[j];
2666 spin_lock_init(&zone->lock); 2666 spin_lock_init(&zone->lock);
2667 spin_lock_init(&zone->lru_lock); 2667 spin_lock_init(&zone->lru_lock);
2668 zone_seqlock_init(zone); 2668 zone_seqlock_init(zone);
2669 zone->zone_pgdat = pgdat; 2669 zone->zone_pgdat = pgdat;
2670 2670
2671 zone->prev_priority = DEF_PRIORITY; 2671 zone->prev_priority = DEF_PRIORITY;
2672 2672
2673 zone_pcp_init(zone); 2673 zone_pcp_init(zone);
2674 INIT_LIST_HEAD(&zone->active_list); 2674 INIT_LIST_HEAD(&zone->active_list);
2675 INIT_LIST_HEAD(&zone->inactive_list); 2675 INIT_LIST_HEAD(&zone->inactive_list);
2676 zone->nr_scan_active = 0; 2676 zone->nr_scan_active = 0;
2677 zone->nr_scan_inactive = 0; 2677 zone->nr_scan_inactive = 0;
2678 zap_zone_vm_stats(zone); 2678 zap_zone_vm_stats(zone);
2679 atomic_set(&zone->reclaim_in_progress, 0); 2679 atomic_set(&zone->reclaim_in_progress, 0);
2680 if (!size) 2680 if (!size)
2681 continue; 2681 continue;
2682 2682
2683 ret = init_currently_empty_zone(zone, zone_start_pfn, 2683 ret = init_currently_empty_zone(zone, zone_start_pfn,
2684 size, MEMMAP_EARLY); 2684 size, MEMMAP_EARLY);
2685 BUG_ON(ret); 2685 BUG_ON(ret);
2686 zone_start_pfn += size; 2686 zone_start_pfn += size;
2687 } 2687 }
2688 } 2688 }
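
The memmap accounting near the top of the zone loop above is easy to sanity-check by hand. The sketch assumes 4KiB pages and a 64-byte struct page, which are illustrative values only (sizeof(struct page) depends on configuration and architecture).

#include <stdio.h>

#define PAGE_SHIFT 12                   /* assumed: 4KiB pages */

int main(void)
{
        unsigned long size = 262144;    /* hypothetical 1GiB zone, in pages */
        unsigned long struct_page = 64; /* assumed sizeof(struct page) */
        unsigned long memmap_pages = (size * struct_page) >> PAGE_SHIFT;

        /* 4096 pages (16MiB) of the zone are charged to its own memmap, so
         * realsize drops from 262144 to 258048 before the watermark and
         * per-cpu calculations run. */
        printf("memmap_pages=%lu realsize=%lu\n", memmap_pages, size - memmap_pages);
        return 0;
}
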
2689 2689
2690 static void __init alloc_node_mem_map(struct pglist_data *pgdat) 2690 static void __init alloc_node_mem_map(struct pglist_data *pgdat)
2691 { 2691 {
2692 /* Skip empty nodes */ 2692 /* Skip empty nodes */
2693 if (!pgdat->node_spanned_pages) 2693 if (!pgdat->node_spanned_pages)
2694 return; 2694 return;
2695 2695
2696 #ifdef CONFIG_FLAT_NODE_MEM_MAP 2696 #ifdef CONFIG_FLAT_NODE_MEM_MAP
2697 /* ia64 gets its own node_mem_map, before this, without bootmem */ 2697 /* ia64 gets its own node_mem_map, before this, without bootmem */
2698 if (!pgdat->node_mem_map) { 2698 if (!pgdat->node_mem_map) {
2699 unsigned long size, start, end; 2699 unsigned long size, start, end;
2700 struct page *map; 2700 struct page *map;
2701 2701
2702 /* 2702 /*
2703 * The zone's endpoints aren't required to be MAX_ORDER 2703 * The zone's endpoints aren't required to be MAX_ORDER
2704 * aligned but the node_mem_map endpoints must be in order 2704 * aligned but the node_mem_map endpoints must be in order
2705 * for the buddy allocator to function correctly. 2705 * for the buddy allocator to function correctly.
2706 */ 2706 */
2707 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); 2707 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
2708 end = pgdat->node_start_pfn + pgdat->node_spanned_pages; 2708 end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
2709 end = ALIGN(end, MAX_ORDER_NR_PAGES); 2709 end = ALIGN(end, MAX_ORDER_NR_PAGES);
2710 size = (end - start) * sizeof(struct page); 2710 size = (end - start) * sizeof(struct page);
2711 map = alloc_remap(pgdat->node_id, size); 2711 map = alloc_remap(pgdat->node_id, size);
2712 if (!map) 2712 if (!map)
2713 map = alloc_bootmem_node(pgdat, size); 2713 map = alloc_bootmem_node(pgdat, size);
2714 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 2714 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
2715 } 2715 }
2716 #ifdef CONFIG_FLATMEM 2716 #ifdef CONFIG_FLATMEM
2717 /* 2717 /*
2718 * With no DISCONTIG, the global mem_map is just set as node 0's 2718 * With no DISCONTIG, the global mem_map is just set as node 0's
2719 */ 2719 */
2720 if (pgdat == NODE_DATA(0)) { 2720 if (pgdat == NODE_DATA(0)) {
2721 mem_map = NODE_DATA(0)->node_mem_map; 2721 mem_map = NODE_DATA(0)->node_mem_map;
2722 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 2722 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2723 if (page_to_pfn(mem_map) != pgdat->node_start_pfn) 2723 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
2724 mem_map -= pgdat->node_start_pfn; 2724 mem_map -= pgdat->node_start_pfn;
2725 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 2725 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
2726 } 2726 }
2727 #endif 2727 #endif
2728 #endif /* CONFIG_FLAT_NODE_MEM_MAP */ 2728 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
2729 } 2729 }
2730 2730
2731 void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, 2731 void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
2732 unsigned long *zones_size, unsigned long node_start_pfn, 2732 unsigned long *zones_size, unsigned long node_start_pfn,
2733 unsigned long *zholes_size) 2733 unsigned long *zholes_size)
2734 { 2734 {
2735 pgdat->node_id = nid; 2735 pgdat->node_id = nid;
2736 pgdat->node_start_pfn = node_start_pfn; 2736 pgdat->node_start_pfn = node_start_pfn;
2737 calculate_node_totalpages(pgdat, zones_size, zholes_size); 2737 calculate_node_totalpages(pgdat, zones_size, zholes_size);
2738 2738
2739 alloc_node_mem_map(pgdat); 2739 alloc_node_mem_map(pgdat);
2740 2740
2741 free_area_init_core(pgdat, zones_size, zholes_size); 2741 free_area_init_core(pgdat, zones_size, zholes_size);
2742 } 2742 }
2743 2743
2744 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP 2744 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
2745 /** 2745 /**
2746 * add_active_range - Register a range of PFNs backed by physical memory 2746 * add_active_range - Register a range of PFNs backed by physical memory
2747 * @nid: The node ID the range resides on 2747 * @nid: The node ID the range resides on
2748 * @start_pfn: The start PFN of the available physical memory 2748 * @start_pfn: The start PFN of the available physical memory
2749 * @end_pfn: The end PFN of the available physical memory 2749 * @end_pfn: The end PFN of the available physical memory
2750 * 2750 *
2751 * These ranges are stored in an early_node_map[] and later used by 2751 * These ranges are stored in an early_node_map[] and later used by
2752 * free_area_init_nodes() to calculate zone sizes and holes. If the 2752 * free_area_init_nodes() to calculate zone sizes and holes. If the
2753 * range spans a memory hole, it is up to the architecture to ensure 2753 * range spans a memory hole, it is up to the architecture to ensure
2754 * the memory is not freed by the bootmem allocator. If possible 2754 * the memory is not freed by the bootmem allocator. If possible
2755 * the range being registered will be merged with existing ranges. 2755 * the range being registered will be merged with existing ranges.
2756 */ 2756 */
2757 void __init add_active_range(unsigned int nid, unsigned long start_pfn, 2757 void __init add_active_range(unsigned int nid, unsigned long start_pfn,
2758 unsigned long end_pfn) 2758 unsigned long end_pfn)
2759 { 2759 {
2760 int i; 2760 int i;
2761 2761
2762 printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) " 2762 printk(KERN_DEBUG "Entering add_active_range(%d, %lu, %lu) "
2763 "%d entries of %d used\n", 2763 "%d entries of %d used\n",
2764 nid, start_pfn, end_pfn, 2764 nid, start_pfn, end_pfn,
2765 nr_nodemap_entries, MAX_ACTIVE_REGIONS); 2765 nr_nodemap_entries, MAX_ACTIVE_REGIONS);
2766 2766
2767 /* Merge with existing active regions if possible */ 2767 /* Merge with existing active regions if possible */
2768 for (i = 0; i < nr_nodemap_entries; i++) { 2768 for (i = 0; i < nr_nodemap_entries; i++) {
2769 if (early_node_map[i].nid != nid) 2769 if (early_node_map[i].nid != nid)
2770 continue; 2770 continue;
2771 2771
2772 /* Skip if an existing region covers this new one */ 2772 /* Skip if an existing region covers this new one */
2773 if (start_pfn >= early_node_map[i].start_pfn && 2773 if (start_pfn >= early_node_map[i].start_pfn &&
2774 end_pfn <= early_node_map[i].end_pfn) 2774 end_pfn <= early_node_map[i].end_pfn)
2775 return; 2775 return;
2776 2776
2777 /* Merge forward if suitable */ 2777 /* Merge forward if suitable */
2778 if (start_pfn <= early_node_map[i].end_pfn && 2778 if (start_pfn <= early_node_map[i].end_pfn &&
2779 end_pfn > early_node_map[i].end_pfn) { 2779 end_pfn > early_node_map[i].end_pfn) {
2780 early_node_map[i].end_pfn = end_pfn; 2780 early_node_map[i].end_pfn = end_pfn;
2781 return; 2781 return;
2782 } 2782 }
2783 2783
2784 /* Merge backward if suitable */ 2784 /* Merge backward if suitable */
2785 if (start_pfn < early_node_map[i].end_pfn && 2785 if (start_pfn < early_node_map[i].end_pfn &&
2786 end_pfn >= early_node_map[i].start_pfn) { 2786 end_pfn >= early_node_map[i].start_pfn) {
2787 early_node_map[i].start_pfn = start_pfn; 2787 early_node_map[i].start_pfn = start_pfn;
2788 return; 2788 return;
2789 } 2789 }
2790 } 2790 }
2791 2791
2792 /* Check that early_node_map is large enough */ 2792 /* Check that early_node_map is large enough */
2793 if (i >= MAX_ACTIVE_REGIONS) { 2793 if (i >= MAX_ACTIVE_REGIONS) {
2794 printk(KERN_CRIT "More than %d memory regions, truncating\n", 2794 printk(KERN_CRIT "More than %d memory regions, truncating\n",
2795 MAX_ACTIVE_REGIONS); 2795 MAX_ACTIVE_REGIONS);
2796 return; 2796 return;
2797 } 2797 }
2798 2798
2799 early_node_map[i].nid = nid; 2799 early_node_map[i].nid = nid;
2800 early_node_map[i].start_pfn = start_pfn; 2800 early_node_map[i].start_pfn = start_pfn;
2801 early_node_map[i].end_pfn = end_pfn; 2801 early_node_map[i].end_pfn = end_pfn;
2802 nr_nodemap_entries = i + 1; 2802 nr_nodemap_entries = i + 1;
2803 } 2803 }
2804 2804
2805 /** 2805 /**
2806 * shrink_active_range - Shrink an existing registered range of PFNs 2806 * shrink_active_range - Shrink an existing registered range of PFNs
2807 * @nid: The node id the range is on that should be shrunk 2807 * @nid: The node id the range is on that should be shrunk
2808 * @old_end_pfn: The old end PFN of the range 2808 * @old_end_pfn: The old end PFN of the range
2809 * @new_end_pfn: The new end PFN of the range 2809 * @new_end_pfn: The new end PFN of the range
2810 * 2810 *
2811 * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node. 2811 * i386 with NUMA uses alloc_remap() to store a node_mem_map on a local node.
2812 * The map is kept at the end of the physical page range that has already been 2812 * The map is kept at the end of the physical page range that has already been
2813 * registered with add_active_range(). This function allows an arch to shrink 2813 * registered with add_active_range(). This function allows an arch to shrink
2814 * an existing registered range. 2814 * an existing registered range.
2815 */ 2815 */
2816 void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn, 2816 void __init shrink_active_range(unsigned int nid, unsigned long old_end_pfn,
2817 unsigned long new_end_pfn) 2817 unsigned long new_end_pfn)
2818 { 2818 {
2819 int i; 2819 int i;
2820 2820
2821 /* Find the old active region end and shrink */ 2821 /* Find the old active region end and shrink */
2822 for_each_active_range_index_in_nid(i, nid) 2822 for_each_active_range_index_in_nid(i, nid)
2823 if (early_node_map[i].end_pfn == old_end_pfn) { 2823 if (early_node_map[i].end_pfn == old_end_pfn) {
2824 early_node_map[i].end_pfn = new_end_pfn; 2824 early_node_map[i].end_pfn = new_end_pfn;
2825 break; 2825 break;
2826 } 2826 }
2827 } 2827 }
2828 2828
2829 /** 2829 /**
2830 * remove_all_active_ranges - Remove all currently registered regions 2830 * remove_all_active_ranges - Remove all currently registered regions
2831 * 2831 *
2832 * During discovery, it may be found that a table like SRAT is invalid 2832 * During discovery, it may be found that a table like SRAT is invalid
2833 * and an alternative discovery method must be used. This function removes 2833 * and an alternative discovery method must be used. This function removes
2834 * all currently registered regions. 2834 * all currently registered regions.
2835 */ 2835 */
2836 void __init remove_all_active_ranges(void) 2836 void __init remove_all_active_ranges(void)
2837 { 2837 {
2838 memset(early_node_map, 0, sizeof(early_node_map)); 2838 memset(early_node_map, 0, sizeof(early_node_map));
2839 nr_nodemap_entries = 0; 2839 nr_nodemap_entries = 0;
2840 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE 2840 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
2841 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); 2841 memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
2842 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); 2842 memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
2843 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 2843 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
2844 } 2844 }
2845 2845
2846 /* Compare two active node_active_regions */ 2846 /* Compare two active node_active_regions */
2847 static int __init cmp_node_active_region(const void *a, const void *b) 2847 static int __init cmp_node_active_region(const void *a, const void *b)
2848 { 2848 {
2849 struct node_active_region *arange = (struct node_active_region *)a; 2849 struct node_active_region *arange = (struct node_active_region *)a;
2850 struct node_active_region *brange = (struct node_active_region *)b; 2850 struct node_active_region *brange = (struct node_active_region *)b;
2851 2851
2852 /* Done this way to avoid overflows */ 2852 /* Done this way to avoid overflows */
2853 if (arange->start_pfn > brange->start_pfn) 2853 if (arange->start_pfn > brange->start_pfn)
2854 return 1; 2854 return 1;
2855 if (arange->start_pfn < brange->start_pfn) 2855 if (arange->start_pfn < brange->start_pfn)
2856 return -1; 2856 return -1;
2857 2857
2858 return 0; 2858 return 0;
2859 } 2859 }
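
The "Done this way to avoid overflows" comment is worth spelling out: the obvious one-liner, returning the difference of the two start_pfn values, truncates an unsigned long into an int and can report the wrong order once the PFNs are far enough apart. A standalone demonstration with made-up PFNs follows; the wraparound shown assumes a typical two's-complement 64-bit build.

#include <stdio.h>

struct node_active_region { unsigned long start_pfn, end_pfn; int nid; };

/* Buggy comparator: the unsigned long difference is truncated to int */
static int cmp_subtract(const void *a, const void *b)
{
        const struct node_active_region *ar = a, *br = b;
        return ar->start_pfn - br->start_pfn;
}

/* Comparator as written in the code above */
static int cmp_compare(const void *a, const void *b)
{
        const struct node_active_region *ar = a, *br = b;
        if (ar->start_pfn > br->start_pfn)
                return 1;
        if (ar->start_pfn < br->start_pfn)
                return -1;
        return 0;
}

int main(void)
{
        /* Made-up PFNs more than INT_MAX apart (possible on 64-bit) */
        struct node_active_region lo = { .start_pfn = 0 };
        struct node_active_region hi = { .start_pfn = 3000000000UL };

        /* The subtraction wraps to a positive int, claiming lo sorts after hi,
         * while the comparison-based version correctly returns -1. */
        printf("subtract: %d  compare: %d\n",
               cmp_subtract(&lo, &hi), cmp_compare(&lo, &hi));
        return 0;
}
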
2860 2860
2861 /* sort the node_map by start_pfn */ 2861 /* sort the node_map by start_pfn */
2862 static void __init sort_node_map(void) 2862 static void __init sort_node_map(void)
2863 { 2863 {
2864 sort(early_node_map, (size_t)nr_nodemap_entries, 2864 sort(early_node_map, (size_t)nr_nodemap_entries,
2865 sizeof(struct node_active_region), 2865 sizeof(struct node_active_region),
2866 cmp_node_active_region, NULL); 2866 cmp_node_active_region, NULL);
2867 } 2867 }
2868 2868
2869 /* Find the lowest pfn for a node */ 2869 /* Find the lowest pfn for a node */
2870 unsigned long __init find_min_pfn_for_node(unsigned long nid) 2870 unsigned long __init find_min_pfn_for_node(unsigned long nid)
2871 { 2871 {
2872 int i; 2872 int i;
2873 unsigned long min_pfn = ULONG_MAX; 2873 unsigned long min_pfn = ULONG_MAX;
2874 2874
2875 /* Assuming a sorted map, the first range found has the starting pfn */ 2875 /* Assuming a sorted map, the first range found has the starting pfn */
2876 for_each_active_range_index_in_nid(i, nid) 2876 for_each_active_range_index_in_nid(i, nid)
2877 min_pfn = min(min_pfn, early_node_map[i].start_pfn); 2877 min_pfn = min(min_pfn, early_node_map[i].start_pfn);
2878 2878
2879 if (min_pfn == ULONG_MAX) { 2879 if (min_pfn == ULONG_MAX) {
2880 printk(KERN_WARNING 2880 printk(KERN_WARNING
2881 "Could not find start_pfn for node %lu\n", nid); 2881 "Could not find start_pfn for node %lu\n", nid);
2882 return 0; 2882 return 0;
2883 } 2883 }
2884 2884
2885 return min_pfn; 2885 return min_pfn;
2886 } 2886 }
2887 2887
2888 /** 2888 /**
2889 * find_min_pfn_with_active_regions - Find the minimum PFN registered 2889 * find_min_pfn_with_active_regions - Find the minimum PFN registered
2890 * 2890 *
2891 * It returns the minimum PFN based on information provided via 2891 * It returns the minimum PFN based on information provided via
2892 * add_active_range(). 2892 * add_active_range().
2893 */ 2893 */
2894 unsigned long __init find_min_pfn_with_active_regions(void) 2894 unsigned long __init find_min_pfn_with_active_regions(void)
2895 { 2895 {
2896 return find_min_pfn_for_node(MAX_NUMNODES); 2896 return find_min_pfn_for_node(MAX_NUMNODES);
2897 } 2897 }
2898 2898
2899 /** 2899 /**
2900 * find_max_pfn_with_active_regions - Find the maximum PFN registered 2900 * find_max_pfn_with_active_regions - Find the maximum PFN registered
2901 * 2901 *
2902 * It returns the maximum PFN based on information provided via 2902 * It returns the maximum PFN based on information provided via
2903 * add_active_range(). 2903 * add_active_range().
2904 */ 2904 */
2905 unsigned long __init find_max_pfn_with_active_regions(void) 2905 unsigned long __init find_max_pfn_with_active_regions(void)
2906 { 2906 {
2907 int i; 2907 int i;
2908 unsigned long max_pfn = 0; 2908 unsigned long max_pfn = 0;
2909 2909
2910 for (i = 0; i < nr_nodemap_entries; i++) 2910 for (i = 0; i < nr_nodemap_entries; i++)
2911 max_pfn = max(max_pfn, early_node_map[i].end_pfn); 2911 max_pfn = max(max_pfn, early_node_map[i].end_pfn);
2912 2912
2913 return max_pfn; 2913 return max_pfn;
2914 } 2914 }
2915 2915
2916 /** 2916 /**
2917 * free_area_init_nodes - Initialise all pg_data_t and zone data 2917 * free_area_init_nodes - Initialise all pg_data_t and zone data
2918 * @max_zone_pfn: an array of max PFNs for each zone 2918 * @max_zone_pfn: an array of max PFNs for each zone
2919 * 2919 *
2920 * This will call free_area_init_node() for each active node in the system. 2920 * This will call free_area_init_node() for each active node in the system.
2921 * Using the page ranges provided by add_active_range(), the size of each 2921 * Using the page ranges provided by add_active_range(), the size of each
2922 * zone in each node, and of its holes, is calculated. If the maximum PFNs 2922 * zone in each node, and of its holes, is calculated. If the maximum PFNs
2923 * of two adjacent zones match, the higher zone is assumed to be empty. 2923 * of two adjacent zones match, the higher zone is assumed to be empty.
2924 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed 2924 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
2925 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone 2925 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
2926 * starts where the previous one ended. For example, ZONE_DMA32 starts 2926 * starts where the previous one ended. For example, ZONE_DMA32 starts
2927 * at arch_max_dma_pfn. 2927 * at arch_max_dma_pfn.
2928 */ 2928 */
2929 void __init free_area_init_nodes(unsigned long *max_zone_pfn) 2929 void __init free_area_init_nodes(unsigned long *max_zone_pfn)
2930 { 2930 {
2931 unsigned long nid; 2931 unsigned long nid;
2932 enum zone_type i; 2932 enum zone_type i;
2933 2933
2934 /* Sort early_node_map as initialisation assumes it is sorted */ 2934 /* Sort early_node_map as initialisation assumes it is sorted */
2935 sort_node_map(); 2935 sort_node_map();
2936 2936
2937 /* Record where the zone boundaries are */ 2937 /* Record where the zone boundaries are */
2938 memset(arch_zone_lowest_possible_pfn, 0, 2938 memset(arch_zone_lowest_possible_pfn, 0,
2939 sizeof(arch_zone_lowest_possible_pfn)); 2939 sizeof(arch_zone_lowest_possible_pfn));
2940 memset(arch_zone_highest_possible_pfn, 0, 2940 memset(arch_zone_highest_possible_pfn, 0,
2941 sizeof(arch_zone_highest_possible_pfn)); 2941 sizeof(arch_zone_highest_possible_pfn));
2942 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions(); 2942 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
2943 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0]; 2943 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
2944 for (i = 1; i < MAX_NR_ZONES; i++) { 2944 for (i = 1; i < MAX_NR_ZONES; i++) {
2945 arch_zone_lowest_possible_pfn[i] = 2945 arch_zone_lowest_possible_pfn[i] =
2946 arch_zone_highest_possible_pfn[i-1]; 2946 arch_zone_highest_possible_pfn[i-1];
2947 arch_zone_highest_possible_pfn[i] = 2947 arch_zone_highest_possible_pfn[i] =
2948 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]); 2948 max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
2949 } 2949 }
2950 2950
2951 /* Print out the zone ranges */ 2951 /* Print out the zone ranges */
2952 printk("Zone PFN ranges:\n"); 2952 printk("Zone PFN ranges:\n");
2953 for (i = 0; i < MAX_NR_ZONES; i++) 2953 for (i = 0; i < MAX_NR_ZONES; i++)
2954 printk(" %-8s %8lu -> %8lu\n", 2954 printk(" %-8s %8lu -> %8lu\n",
2955 zone_names[i], 2955 zone_names[i],
2956 arch_zone_lowest_possible_pfn[i], 2956 arch_zone_lowest_possible_pfn[i],
2957 arch_zone_highest_possible_pfn[i]); 2957 arch_zone_highest_possible_pfn[i]);
2958 2958
2959 /* Print out the early_node_map[] */ 2959 /* Print out the early_node_map[] */
2960 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); 2960 printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
2961 for (i = 0; i < nr_nodemap_entries; i++) 2961 for (i = 0; i < nr_nodemap_entries; i++)
2962 printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid, 2962 printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid,
2963 early_node_map[i].start_pfn, 2963 early_node_map[i].start_pfn,
2964 early_node_map[i].end_pfn); 2964 early_node_map[i].end_pfn);
2965 2965
2966 /* Initialise every node */ 2966 /* Initialise every node */
2967 setup_nr_node_ids(); 2967 setup_nr_node_ids();
2968 for_each_online_node(nid) { 2968 for_each_online_node(nid) {
2969 pg_data_t *pgdat = NODE_DATA(nid); 2969 pg_data_t *pgdat = NODE_DATA(nid);
2970 free_area_init_node(nid, pgdat, NULL, 2970 free_area_init_node(nid, pgdat, NULL,
2971 find_min_pfn_for_node(nid), NULL); 2971 find_min_pfn_for_node(nid), NULL);
2972 } 2972 }
2973 } 2973 }
2974 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ 2974 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
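
The kernel-doc for free_area_init_nodes() above describes the expected call sequence under CONFIG_ARCH_POPULATES_NODE_MAP: register memory with add_active_range(), then pass only the maximum PFN of each zone. Below is a hypothetical kernel-context sketch (not part of this commit) for a two-node, 4GB machine with 4K pages and a config that has ZONE_DMA and ZONE_NORMAL; all PFN values are invented for illustration.

/* Hypothetical arch setup: register active ranges, then size the zones. */
static void __init hypothetical_paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	add_active_range(0, 0x00000, 0x80000);		/* node 0: first 2GB  */
	add_active_range(1, 0x80000, 0x100000);		/* node 1: second 2GB */

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = 0x1000;		/* first 16MB         */
	max_zone_pfns[ZONE_NORMAL] = 0x100000;		/* everything else    */

	/* Zones are contiguous: ZONE_NORMAL starts where ZONE_DMA ends. */
	free_area_init_nodes(max_zone_pfns);
}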
2975 2975
2976 /** 2976 /**
2977 * set_dma_reserve - set the specified number of pages reserved in the first zone 2977 * set_dma_reserve - set the specified number of pages reserved in the first zone
2978 * @new_dma_reserve: The number of pages to mark reserved 2978 * @new_dma_reserve: The number of pages to mark reserved
2979 * 2979 *
2980 * The per-cpu batchsize and zone watermarks are determined by present_pages. 2980 * The per-cpu batchsize and zone watermarks are determined by present_pages.
2981 * In the DMA zone, a significant percentage may be consumed by the kernel image 2981 * In the DMA zone, a significant percentage may be consumed by the kernel image
2982 * and other unfreeable allocations which can skew the watermarks badly. This 2982 * and other unfreeable allocations which can skew the watermarks badly. This
2983 * function may optionally be used to account for unfreeable pages in the 2983 * function may optionally be used to account for unfreeable pages in the
2984 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and 2984 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
2985 * smaller per-cpu batchsize. 2985 * smaller per-cpu batchsize.
2986 */ 2986 */
2987 void __init set_dma_reserve(unsigned long new_dma_reserve) 2987 void __init set_dma_reserve(unsigned long new_dma_reserve)
2988 { 2988 {
2989 dma_reserve = new_dma_reserve; 2989 dma_reserve = new_dma_reserve;
2990 } 2990 }
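
As a hedged sketch of how set_dma_reserve() is meant to be used (the helper name and the numbers below are assumptions; only set_dma_reserve() comes from this file): arch setup code estimates how many pages of the first zone are permanently pinned by the kernel image and early allocations, and reports that count so the watermark and per-cpu batch calculations discount them.

/* Hypothetical arch code: discount unfreeable pages in the first zone. */
static void __init hypothetical_account_low_reserve(void)
{
	unsigned long dma_zone_pages = 0x1000;	/* pages in the first zone (example)     */
	unsigned long dma_free_pages = 0x0c00;	/* still free after early boot (example) */

	if (dma_free_pages < dma_zone_pages)
		set_dma_reserve(dma_zone_pages - dma_free_pages);
}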
2991 2991
2992 #ifndef CONFIG_NEED_MULTIPLE_NODES 2992 #ifndef CONFIG_NEED_MULTIPLE_NODES
2993 static bootmem_data_t contig_bootmem_data; 2993 static bootmem_data_t contig_bootmem_data;
2994 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; 2994 struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
2995 2995
2996 EXPORT_SYMBOL(contig_page_data); 2996 EXPORT_SYMBOL(contig_page_data);
2997 #endif 2997 #endif
2998 2998
2999 void __init free_area_init(unsigned long *zones_size) 2999 void __init free_area_init(unsigned long *zones_size)
3000 { 3000 {
3001 free_area_init_node(0, NODE_DATA(0), zones_size, 3001 free_area_init_node(0, NODE_DATA(0), zones_size,
3002 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 3002 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
3003 } 3003 }
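
For the flat (!CONFIG_NEED_MULTIPLE_NODES) case, an architecture only has to fill a zones_size[] array of per-zone page counts and call free_area_init(), which initialises node 0 starting at the PFN of PAGE_OFFSET. A minimal hypothetical sketch with invented sizes:

/* Hypothetical flat-memory setup: 16MB of ZONE_DMA, 112MB of ZONE_NORMAL. */
static void __init hypothetical_flat_paging_init(void)
{
	unsigned long zones_size[MAX_NR_ZONES] = { 0 };

	zones_size[ZONE_DMA] = 0x1000;		/* 4096 pages  = 16MB  */
	zones_size[ZONE_NORMAL] = 0x7000;	/* 28672 pages = 112MB */

	free_area_init(zones_size);
}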
3004 3004
3005 static int page_alloc_cpu_notify(struct notifier_block *self, 3005 static int page_alloc_cpu_notify(struct notifier_block *self,
3006 unsigned long action, void *hcpu) 3006 unsigned long action, void *hcpu)
3007 { 3007 {
3008 int cpu = (unsigned long)hcpu; 3008 int cpu = (unsigned long)hcpu;
3009 3009
3010 if (action == CPU_DEAD) { 3010 if (action == CPU_DEAD) {
3011 local_irq_disable(); 3011 local_irq_disable();
3012 __drain_pages(cpu); 3012 __drain_pages(cpu);
3013 vm_events_fold_cpu(cpu); 3013 vm_events_fold_cpu(cpu);
3014 local_irq_enable(); 3014 local_irq_enable();
3015 refresh_cpu_vm_stats(cpu); 3015 refresh_cpu_vm_stats(cpu);
3016 } 3016 }
3017 return NOTIFY_OK; 3017 return NOTIFY_OK;
3018 } 3018 }
3019 3019
3020 void __init page_alloc_init(void) 3020 void __init page_alloc_init(void)
3021 { 3021 {
3022 hotcpu_notifier(page_alloc_cpu_notify, 0); 3022 hotcpu_notifier(page_alloc_cpu_notify, 0);
3023 } 3023 }
3024 3024
3025 /* 3025 /*
3026 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio 3026 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
3027 * or min_free_kbytes changes. 3027 * or min_free_kbytes changes.
3028 */ 3028 */
3029 static void calculate_totalreserve_pages(void) 3029 static void calculate_totalreserve_pages(void)
3030 { 3030 {
3031 struct pglist_data *pgdat; 3031 struct pglist_data *pgdat;
3032 unsigned long reserve_pages = 0; 3032 unsigned long reserve_pages = 0;
3033 enum zone_type i, j; 3033 enum zone_type i, j;
3034 3034
3035 for_each_online_pgdat(pgdat) { 3035 for_each_online_pgdat(pgdat) {
3036 for (i = 0; i < MAX_NR_ZONES; i++) { 3036 for (i = 0; i < MAX_NR_ZONES; i++) {
3037 struct zone *zone = pgdat->node_zones + i; 3037 struct zone *zone = pgdat->node_zones + i;
3038 unsigned long max = 0; 3038 unsigned long max = 0;
3039 3039
3040 /* Find valid and maximum lowmem_reserve in the zone */ 3040 /* Find valid and maximum lowmem_reserve in the zone */
3041 for (j = i; j < MAX_NR_ZONES; j++) { 3041 for (j = i; j < MAX_NR_ZONES; j++) {
3042 if (zone->lowmem_reserve[j] > max) 3042 if (zone->lowmem_reserve[j] > max)
3043 max = zone->lowmem_reserve[j]; 3043 max = zone->lowmem_reserve[j];
3044 } 3044 }
3045 3045
3046 /* we treat pages_high as reserved pages. */ 3046 /* we treat pages_high as reserved pages. */
3047 max += zone->pages_high; 3047 max += zone->pages_high;
3048 3048
3049 if (max > zone->present_pages) 3049 if (max > zone->present_pages)
3050 max = zone->present_pages; 3050 max = zone->present_pages;
3051 reserve_pages += max; 3051 reserve_pages += max;
3052 } 3052 }
3053 } 3053 }
3054 totalreserve_pages = reserve_pages; 3054 totalreserve_pages = reserve_pages;
3055 } 3055 }
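
A standalone walk-through of what the loop above adds up, per zone: the largest lowmem_reserve[] entry plus pages_high, capped at present_pages. The zone sizes below are invented purely for the arithmetic.

#include <stdio.h>

#define NR_ZONES 2

struct demo_zone {
	unsigned long present_pages;
	unsigned long pages_high;
	unsigned long lowmem_reserve[NR_ZONES];
};

int main(void)
{
	struct demo_zone zones[NR_ZONES] = {
		{ .present_pages = 4096,   .pages_high = 12,  .lowmem_reserve = { 0, 880 } },
		{ .present_pages = 225280, .pages_high = 660, .lowmem_reserve = { 0, 0 } },
	};
	unsigned long reserve_pages = 0;
	int i, j;

	for (i = 0; i < NR_ZONES; i++) {
		unsigned long max = 0;

		for (j = i; j < NR_ZONES; j++)
			if (zones[i].lowmem_reserve[j] > max)
				max = zones[i].lowmem_reserve[j];
		max += zones[i].pages_high;		/* pages_high counts as reserved */
		if (max > zones[i].present_pages)
			max = zones[i].present_pages;
		reserve_pages += max;
	}
	printf("totalreserve_pages = %lu\n", reserve_pages);	/* 892 + 660 = 1552 */
	return 0;
}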
3056 3056
3057 /* 3057 /*
3058 * setup_per_zone_lowmem_reserve - called whenever 3058 * setup_per_zone_lowmem_reserve - called whenever
3059 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone 3059 * sysctl_lowmem_reserve_ratio changes. Ensures that each zone
3060 * has a correct pages reserved value, so an adequate number of 3060 * has a correct pages reserved value, so an adequate number of
3061 * pages are left in the zone after a successful __alloc_pages(). 3061 * pages are left in the zone after a successful __alloc_pages().
3062 */ 3062 */
3063 static void setup_per_zone_lowmem_reserve(void) 3063 static void setup_per_zone_lowmem_reserve(void)
3064 { 3064 {
3065 struct pglist_data *pgdat; 3065 struct pglist_data *pgdat;
3066 enum zone_type j, idx; 3066 enum zone_type j, idx;
3067 3067
3068 for_each_online_pgdat(pgdat) { 3068 for_each_online_pgdat(pgdat) {
3069 for (j = 0; j < MAX_NR_ZONES; j++) { 3069 for (j = 0; j < MAX_NR_ZONES; j++) {
3070 struct zone *zone = pgdat->node_zones + j; 3070 struct zone *zone = pgdat->node_zones + j;
3071 unsigned long present_pages = zone->present_pages; 3071 unsigned long present_pages = zone->present_pages;
3072 3072
3073 zone->lowmem_reserve[j] = 0; 3073 zone->lowmem_reserve[j] = 0;
3074 3074
3075 idx = j; 3075 idx = j;
3076 while (idx) { 3076 while (idx) {
3077 struct zone *lower_zone; 3077 struct zone *lower_zone;
3078 3078
3079 idx--; 3079 idx--;
3080 3080
3081 if (sysctl_lowmem_reserve_ratio[idx] < 1) 3081 if (sysctl_lowmem_reserve_ratio[idx] < 1)
3082 sysctl_lowmem_reserve_ratio[idx] = 1; 3082 sysctl_lowmem_reserve_ratio[idx] = 1;
3083 3083
3084 lower_zone = pgdat->node_zones + idx; 3084 lower_zone = pgdat->node_zones + idx;
3085 lower_zone->lowmem_reserve[j] = present_pages / 3085 lower_zone->lowmem_reserve[j] = present_pages /
3086 sysctl_lowmem_reserve_ratio[idx]; 3086 sysctl_lowmem_reserve_ratio[idx];
3087 present_pages += lower_zone->present_pages; 3087 present_pages += lower_zone->present_pages;
3088 } 3088 }
3089 } 3089 }
3090 } 3090 }
3091 3091
3092 /* update totalreserve_pages */ 3092 /* update totalreserve_pages */
3093 calculate_totalreserve_pages(); 3093 calculate_totalreserve_pages();
3094 } 3094 }
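
The nested loop above is easiest to see with numbers. Below is a standalone re-run of the same arithmetic for an invented i386-style layout (ZONE_DMA 4096 pages, ZONE_NORMAL 225280, ZONE_HIGHMEM 32768) with ratios { 256, 32 }, the defaults for that layout at the time; it reproduces DMA->lowmem_reserve[NORMAL] = 880, NORMAL->lowmem_reserve[HIGHMEM] = 1024 and DMA->lowmem_reserve[HIGHMEM] = 1008.

#include <stdio.h>

#define NR_ZONES 3	/* DMA, NORMAL, HIGHMEM in this example */

int main(void)
{
	unsigned long present[NR_ZONES] = { 4096, 225280, 32768 };
	unsigned long ratio[NR_ZONES - 1] = { 256, 32 };	/* sysctl_lowmem_reserve_ratio */
	unsigned long reserve[NR_ZONES][NR_ZONES] = { { 0 } };
	int j, idx;

	for (j = 0; j < NR_ZONES; j++) {
		unsigned long pages = present[j];

		/* Walk down through every lower zone, as setup_per_zone_lowmem_reserve() does. */
		for (idx = j - 1; idx >= 0; idx--) {
			reserve[idx][j] = pages / ratio[idx];
			pages += present[idx];
		}
	}

	for (idx = 0; idx < NR_ZONES; idx++)
		printf("zone %d lowmem_reserve = { %lu, %lu, %lu }\n", idx,
		       reserve[idx][0], reserve[idx][1], reserve[idx][2]);
	return 0;
}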
3095 3095
3096 /** 3096 /**
3097 * setup_per_zone_pages_min - called when min_free_kbytes changes. 3097 * setup_per_zone_pages_min - called when min_free_kbytes changes.
3098 * 3098 *
3099 * Ensures that the pages_{min,low,high} values for each zone are set correctly 3099 * Ensures that the pages_{min,low,high} values for each zone are set correctly
3100 * with respect to min_free_kbytes. 3100 * with respect to min_free_kbytes.
3101 */ 3101 */
3102 void setup_per_zone_pages_min(void) 3102 void setup_per_zone_pages_min(void)
3103 { 3103 {
3104 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 3104 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
3105 unsigned long lowmem_pages = 0; 3105 unsigned long lowmem_pages = 0;
3106 struct zone *zone; 3106 struct zone *zone;
3107 unsigned long flags; 3107 unsigned long flags;
3108 3108
3109 /* Calculate total number of !ZONE_HIGHMEM pages */ 3109 /* Calculate total number of !ZONE_HIGHMEM pages */
3110 for_each_zone(zone) { 3110 for_each_zone(zone) {
3111 if (!is_highmem(zone)) 3111 if (!is_highmem(zone))
3112 lowmem_pages += zone->present_pages; 3112 lowmem_pages += zone->present_pages;
3113 } 3113 }
3114 3114
3115 for_each_zone(zone) { 3115 for_each_zone(zone) {
3116 u64 tmp; 3116 u64 tmp;
3117 3117
3118 spin_lock_irqsave(&zone->lru_lock, flags); 3118 spin_lock_irqsave(&zone->lru_lock, flags);
3119 tmp = (u64)pages_min * zone->present_pages; 3119 tmp = (u64)pages_min * zone->present_pages;
3120 do_div(tmp, lowmem_pages); 3120 do_div(tmp, lowmem_pages);
3121 if (is_highmem(zone)) { 3121 if (is_highmem(zone)) {
3122 /* 3122 /*
3123 * __GFP_HIGH and PF_MEMALLOC allocations usually don't 3123 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
3124 * need highmem pages, so cap pages_min to a small 3124 * need highmem pages, so cap pages_min to a small
3125 * value here. 3125 * value here.
3126 * 3126 *
3127 * The (pages_high-pages_low) and (pages_low-pages_min) 3127 * The (pages_high-pages_low) and (pages_low-pages_min)
3128 * deltas control asynchronous page reclaim, and so should 3128 * deltas control asynchronous page reclaim, and so should
3129 * not be capped for highmem. 3129 * not be capped for highmem.
3130 */ 3130 */
3131 int min_pages; 3131 int min_pages;
3132 3132
3133 min_pages = zone->present_pages / 1024; 3133 min_pages = zone->present_pages / 1024;
3134 if (min_pages < SWAP_CLUSTER_MAX) 3134 if (min_pages < SWAP_CLUSTER_MAX)
3135 min_pages = SWAP_CLUSTER_MAX; 3135 min_pages = SWAP_CLUSTER_MAX;
3136 if (min_pages > 128) 3136 if (min_pages > 128)
3137 min_pages = 128; 3137 min_pages = 128;
3138 zone->pages_min = min_pages; 3138 zone->pages_min = min_pages;
3139 } else { 3139 } else {
3140 /* 3140 /*
3141 * If it's a lowmem zone, reserve a number of pages 3141 * If it's a lowmem zone, reserve a number of pages
3142 * proportionate to the zone's size. 3142 * proportionate to the zone's size.
3143 */ 3143 */
3144 zone->pages_min = tmp; 3144 zone->pages_min = tmp;
3145 } 3145 }
3146 3146
3147 zone->pages_low = zone->pages_min + (tmp >> 2); 3147 zone->pages_low = zone->pages_min + (tmp >> 2);
3148 zone->pages_high = zone->pages_min + (tmp >> 1); 3148 zone->pages_high = zone->pages_min + (tmp >> 1);
3149 spin_unlock_irqrestore(&zone->lru_lock, flags); 3149 spin_unlock_irqrestore(&zone->lru_lock, flags);
3150 } 3150 }
3151 3151
3152 /* update totalreserve_pages */ 3152 /* update totalreserve_pages */
3153 calculate_totalreserve_pages(); 3153 calculate_totalreserve_pages();
3154 } 3154 }
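
A worked example of the lowmem branch above, with invented but convenient numbers: min_free_kbytes = 4096 with 4K pages gives pages_min = 1024, and with a single 262144-page lowmem zone the proportional split leaves pages_min/pages_low/pages_high at 1024/1280/1536.

#include <stdio.h>

int main(void)
{
	unsigned long min_free_kbytes = 4096;		/* example sysctl value     */
	int page_shift = 12;				/* 4K pages                 */
	unsigned long pages_min = min_free_kbytes >> (page_shift - 10);
	unsigned long lowmem_pages = 262144;		/* all !HIGHMEM pages (1GB) */
	unsigned long zone_present = 262144;		/* the single lowmem zone   */
	unsigned long long tmp;

	/* Same proportional split as the !is_highmem() branch above. */
	tmp = (unsigned long long)pages_min * zone_present / lowmem_pages;
	printf("pages_min=%llu pages_low=%llu pages_high=%llu\n",
	       tmp, tmp + (tmp >> 2), tmp + (tmp >> 1));
	return 0;
}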
3155 3155
3156 /* 3156 /*
3157 * Initialise min_free_kbytes. 3157 * Initialise min_free_kbytes.
3158 * 3158 *
3159 * For small machines we want it small (128k min). For large machines 3159 * For small machines we want it small (128k min). For large machines
3160 * we want it large (64MB max). But it is not linear, because network 3160 * we want it large (64MB max). But it is not linear, because network
3161 * bandwidth does not increase linearly with machine size. We use 3161 * bandwidth does not increase linearly with machine size. We use
3162 * 3162 *
3163 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: 3163 * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
3164 * min_free_kbytes = sqrt(lowmem_kbytes * 16) 3164 * min_free_kbytes = sqrt(lowmem_kbytes * 16)
3165 * 3165 *
3166 * which yields 3166 * which yields
3167 * 3167 *
3168 * 16MB: 512k 3168 * 16MB: 512k
3169 * 32MB: 724k 3169 * 32MB: 724k
3170 * 64MB: 1024k 3170 * 64MB: 1024k
3171 * 128MB: 1448k 3171 * 128MB: 1448k
3172 * 256MB: 2048k 3172 * 256MB: 2048k
3173 * 512MB: 2896k 3173 * 512MB: 2896k
3174 * 1024MB: 4096k 3174 * 1024MB: 4096k
3175 * 2048MB: 5792k 3175 * 2048MB: 5792k
3176 * 4096MB: 8192k 3176 * 4096MB: 8192k
3177 * 8192MB: 11584k 3177 * 8192MB: 11584k
3178 * 16384MB: 16384k 3178 * 16384MB: 16384k
3179 */ 3179 */
3180 static int __init init_per_zone_pages_min(void) 3180 static int __init init_per_zone_pages_min(void)
3181 { 3181 {
3182 unsigned long lowmem_kbytes; 3182 unsigned long lowmem_kbytes;
3183 3183
3184 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); 3184 lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
3185 3185
3186 min_free_kbytes = int_sqrt(lowmem_kbytes * 16); 3186 min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
3187 if (min_free_kbytes < 128) 3187 if (min_free_kbytes < 128)
3188 min_free_kbytes = 128; 3188 min_free_kbytes = 128;
3189 if (min_free_kbytes > 65536) 3189 if (min_free_kbytes > 65536)
3190 min_free_kbytes = 65536; 3190 min_free_kbytes = 65536;
3191 setup_per_zone_pages_min(); 3191 setup_per_zone_pages_min();
3192 setup_per_zone_lowmem_reserve(); 3192 setup_per_zone_lowmem_reserve();
3193 return 0; 3193 return 0;
3194 } 3194 }
3195 module_init(init_per_zone_pages_min) 3195 module_init(init_per_zone_pages_min)
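
The table in the comment block above follows directly from min_free_kbytes = sqrt(lowmem_kbytes * 16) with the 128/65536 clamps applied in init_per_zone_pages_min(). A standalone check for a few of the listed sizes (floating-point sqrt() stands in for the kernel's int_sqrt(); link with -lm):

#include <math.h>
#include <stdio.h>

int main(void)
{
	unsigned long lowmem_mb[] = { 16, 128, 1024, 16384 };
	unsigned int i;

	for (i = 0; i < sizeof(lowmem_mb) / sizeof(lowmem_mb[0]); i++) {
		unsigned long lowmem_kbytes = lowmem_mb[i] * 1024;
		unsigned long min_free = (unsigned long)sqrt((double)lowmem_kbytes * 16);

		if (min_free < 128)
			min_free = 128;
		if (min_free > 65536)
			min_free = 65536;
		printf("%6luMB lowmem -> min_free_kbytes = %lu\n", lowmem_mb[i], min_free);
	}
	return 0;	/* prints 512, 1448, 4096, 16384 */
}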
3196 3196
3197 /* 3197 /*
3198 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 3198 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
3199 * that we can call two helper functions whenever min_free_kbytes 3199 * that we can call two helper functions whenever min_free_kbytes
3200 * changes. 3200 * changes.
3201 */ 3201 */
3202 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 3202 int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
3203 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 3203 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3204 { 3204 {
3205 proc_dointvec(table, write, file, buffer, length, ppos); 3205 proc_dointvec(table, write, file, buffer, length, ppos);
3206 setup_per_zone_pages_min(); 3206 setup_per_zone_pages_min();
3207 return 0; 3207 return 0;
3208 } 3208 }
3209 3209
3210 #ifdef CONFIG_NUMA 3210 #ifdef CONFIG_NUMA
3211 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, 3211 int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
3212 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 3212 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3213 { 3213 {
3214 struct zone *zone; 3214 struct zone *zone;
3215 int rc; 3215 int rc;
3216 3216
3217 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 3217 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
3218 if (rc) 3218 if (rc)
3219 return rc; 3219 return rc;
3220 3220
3221 for_each_zone(zone) 3221 for_each_zone(zone)
3222 zone->min_unmapped_pages = (zone->present_pages * 3222 zone->min_unmapped_pages = (zone->present_pages *
3223 sysctl_min_unmapped_ratio) / 100; 3223 sysctl_min_unmapped_ratio) / 100;
3224 return 0; 3224 return 0;
3225 } 3225 }
3226 3226
3227 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, 3227 int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
3228 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 3228 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3229 { 3229 {
3230 struct zone *zone; 3230 struct zone *zone;
3231 int rc; 3231 int rc;
3232 3232
3233 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 3233 rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
3234 if (rc) 3234 if (rc)
3235 return rc; 3235 return rc;
3236 3236
3237 for_each_zone(zone) 3237 for_each_zone(zone)
3238 zone->min_slab_pages = (zone->present_pages * 3238 zone->min_slab_pages = (zone->present_pages *
3239 sysctl_min_slab_ratio) / 100; 3239 sysctl_min_slab_ratio) / 100;
3240 return 0; 3240 return 0;
3241 } 3241 }
3242 #endif 3242 #endif
3243 3243
3244 /* 3244 /*
3245 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around 3245 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
3246 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve() 3246 * proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
3247 * whenever sysctl_lowmem_reserve_ratio changes. 3247 * whenever sysctl_lowmem_reserve_ratio changes.
3248 * 3248 *
3249 * The reserve ratio has no relation to the pages_min watermarks. The 3249 * The reserve ratio has no relation to the pages_min watermarks. The
3250 * lowmem reserve ratio only makes sense as a function of the boot-time 3250 * lowmem reserve ratio only makes sense as a function of the boot-time
3251 * zone sizes. 3251 * zone sizes.
3252 */ 3252 */
3253 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 3253 int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
3254 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 3254 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3255 { 3255 {
3256 proc_dointvec_minmax(table, write, file, buffer, length, ppos); 3256 proc_dointvec_minmax(table, write, file, buffer, length, ppos);
3257 setup_per_zone_lowmem_reserve(); 3257 setup_per_zone_lowmem_reserve();
3258 return 0; 3258 return 0;
3259 } 3259 }
3260 3260
3261 /* 3261 /*
3262 * percpu_pagelist_fraction - changes the pcp->high for each zone on each 3262 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
3263 * cpu. It is the fraction of total pages in each zone that a hot per-cpu 3263 * cpu. It is the fraction of total pages in each zone that a hot per-cpu
3264 * pagelist can hold before it is flushed back to the buddy allocator. 3264 * pagelist can hold before it is flushed back to the buddy allocator.
3265 */ 3265 */
3266 3266
3267 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, 3267 int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
3268 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 3268 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
3269 { 3269 {
3270 struct zone *zone; 3270 struct zone *zone;
3271 unsigned int cpu; 3271 unsigned int cpu;
3272 int ret; 3272 int ret;
3273 3273
3274 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); 3274 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
3275 if (!write || (ret == -EINVAL)) 3275 if (!write || (ret == -EINVAL))
3276 return ret; 3276 return ret;
3277 for_each_zone(zone) { 3277 for_each_zone(zone) {
3278 for_each_online_cpu(cpu) { 3278 for_each_online_cpu(cpu) {
3279 unsigned long high; 3279 unsigned long high;
3280 high = zone->present_pages / percpu_pagelist_fraction; 3280 high = zone->present_pages / percpu_pagelist_fraction;
3281 setup_pagelist_highmark(zone_pcp(zone, cpu), high); 3281 setup_pagelist_highmark(zone_pcp(zone, cpu), high);
3282 } 3282 }
3283 } 3283 }
3284 return 0; 3284 return 0;
3285 } 3285 }
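
A quick arithmetic example of the handler above (numbers invented): with percpu_pagelist_fraction set to 8, a 225280-page zone gives each online CPU a hot-list high mark of 28160 pages.

#include <stdio.h>

int main(void)
{
	unsigned long present_pages = 225280;	/* example zone size                   */
	unsigned int fraction = 8;		/* percpu_pagelist_fraction (example)  */

	/* Same per-zone, per-cpu division as the handler above. */
	printf("pcp->high = %lu pages\n", present_pages / fraction);
	return 0;
}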
3286 3286
3287 int hashdist = HASHDIST_DEFAULT; 3287 int hashdist = HASHDIST_DEFAULT;
3288 3288
3289 #ifdef CONFIG_NUMA 3289 #ifdef CONFIG_NUMA
3290 static int __init set_hashdist(char *str) 3290 static int __init set_hashdist(char *str)
3291 { 3291 {
3292 if (!str) 3292 if (!str)
3293 return 0; 3293 return 0;
3294 hashdist = simple_strtoul(str, &str, 0); 3294 hashdist = simple_strtoul(str, &str, 0);
3295 return 1; 3295 return 1;
3296 } 3296 }
3297 __setup("hashdist=", set_hashdist); 3297 __setup("hashdist=", set_hashdist);
3298 #endif 3298 #endif
3299 3299
3300 /* 3300 /*
3301 * allocate a large system hash table from bootmem 3301 * allocate a large system hash table from bootmem
3302 * - it is assumed that the hash table must contain an exact power-of-2 3302 * - it is assumed that the hash table must contain an exact power-of-2
3303 * quantity of entries 3303 * quantity of entries
3304 * - limit is the number of hash buckets, not the total allocation size 3304 * - limit is the number of hash buckets, not the total allocation size
3305 */ 3305 */
3306 void *__init alloc_large_system_hash(const char *tablename, 3306 void *__init alloc_large_system_hash(const char *tablename,
3307 unsigned long bucketsize, 3307 unsigned long bucketsize,
3308 unsigned long numentries, 3308 unsigned long numentries,
3309 int scale, 3309 int scale,
3310 int flags, 3310 int flags,
3311 unsigned int *_hash_shift, 3311 unsigned int *_hash_shift,
3312 unsigned int *_hash_mask, 3312 unsigned int *_hash_mask,
3313 unsigned long limit) 3313 unsigned long limit)
3314 { 3314 {
3315 unsigned long long max = limit; 3315 unsigned long long max = limit;
3316 unsigned long log2qty, size; 3316 unsigned long log2qty, size;
3317 void *table = NULL; 3317 void *table = NULL;
3318 3318
3319 /* allow the kernel cmdline to have a say */ 3319 /* allow the kernel cmdline to have a say */
3320 if (!numentries) { 3320 if (!numentries) {
3321 /* round applicable memory size up to nearest megabyte */ 3321 /* round applicable memory size up to nearest megabyte */
3322 numentries = nr_kernel_pages; 3322 numentries = nr_kernel_pages;
3323 numentries += (1UL << (20 - PAGE_SHIFT)) - 1; 3323 numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
3324 numentries >>= 20 - PAGE_SHIFT; 3324 numentries >>= 20 - PAGE_SHIFT;
3325 numentries <<= 20 - PAGE_SHIFT; 3325 numentries <<= 20 - PAGE_SHIFT;
3326 3326
3327 /* limit to 1 bucket per 2^scale bytes of low memory */ 3327 /* limit to 1 bucket per 2^scale bytes of low memory */
3328 if (scale > PAGE_SHIFT) 3328 if (scale > PAGE_SHIFT)
3329 numentries >>= (scale - PAGE_SHIFT); 3329 numentries >>= (scale - PAGE_SHIFT);
3330 else 3330 else
3331 numentries <<= (PAGE_SHIFT - scale); 3331 numentries <<= (PAGE_SHIFT - scale);
3332 3332
3333 /* Make sure we've got at least a 0-order allocation.. */ 3333 /* Make sure we've got at least a 0-order allocation.. */
3334 if (unlikely((numentries * bucketsize) < PAGE_SIZE)) 3334 if (unlikely((numentries * bucketsize) < PAGE_SIZE))
3335 numentries = PAGE_SIZE / bucketsize; 3335 numentries = PAGE_SIZE / bucketsize;
3336 } 3336 }
3337 numentries = roundup_pow_of_two(numentries); 3337 numentries = roundup_pow_of_two(numentries);
3338 3338
3339 /* limit allocation size to 1/16 total memory by default */ 3339 /* limit allocation size to 1/16 total memory by default */
3340 if (max == 0) { 3340 if (max == 0) {
3341 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; 3341 max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
3342 do_div(max, bucketsize); 3342 do_div(max, bucketsize);
3343 } 3343 }
3344 3344
3345 if (numentries > max) 3345 if (numentries > max)
3346 numentries = max; 3346 numentries = max;
3347 3347
3348 log2qty = ilog2(numentries); 3348 log2qty = ilog2(numentries);
3349 3349
3350 do { 3350 do {
3351 size = bucketsize << log2qty; 3351 size = bucketsize << log2qty;
3352 if (flags & HASH_EARLY) 3352 if (flags & HASH_EARLY)
3353 table = alloc_bootmem(size); 3353 table = alloc_bootmem(size);
3354 else if (hashdist) 3354 else if (hashdist)
3355 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 3355 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
3356 else { 3356 else {
3357 unsigned long order; 3357 unsigned long order;
3358 for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) 3358 for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
3359 ; 3359 ;
3360 table = (void*) __get_free_pages(GFP_ATOMIC, order); 3360 table = (void*) __get_free_pages(GFP_ATOMIC, order);
3361 } 3361 }
3362 } while (!table && size > PAGE_SIZE && --log2qty); 3362 } while (!table && size > PAGE_SIZE && --log2qty);
3363 3363
3364 if (!table) 3364 if (!table)
3365 panic("Failed to allocate %s hash table\n", tablename); 3365 panic("Failed to allocate %s hash table\n", tablename);
3366 3366
3367 printk("%s hash table entries: %d (order: %d, %lu bytes)\n", 3367 printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
3368 tablename, 3368 tablename,
3369 (1U << log2qty), 3369 (1U << log2qty),
3370 ilog2(size) - PAGE_SHIFT, 3370 ilog2(size) - PAGE_SHIFT,
3371 size); 3371 size);
3372 3372
3373 if (_hash_shift) 3373 if (_hash_shift)
3374 *_hash_shift = log2qty; 3374 *_hash_shift = log2qty;
3375 if (_hash_mask) 3375 if (_hash_mask)
3376 *_hash_mask = (1 << log2qty) - 1; 3376 *_hash_mask = (1 << log2qty) - 1;
3377 3377
3378 return table; 3378 return table;
3379 } 3379 }
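
The sizing logic above compresses several steps. A standalone re-run of the same arithmetic with invented inputs (896MB of lowmem, 1GB total, a 16-byte bucket, scale = 14, no explicit limit) makes the result concrete: 65536 entries, a 1MB order-8 allocation and a hash mask of 0xffff.

#include <stdio.h>

/* Round v up to the next power of two (v > 0). */
static unsigned long roundup_pow2(unsigned long v)
{
	unsigned long r = 1;

	while (r < v)
		r <<= 1;
	return r;
}

int main(void)
{
	const int page_shift = 12;			/* 4K pages                      */
	const unsigned long page_size = 1UL << page_shift;
	unsigned long nr_kernel_pages = 229376;		/* ~896MB of lowmem (invented)   */
	unsigned long nr_all_pages = 262144;		/* 1GB total (invented)          */
	unsigned long bucketsize = 16;			/* bytes per bucket (invented)   */
	int scale = 14;					/* one bucket per 16KB of lowmem */
	unsigned long numentries, log2qty = 0, size, order = 0;
	unsigned long long max;

	/* Round the applicable memory size up to the nearest megabyte. */
	numentries = nr_kernel_pages;
	numentries += (1UL << (20 - page_shift)) - 1;
	numentries >>= 20 - page_shift;
	numentries <<= 20 - page_shift;

	/* Limit to one bucket per 2^scale bytes of low memory. */
	if (scale > page_shift)
		numentries >>= (scale - page_shift);
	else
		numentries <<= (page_shift - scale);
	if (numentries * bucketsize < page_size)
		numentries = page_size / bucketsize;
	numentries = roundup_pow2(numentries);

	/* Cap the allocation at 1/16 of total memory. */
	max = ((unsigned long long)nr_all_pages << page_shift) >> 4;
	max /= bucketsize;
	if (numentries > max)
		numentries = (unsigned long)max;

	while ((1UL << (log2qty + 1)) <= numentries)
		log2qty++;
	size = bucketsize << log2qty;
	while ((page_size << order) < size)
		order++;

	printf("%lu entries (order %lu, %lu bytes), hash_mask %#lx\n",
	       1UL << log2qty, order, size, (1UL << log2qty) - 1);
	return 0;
}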
3380 3380
3381 #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE 3381 #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
3382 struct page *pfn_to_page(unsigned long pfn) 3382 struct page *pfn_to_page(unsigned long pfn)
3383 { 3383 {
3384 return __pfn_to_page(pfn); 3384 return __pfn_to_page(pfn);
3385 } 3385 }
3386 unsigned long page_to_pfn(struct page *page) 3386 unsigned long page_to_pfn(struct page *page)
3387 { 3387 {
3388 return __page_to_pfn(page); 3388 return __page_to_pfn(page);
3389 } 3389 }
3390 EXPORT_SYMBOL(pfn_to_page); 3390 EXPORT_SYMBOL(pfn_to_page);
3391 EXPORT_SYMBOL(page_to_pfn); 3391 EXPORT_SYMBOL(page_to_pfn);
3392 #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ 3392 #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
3393 3393
3394 3394
3395 3395